|
|
|
@ -0,0 +1,48 @@
|
|
|
|
|
# pip install pymysql
|
|
|
|
|
# pip install requests beautifulsoup4
|
|
|
|
|
|
|
|
|
|
import pymysql
|
|
|
|
|
import requests
|
|
|
|
|
from bs4 import BeautifulSoup
|
|
|
|
|
|
|
|
|
|
if __name__ == '__main__':
    # Walk every area row stored in MySQL, then scrape each area's
    # Baidu Baike page and print its <h1> heading.

    # Establish the database connection.
    # NOTE(review): credentials are hard-coded — move to env vars / config
    # before this leaves a dev box.
    conn = pymysql.connect(
        host='10.10.14.203',           # host name (or IP address)
        port=3306,                     # port, MySQL default is 3306
        user='root',                   # user name
        password='Password123@mysql',  # password
        database='ds_db',              # select the schema here instead of a
                                       # separate conn.select_db() call
        charset='utf8mb4'              # full UTF-8 so Chinese names round-trip
    )

    try:
        # Cursor as a context manager: closed even if the query raises.
        with conn.cursor() as cursor:
            # All areas of one province, ordered by administrative level.
            cursor.execute(
                "SELECT id,full_name FROM t_dm_area where province_id='FD61813E-70A1-42AB-9A8E-141ED4D47B98' order by level_id")

            # fetchall() returns a tuple of rows; row[1] is full_name.
            result: tuple = cursor.fetchall()

        for e in result:
            area_name = e[1]
            url = "https://baike.baidu.com/item/" + area_name + "?fromModule=lemma_search-box"

            # Send the HTTP GET request. A timeout is essential here: without
            # one, a single stalled host hangs the whole loop indefinitely.
            response = requests.get(url, timeout=10)

            # Check whether the request succeeded.
            if response.status_code == 200:
                # Parse the HTML with BeautifulSoup.
                soup = BeautifulSoup(response.text, 'html.parser')

                # We want the text inside the <h1> tag; adjust the selector
                # to scrape different content. Guard against pages with no
                # <h1> — the original crashed with AttributeError on
                # find() returning None (common on bot-block/redirect pages).
                h1 = soup.find('h1')
                if h1 is not None:
                    print(h1.text)  # print the scraped text
                else:
                    print('No <h1> element found for ' + area_name)
            else:
                print('Failed to retrieve the webpage')
    finally:
        # Always release the connection, even when scraping raises mid-loop
        # (the original leaked the connection on any exception).
        conn.close()
|