|
|
|
@ -1,9 +1,15 @@
|
|
|
|
|
# pip install pymysql
|
|
|
|
|
# pip install requests beautifulsoup4
|
|
|
|
|
|
|
|
|
|
# Inspect the result in MySQL with, e.g.:
|
|
|
|
|
# select * from t_dm_area where province_id='FD61813E-70A1-42AB-9A8E-141ED4D47B98' order by level_id;
|
|
|
|
|
|
|
|
|
|
import time
|
|
|
|
|
|
|
|
|
|
import pymysql
|
|
|
|
|
import requests
|
|
|
|
|
from bs4 import BeautifulSoup
|
|
|
|
|
import re
|
|
|
|
|
|
|
|
|
|
if __name__ == '__main__':

    # Iterate over the rows of the MySQL area table, then crawl Baidu Baike
    # for each row. (The connection/cursor setup is in lines hidden from this
    # view — confirm against the full file.)
@ -28,9 +34,11 @@ if __name__ == '__main__':
|
|
|
|
|
result: tuple = cursor.fetchall()
|
|
|
|
|
|
|
|
|
|
for e in result:
|
|
|
|
|
id = e[0]
|
|
|
|
|
area_name = e[1]
|
|
|
|
|
url = "https://baike.baidu.com/item/" + area_name + "?fromModule=lemma_search-box"
|
|
|
|
|
|
|
|
|
|
print(url)
|
|
|
|
|
# 发送HTTP GET请求
|
|
|
|
|
response = requests.get(url)
|
|
|
|
|
# 检查请求是否成功
|
|
|
|
@ -39,10 +47,22 @@ if __name__ == '__main__':
|
|
|
|
|
soup = BeautifulSoup(response.text, 'html.parser')
|
|
|
|
|
# 假设我们要抓取的是<h1>标签中的文字
|
|
|
|
|
# 你可以根据需要修改选择器来抓取不同的内容
|
|
|
|
|
h1_text = soup.find('h1').text
|
|
|
|
|
print(h1_text) # 打印抓取的文字
|
|
|
|
|
specific_divs = soup.select('div.para_YYuCh.summary_nfAdr.MARK_MODULE')
|
|
|
|
|
# 遍历找到的所有特定div标签,并打印它们的文本内容
|
|
|
|
|
for div in specific_divs:
|
|
|
|
|
text = div.get_text(strip=True) # 使用get_text()方法获取文本,并去除
|
|
|
|
|
# 使用正则表达式移除所有形如[数字]和[数字-数字]的字符串
|
|
|
|
|
cleaned_text = re.sub(r'\[\d+(?:-\d+)?\]', '', text)
|
|
|
|
|
sql = "update t_dm_area set memo=%s where id=%s"
|
|
|
|
|
cursor.execute(sql, (cleaned_text, id))
|
|
|
|
|
conn.commit()
|
|
|
|
|
print("更新"+area_name+"数据成功")
|
|
|
|
|
break
|
|
|
|
|
else:
|
|
|
|
|
print('Failed to retrieve the webpage')
|
|
|
|
|
|
|
|
|
|
time.sleep(2)
|
|
|
|
|
# 关闭游标和连接
|
|
|
|
|
cursor.close()
|
|
|
|
|
conn.close()
|
|
|
|
|
print("结束")
|