"""Fill ``t_dm_area.memo`` with the Baidu Baike summary of each area.

Setup:
    pip install pymysql
    pip install requests beautifulsoup4

Check the results afterwards with:
    select * from t_dm_area
    where province_id='FD61813E-70A1-42AB-9A8E-141ED4D47B98'
    order by level_id;
"""
import re
import time
from contextlib import closing
from typing import Optional
from urllib.parse import quote

import pymysql
import requests
from bs4 import BeautifulSoup

# Matches citation markers such as "[1]" or "[2-3]" embedded in the summary.
_CITATION_RE = re.compile(r'\[\d+(?:-\d+)?\]')

# CSS selector of the <div> that holds the entry summary.
# NOTE(review): these class names look build-generated and may change when
# the site is redeployed -- adjust the selector to scrape different content.
_SUMMARY_SELECTOR = 'div.para_YYuCh.summary_nfAdr.MARK_MODULE'

_SELECT_AREAS_SQL = (
    "SELECT id,full_name FROM t_dm_area where province_id='FD61813E-70A1-42AB-9A8E-141ED4D47B98' order by level_id")
_UPDATE_MEMO_SQL = "update t_dm_area set memo=%s where id=%s"

# Without a timeout a single stalled connection would block the run forever.
_REQUEST_TIMEOUT_SECONDS = 10
# Pause between page fetches so the site is not hammered.
_REQUEST_INTERVAL_SECONDS = 2


def _clean_summary(text: str) -> str:
    """Return *text* with every ``[n]`` / ``[n-m]`` citation marker removed."""
    return _CITATION_RE.sub('', text)


def _fetch_summary(area_name: str) -> Optional[str]:
    """Download the Baike page for *area_name* and return its cleaned summary.

    Returns ``None`` (after printing the reason) when the request fails, the
    server answers with a non-200 status, or the page has no summary block.
    """
    # Percent-encode the name so reserved characters ("?", "#", "/") in it
    # cannot change the meaning of the URL.
    url = ("https://baike.baidu.com/item/" + quote(area_name, safe="")
           + "?fromModule=lemma_search-box")
    print(url)

    # Send the HTTP GET request; a network error must not abort the batch.
    try:
        response = requests.get(url, timeout=_REQUEST_TIMEOUT_SECONDS)
    except requests.RequestException as exc:
        print(f'Failed to retrieve the webpage: {exc}')
        return None

    # Check whether the request succeeded.
    if response.status_code != 200:
        print('Failed to retrieve the webpage')
        return None

    # Parse the HTML and take the first summary <div>, if any.
    soup = BeautifulSoup(response.text, 'html.parser')
    summary_div = soup.select_one(_SUMMARY_SELECTOR)
    if summary_div is None:
        print('No summary block found for ' + area_name)
        return None

    # get_text(strip=True) drops the whitespace around each text fragment.
    return _clean_summary(summary_div.get_text(strip=True))


def main() -> None:
    """Walk the areas stored in MySQL and store each one's Baike summary."""
    # NOTE(review): the connection credentials are hard-coded in source;
    # consider loading them from the environment or a config file.
    connection = pymysql.connect(
        host='10.10.14.203',  # host name (or IP address)
        port=3306,  # port, 3306 by default
        user='root',  # user name
        password='Password123@mysql',  # password
        charset='utf8mb4'  # character encoding
    )
    # closing() guarantees the cursor and connection are released even when
    # a query raises part-way through the batch.
    with closing(connection) as conn, closing(conn.cursor()) as cursor:
        # Select the database.
        conn.select_db("ds_db")

        # Run the query; fetchall() returns a tuple of (id, full_name) rows.
        cursor.execute(_SELECT_AREAS_SQL)
        rows: tuple = cursor.fetchall()

        for area_id, area_name in rows:
            if not area_name:
                # Nothing to look up for a row without a name.
                continue

            summary = _fetch_summary(area_name)
            if summary is not None:
                cursor.execute(_UPDATE_MEMO_SQL, (summary, area_id))
                conn.commit()
                print("更新" + area_name + "数据成功")

            time.sleep(_REQUEST_INTERVAL_SECONDS)

    print("结束")


if __name__ == '__main__':
    main()