diff --git a/dsLightRag/Test/T1_Login.py b/dsLightRag/Test/T1_Login.py
new file mode 100644
index 00000000..6db1e6f1
--- /dev/null
+++ b/dsLightRag/Test/T1_Login.py
@@ -0,0 +1,162 @@
+# In-depth guide (part 1): batch-collect WeChat Official Account articles with Python + Selenium and build your own daily AI briefing, no more information anxiety
+# https://blog.csdn.net/k352733625/article/details/149222945
+
+# WeChat "Crawler Cat" --- an analysis of Official Account article scraping code
+# https://blog.csdn.net/yajuanpi4899/article/details/121584268
+
+"""
+Install the pdfkit library:
+pip3 install pdfkit -i http://pypi.douban.com/simple --trusted-host pypi.douban.com
+
+import pdfkit
+pdfkit.from_url('<Official Account article URL>', 'out.pdf')
+"""
+import datetime
+import logging
+import random
+import re
+
+import requests
+
+"""
+# Check the selenium version
+pip show selenium
+4.34.2
+
+# Check the Chrome browser version
+chrome://version/
+138.0.7204.101 (stable) (64-bit)
+
+# Download the matching chromedriver package
+https://googlechromelabs.github.io/chrome-for-testing/
+https://storage.googleapis.com/chrome-for-testing-public/138.0.7204.94/win64/chromedriver-win64.zip
+"""
+import time
+from selenium import webdriver
+from selenium.webdriver.chrome.options import Options
+from selenium.webdriver.chrome.service import Service as ChromeService
+
+if __name__ == '__main__':
+    # Empty dict that will hold the login cookies
+    cookies = {}
+    # Request headers - use the WeChat built-in browser's User-Agent
+    header = {
+        "HOST": "mp.weixin.qq.com",
+        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.116 Safari/537.36 QBCore/4.0.1301.400 QQBrowser/9.0.2524.400 Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2875.116 Safari/537.36 NetType/WIFI MicroMessenger/7.0.20.1781(0x6700143B) WindowsWechat(0x63010200)",
+        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
+        "Accept-Encoding": "gzip, deflate, br",
+        "Accept-Language": "zh-CN,zh;q=0.8,en-US;q=0.6,en;q=0.5;q=0.4",
+        "Connection": "keep-alive"
+    }
+    # Launch Chrome through webdriver
+    logging.info("启动浏览器,打开微信公众号登录界面")
+    options = Options()
+    # options.add_argument('-headless')  # headless mode; leave commented out while debugging
+
+    # Use the WeChat built-in browser's User-Agent
+    options.add_argument(
+        '--user-agent=Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.116 Safari/537.36 QBCore/4.0.1301.400 QQBrowser/9.0.2524.400 Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2875.116 Safari/537.36 NetType/WIFI MicroMessenger/7.0.20.1781(0x6700143B) WindowsWechat(0x63010200)')
+
+    service = ChromeService(executable_path=r"C:\Windows\System32\chromedriver.exe")
+    driver = webdriver.Chrome(service=service, options=options)
+    # Open the WeChat Official Account platform login page
+    driver.get('https://mp.weixin.qq.com/')
+    # Give the page a couple of seconds to load
+    time.sleep(2)
+    # Scan the QR code with your phone!
+    logging.info("请拿手机扫码二维码登录公众号")
+    time.sleep(20)
+
+    # Reload the page; once logged in it redirects to the account backend, and the cookies are read from that session
+    driver.get('https://mp.weixin.qq.com/')
+    # Grab the cookies
+    cookie_items = driver.get_cookies()
+    # get_cookies() returns a list of dicts; fold it into a simple name -> value mapping
+    for cookie_item in cookie_items:
+        cookies[cookie_item['name']] = cookie_item['value']
+
+    if "slave_sid" not in cookies:
+        logging.info("登录公众号失败,获取cookie失败")
+        exit()
+    # cookies = json.dumps(post)  # left over from the reference article, intentionally commented out
+
+    # Approach 3: use requests to fetch the redirect URL (without following it) and read the token from it
+    url = 'https://mp.weixin.qq.com'
+    response = requests.get(url=url, allow_redirects=False, cookies=cookies)
+    if 'Location' in response.headers:
+        redirect_url = response.headers.get("Location")
+        print("重定向URL:", redirect_url)
+        token_match = re.findall(r'token=(\d+)', redirect_url)
+        if token_match:
+            token = token_match[0]
+            print("获取到的token:", token)
+            logging.info("微信token:" + token)
+
+    article_urls = []
+    gzlist = [{"account_name": "长春教育八卦阵", "account_id": "jybg100"}]
+    for item in gzlist:
+        account_name = item["account_name"]
+        account_id = item["account_id"]
+        # Endpoint for searching Official Accounts
+        search_url = 'https://mp.weixin.qq.com/cgi-bin/searchbiz?'
+        # Parameters for the account-search endpoint; three of them vary: the login token, a random number, and the account name to search for
+        query_id = {
+            'action': 'search_biz',
+            'token': token,
+            'lang': 'zh_CN',
+            'f': 'json',
+            'ajax': '1',
+            'random': random.random(),
+            'query': account_name,
+            'begin': '0',
+            'count': '5'
+        }
+        # Call the account-search endpoint, passing cookies, headers and params
+        search_response = requests.get(search_url, cookies=cookies, headers=header, params=query_id)
+        # Take the first account in the search results
+        lists = search_response.json().get('list')[0]
+        # Get the account's fakeid, which is needed later to fetch its articles
+        fakeid = lists.get('fakeid')
+        logging.info("fakeid:" + fakeid)
+        # Endpoint for the account's article list
+        appmsg_url = 'https://mp.weixin.qq.com/cgi-bin/appmsg?'
+        # Parameters for the article-list endpoint: the login token, the target account's fakeid, and a random number
+        query_id_data = {
+            'token': token,
+            'lang': 'zh_CN',
+            'f': 'json',
+            'ajax': '1',
+            'random': random.random(),
+            'action': 'list_ex',
+            'begin': '0',  # page offset; increases by 5 per page
+            'count': '5',
+            'query': '',
+            'fakeid': fakeid,
+            'type': '9'
+        }
+        # Fetch the account's article list page
+        query_fakeid_response = requests.get(appmsg_url, cookies=cookies, headers=header, params=query_id_data)
+        fakeid_list = query_fakeid_response.json().get('app_msg_list')
+
+        for item in fakeid_list:
+            # Assemble the record for this article
+            new_article = {
+                'title': item.get('title'),
+                'article_url': item.get('link'),
+                'account_id': account_id,
+                'account_name': account_name,
+                'publish_time': datetime.datetime.fromtimestamp(int(item.get("update_time"))).strftime(
+                    '%Y-%m-%d %H:%M:%S'),
+                'collection_time': datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
+            }
+            logging.info("new_article: %s", new_article)
+            article_urls.append({"title": item.get('title'), "url": item.get('link'), "publish_time": datetime.datetime.fromtimestamp(int(item.get("update_time"))).strftime('%Y-%m-%d %H:%M:%S')})
+            time.sleep(1)
+
+    for x in article_urls:
+        print(x)
+
+    # Close the browser
+    driver.quit()
+    print("所有文章爬取完成!")
diff --git a/dsLightRag/Test/T2_GetList.py b/dsLightRag/Test/T2_GetList.py
new file mode 100644
index 00000000..6db1e6f1
--- /dev/null
+++ b/dsLightRag/Test/T2_GetList.py
@@ -0,0 +1,162 @@
+# In-depth guide (part 1): batch-collect WeChat Official Account articles with Python + Selenium and build your own daily AI briefing, no more information anxiety
+# https://blog.csdn.net/k352733625/article/details/149222945
+
+# WeChat "Crawler Cat" --- an analysis of Official Account article scraping code
+# https://blog.csdn.net/yajuanpi4899/article/details/121584268
+
+"""
+Install the pdfkit library:
+pip3 install pdfkit -i http://pypi.douban.com/simple --trusted-host pypi.douban.com
+
+import pdfkit
+pdfkit.from_url('<Official Account article URL>', 'out.pdf')
+"""
+import datetime
+import logging
+import random
+import re
+
+import requests
+
+"""
+# Check the selenium version
+pip show selenium
+4.34.2
+
+# Check the Chrome browser version
+chrome://version/
+138.0.7204.101 (stable) (64-bit)
+
+# Download the matching chromedriver package
+https://googlechromelabs.github.io/chrome-for-testing/
+https://storage.googleapis.com/chrome-for-testing-public/138.0.7204.94/win64/chromedriver-win64.zip
+"""
+import time
+from selenium import webdriver
+from selenium.webdriver.chrome.options import Options
+from selenium.webdriver.chrome.service import Service as ChromeService
+
+if __name__ == '__main__':
+    # Empty dict that will hold the login cookies
+    cookies = {}
+    # Request headers - use the WeChat built-in browser's User-Agent
+    header = {
+        "HOST": "mp.weixin.qq.com",
+        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.116 Safari/537.36 QBCore/4.0.1301.400 QQBrowser/9.0.2524.400 Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2875.116 Safari/537.36 NetType/WIFI MicroMessenger/7.0.20.1781(0x6700143B) WindowsWechat(0x63010200)",
+        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
+        "Accept-Encoding": "gzip, deflate, br",
+        "Accept-Language": "zh-CN,zh;q=0.8,en-US;q=0.6,en;q=0.5;q=0.4",
+        "Connection": "keep-alive"
+    }
+    # Launch Chrome through webdriver
+    logging.info("启动浏览器,打开微信公众号登录界面")
+    options = Options()
+    # options.add_argument('-headless')  # headless mode; leave commented out while debugging
+
+    # Use the WeChat built-in browser's User-Agent
+    options.add_argument(
+        '--user-agent=Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.116 Safari/537.36 QBCore/4.0.1301.400 QQBrowser/9.0.2524.400 Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2875.116 Safari/537.36 NetType/WIFI MicroMessenger/7.0.20.1781(0x6700143B) WindowsWechat(0x63010200)')
+
+    service = ChromeService(executable_path=r"C:\Windows\System32\chromedriver.exe")
+    driver = webdriver.Chrome(service=service, options=options)
+    # Open the WeChat Official Account platform login page
+    driver.get('https://mp.weixin.qq.com/')
+    # Give the page a couple of seconds to load
+    time.sleep(2)
+    # Scan the QR code with your phone!
+    logging.info("请拿手机扫码二维码登录公众号")
+    time.sleep(20)
+
+    # Reload the page; once logged in it redirects to the account backend, and the cookies are read from that session
+    driver.get('https://mp.weixin.qq.com/')
+    # Grab the cookies
+    cookie_items = driver.get_cookies()
+    # get_cookies() returns a list of dicts; fold it into a simple name -> value mapping
+    for cookie_item in cookie_items:
+        cookies[cookie_item['name']] = cookie_item['value']
+
+    if "slave_sid" not in cookies:
+        logging.info("登录公众号失败,获取cookie失败")
+        exit()
+    # cookies = json.dumps(post)  # left over from the reference article, intentionally commented out
+
+    # Approach 3: use requests to fetch the redirect URL (without following it) and read the token from it
+    url = 'https://mp.weixin.qq.com'
+    response = requests.get(url=url, allow_redirects=False, cookies=cookies)
+    if 'Location' in response.headers:
+        redirect_url = response.headers.get("Location")
+        print("重定向URL:", redirect_url)
+        token_match = re.findall(r'token=(\d+)', redirect_url)
+        if token_match:
+            token = token_match[0]
+            print("获取到的token:", token)
+            logging.info("微信token:" + token)
+
+    article_urls = []
+    gzlist = [{"account_name": "长春教育八卦阵", "account_id": "jybg100"}]
+    for item in gzlist:
+        account_name = item["account_name"]
+        account_id = item["account_id"]
+        # Endpoint for searching Official Accounts
+        search_url = 'https://mp.weixin.qq.com/cgi-bin/searchbiz?'
+        # Parameters for the account-search endpoint; three of them vary: the login token, a random number, and the account name to search for
+        query_id = {
+            'action': 'search_biz',
+            'token': token,
+            'lang': 'zh_CN',
+            'f': 'json',
+            'ajax': '1',
+            'random': random.random(),
+            'query': account_name,
+            'begin': '0',
+            'count': '5'
+        }
+        # Call the account-search endpoint, passing cookies, headers and params
+        search_response = requests.get(search_url, cookies=cookies, headers=header, params=query_id)
+        # Take the first account in the search results
+        lists = search_response.json().get('list')[0]
+        # Get the account's fakeid, which is needed later to fetch its articles
+        fakeid = lists.get('fakeid')
+        logging.info("fakeid:" + fakeid)
+        # Endpoint for the account's article list
+        appmsg_url = 'https://mp.weixin.qq.com/cgi-bin/appmsg?'
+        # Parameters for the article-list endpoint: the login token, the target account's fakeid, and a random number
+        query_id_data = {
+            'token': token,
+            'lang': 'zh_CN',
+            'f': 'json',
+            'ajax': '1',
+            'random': random.random(),
+            'action': 'list_ex',
+            'begin': '0',  # page offset; increases by 5 per page
+            'count': '5',
+            'query': '',
+            'fakeid': fakeid,
+            'type': '9'
+        }
+        # Fetch the account's article list page
+        query_fakeid_response = requests.get(appmsg_url, cookies=cookies, headers=header, params=query_id_data)
+        fakeid_list = query_fakeid_response.json().get('app_msg_list')
+
+        for item in fakeid_list:
+            # Assemble the record for this article
+            new_article = {
+                'title': item.get('title'),
+                'article_url': item.get('link'),
+                'account_id': account_id,
+                'account_name': account_name,
+                'publish_time': datetime.datetime.fromtimestamp(int(item.get("update_time"))).strftime(
+                    '%Y-%m-%d %H:%M:%S'),
+                'collection_time': datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
+            }
+            logging.info("new_article: %s", new_article)
+            article_urls.append({"title": item.get('title'), "url": item.get('link'), "publish_time": datetime.datetime.fromtimestamp(int(item.get("update_time"))).strftime('%Y-%m-%d %H:%M:%S')})
+            time.sleep(1)
+
+    for x in article_urls:
+        print(x)
+
+    # Close the browser
+    driver.quit()
+    print("所有文章爬取完成!")
diff --git a/dsLightRag/Test/TestCrawl.py b/dsLightRag/Test/TestCrawl.py
deleted file mode 100644
index 63d2eb14..00000000
--- a/dsLightRag/Test/TestCrawl.py
+++ /dev/null
@@ -1,270 +0,0 @@
-# In-depth guide (part 1): batch-collect WeChat Official Account articles with Python + Selenium and build your own daily AI briefing, no more information anxiety
-# https://blog.csdn.net/k352733625/article/details/149222945
-
-# WeChat "Crawler Cat" --- an analysis of Official Account article scraping code
-# https://blog.csdn.net/yajuanpi4899/article/details/121584268
-import datetime
-import logging
-import random
-import re
-import os
-
-import requests
-
-"""
-# Check the selenium version
-pip show selenium
-4.34.2
-
-# Check the Chrome browser version
-chrome://version/
-138.0.7204.101 (stable) (64-bit)
-
-# Download the matching chromedriver package
-https://googlechromelabs.github.io/chrome-for-testing/
-https://storage.googleapis.com/chrome-for-testing-public/138.0.7204.94/win64/chromedriver-win64.zip
-"""
-import time
-from selenium import webdriver
-from selenium.webdriver.chrome.options import Options
-from selenium.webdriver.chrome.service import Service as ChromeService
-import json
-
-if __name__ == '__main__':
-    # Empty dict that will hold the login cookies
-    cookies = {}
-    # Request headers - use the WeChat built-in browser's User-Agent
-    header = {
-        "HOST": "mp.weixin.qq.com",
-        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.116 Safari/537.36 QBCore/4.0.1301.400 QQBrowser/9.0.2524.400 Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2875.116 Safari/537.36 NetType/WIFI MicroMessenger/7.0.20.1781(0x6700143B) WindowsWechat(0x63010200)",
-        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
-        "Accept-Encoding": "gzip, deflate, br",
-        "Accept-Language": "zh-CN,zh;q=0.8,en-US;q=0.6,en;q=0.5;q=0.4",
-        "Connection": "keep-alive"
-    }
-    # Launch Chrome through webdriver
-    logging.info("启动浏览器,打开微信公众号登录界面")
-    options = Options()
-    # options.add_argument('-headless')  # headless mode; leave commented out while debugging
-
-    # Use the WeChat built-in browser's User-Agent
-    options.add_argument('--user-agent=Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.116 Safari/537.36 QBCore/4.0.1301.400 QQBrowser/9.0.2524.400 Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2875.116 Safari/537.36 NetType/WIFI MicroMessenger/7.0.20.1781(0x6700143B) WindowsWechat(0x63010200)')
-
-    service = ChromeService(executable_path=r"C:\Windows\System32\chromedriver.exe")
-    driver = webdriver.Chrome(service=service, options=options)
-    # Open the WeChat Official Account platform login page
-    driver.get('https://mp.weixin.qq.com/')
-    # Give the page a couple of seconds to load
-    time.sleep(2)
-    # Scan the QR code with your phone!
-    logging.info("请拿手机扫码二维码登录公众号")
-    time.sleep(20)
-
-    # Reload the page; once logged in it redirects to the account backend, and the cookies are read from that session
-    driver.get('https://mp.weixin.qq.com/')
-    # Grab the cookies
-    cookie_items = driver.get_cookies()
-    # get_cookies() returns a list of dicts; fold it into a simple name -> value mapping
-    for cookie_item in cookie_items:
-        cookies[cookie_item['name']] = cookie_item['value']
-
-    if "slave_sid" not in cookies:
-        logging.info("登录公众号失败,获取cookie失败")
-        exit()
-    # cookies = json.dumps(post)  # left over from the reference article, intentionally commented out
-
-    # Approach 3: use requests to fetch the redirect URL (without following it) and read the token from it
-    url = 'https://mp.weixin.qq.com'
-    response = requests.get(url=url, allow_redirects=False, cookies=cookies)
-    if 'Location' in response.headers:
-        redirect_url = response.headers.get("Location")
-        print("重定向URL:", redirect_url)
-        token_match = re.findall(r'token=(\d+)', redirect_url)
-        if token_match:
-            token = token_match[0]
-            print("获取到的token:", token)
-            logging.info("微信token:" + token)
-
-    article_urls = []
-    gzlist = [{"account_name": "长春教育八卦阵", "account_id": "jybg100"}]
-    for item in gzlist:
-        account_name = item["account_name"]
-        account_id = item["account_id"]
-        # Endpoint for searching Official Accounts
-        search_url = 'https://mp.weixin.qq.com/cgi-bin/searchbiz?'
-        # Parameters for the account-search endpoint; three of them vary: the login token, a random number, and the account name to search for
-        query_id = {
-            'action': 'search_biz',
-            'token': token,
-            'lang': 'zh_CN',
-            'f': 'json',
-            'ajax': '1',
-            'random': random.random(),
-            'query': account_name,
-            'begin': '0',
-            'count': '5'
-        }
-        # Call the account-search endpoint, passing cookies, headers and params
-        search_response = requests.get(search_url, cookies=cookies, headers=header, params=query_id)
-        # Take the first account in the search results
-        lists = search_response.json().get('list')[0]
-        # Get the account's fakeid, which is needed later to fetch its articles
-        fakeid = lists.get('fakeid')
-        logging.info("fakeid:" + fakeid)
-        # Endpoint for the account's article list
-        appmsg_url = 'https://mp.weixin.qq.com/cgi-bin/appmsg?'
-        # Parameters for the article-list endpoint: the login token, the target account's fakeid, and a random number
-        query_id_data = {
-            'token': token,
-            'lang': 'zh_CN',
-            'f': 'json',
-            'ajax': '1',
-            'random': random.random(),
-            'action': 'list_ex',
-            'begin': '0',  # page offset; increases by 5 per page
-            'count': '5',
-            'query': '',
-            'fakeid': fakeid,
-            'type': '9'
-        }
-        # Fetch the account's article list page
-        query_fakeid_response = requests.get(appmsg_url, cookies=cookies, headers=header, params=query_id_data)
-        fakeid_list = query_fakeid_response.json().get('app_msg_list')
-
-        for item in fakeid_list:
-            # Assemble the record for this article
-            new_article = {
-                'title': item.get('title'),
-                'article_url': item.get('link'),
-                'account_id': account_id,
-                'account_name': account_name,
-                'publish_time': datetime.datetime.fromtimestamp(int(item.get("update_time"))).strftime('%Y-%m-%d %H:%M:%S'),
-                'collection_time': datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
-            }
-            print("new_article:", new_article)
-            logging.info("new_article:", new_article)
-            article_urls.append(item.get('link'))
-            time.sleep(1)
-
-    # Make sure the Logs directory exists
-    logs_dir = "./Test/Logs"
-    if not os.path.exists(logs_dir):
-        os.makedirs(logs_dir)
-
-    for article_url in article_urls:
-        print("正在爬取文章:" + article_url)
-        try:
-            # Fetch the article content directly with requests, mimicking the WeChat environment
-            wechat_headers = {
-                "User-Agent": "Mozilla/5.0 (Linux; Android 10; MI 8 Build/QKQ1.190828.002; wv) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/83.0.4103.101 Mobile Safari/537.36 XWEB/1768 MMWEBSDK/20210302 MMWEBID/6253 MicroMessenger/8.0.2.1860(0x28000234) Process/toolsmp WeChat/arm64 Weixin NetType/WIFI Language/zh_CN ABI/arm64",
-                "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
-                "Accept-Encoding": "gzip, deflate",
-                "Accept-Language": "zh-CN,zh;q=0.9,en-US;q=0.8,en;q=0.7",
-                "X-Requested-With": "com.tencent.mm",
-                "Referer": "https://mp.weixin.qq.com/"
-            }
-
-            # Open the article link with selenium, after applying the extra request headers
-            driver.execute_cdp_cmd('Network.setExtraHTTPHeaders', {'headers': wechat_headers})
-            driver.get(article_url)
-            # Wait longer to make sure the page is fully loaded
-            time.sleep(5)
-
-            # Check whether the page requires opening in the WeChat client
-            if "请在微信客户端中打开链接" in driver.page_source or "请在微信中打开此链接" in driver.page_source:
-                print(f"文章需要在微信中打开,尝试使用requests直接获取:{article_url}")
-                # Try fetching it directly with requests
-                response = requests.get(article_url, headers=wechat_headers, cookies=cookies)
-                if "请在微信客户端中打开链接" in response.text or "请在微信中打开此链接" in response.text:
-                    print(f"使用requests仍然无法获取,跳过此文章:{article_url}")
-                    continue
-                else:
-                    # Save the fetched HTML content
-                    filename = f"article_{article_url.split('sn=')[1][:10] if 'sn=' in article_url else 'unknown'}"
-                    save_path = f"{logs_dir}/{filename}.html"
-                    with open(save_path, "w", encoding="utf-8") as f:
-                        f.write(response.text)
-                    print(f"已保存文章HTML内容:{save_path}")
-                    continue
-
-            # Use more robust selectors to locate the title and body
-            try:
-                # Try several possible title selectors
-                title_selectors = [
-                    '//h1[@class="rich_media_title"]',
-                    '//h1[@id="activity-name"]',
-                    '//h2[@class="rich_media_title"]',
-                    '//div[@class="rich_media_content"]//h1',
-                    '//div[@id="js_article"]//h1'
-                ]
-
-                title = None
-                for selector in title_selectors:
-                    try:
-                        title_element = driver.find_element('xpath', selector)
-                        title = title_element.text.strip()
-                        if title:
-                            break
-                    except:
-                        continue
-
-                if not title:
-                    # If every selector failed, fall back to the page title
-                    title = driver.title.replace(" - 微信公众号", "").strip()
-
-                # Try several possible body selectors
-                content_selectors = [
-                    '//div[@class="rich_media_content"]',
-                    '//div[@id="js_content"]',
-                    '//div[@class="rich_media_wrp"]'
-                ]
-
-                content = None
-                for selector in content_selectors:
-                    try:
-                        content_element = driver.find_element('xpath', selector)
-                        content = content_element.text.strip()
-                        if content:
-                            break
-                    except:
-                        continue
-
-                if not content:
-                    # If the body cannot be extracted, at least keep the page source
-                    content = "无法提取正文内容,保存页面源码:\n" + driver.page_source
-
-                # Build the file name from the title, stripping characters that are illegal in file names
-                if not title:
-                    title = "未知标题_" + article_url.split("sn=")[1][:10] if "sn=" in article_url else "未知标题"
-
-                filename = re.sub(r'[\\/:*?"<>|]', '_', title)
-
-                # Save the article content to a file
-                save_path = f"{logs_dir}/{filename}.txt"
-                with open(save_path, "w", encoding="utf-8") as f:
-                    f.write(f"标题:{title}\n\n")
-                    f.write(f"链接:{article_url}\n\n")
-                    f.write(f"内容:\n{content}")
-
-                print(f"文章《{title}》保存成功:{save_path}")
-
-            except Exception as e:
-                print(f"提取文章内容失败:{str(e)}")
-                # Save the page source for later analysis
-                error_filename = "error_" + article_url.split("sn=")[1][:10] if "sn=" in article_url else "error_page"
-                error_path = f"{logs_dir}/{error_filename}.html"
-                with open(error_path, "w", encoding="utf-8") as f:
-                    f.write(driver.page_source)
-                print(f"已保存页面源码到:{error_path}")
-
-            # Throttle requests to avoid getting blocked
-            time.sleep(random.uniform(3, 7))
-
-        except Exception as e:
-            print(f"爬取文章失败:{article_url},错误信息:{str(e)}")
-            continue
-
-    # Close the browser
-    driver.quit()
-    print("所有文章爬取完成!")
-