'commit'

2 weeks ago · 0f161ab1c3
parent 69e8e833e6
commit 0f161ab1c3
3 changed files with 324 additions and 270 deletions
--- a/dsLightRag/Test/T1_Login.py
+++ b/dsLightRag/Test/T1_Login.py
@ -0,0 +1,162 @@
+# 详解（一）Python + Selenium 批量采集微信公众号，搭建自己的微信公众号每日AI简报，告别信息焦虑
+# https://blog.csdn.net/k352733625/article/details/149222945
+
+# 微信爬爬猫---公众号文章抓取代码分析
+# https://blog.csdn.net/yajuanpi4899/article/details/121584268
+
+"""
+安装pdfkit库：
+pip3 install pdfkit -i http://pypi.douban.com/simple --trusted-host pypi.douban.com
+
+用法示例：
+import pdfkit
+pdfkit.from_url('公众号文章地址', 'out.pdf')
+"""
+import datetime
+import logging
+import random
+import re
+
+import requests
+
+"""
+# 查看selenium版本
+pip show selenium
+4.34.2
+
+# 查看Chrome浏览器版本
+chrome://version/
+138.0.7204.101 (正式版本) （64 位）
+
+# 下载驱动包
+https://googlechromelabs.github.io/chrome-for-testing/
+https://storage.googleapis.com/chrome-for-testing-public/138.0.7204.94/win64/chromedriver-win64.zip
+"""
+import time
+from selenium import webdriver
+from selenium.webdriver.chrome.options import Options
+from selenium.webdriver.chrome.service import Service as ChromeService
+
+if __name__ == '__main__':
+    # Log in to mp.weixin.qq.com by QR code in Chrome, reuse the session cookies with
+    # requests, then list the latest articles of every account in gzlist.
+    logging.basicConfig(level=logging.INFO)  # the default WARNING level would hide the prompts below
+    request_timeout = 30  # seconds; keeps a stalled HTTP call from hanging the script
+    cookies = {}  # session cookies copied from the browser, keyed by cookie name
+    # WeChat built-in browser User-Agent, shared by Chrome and the requests headers.
+    user_agent = "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.116 Safari/537.36 QBCore/4.0.1301.400 QQBrowser/9.0.2524.400 Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2875.116 Safari/537.36 NetType/WIFI MicroMessenger/7.0.20.1781(0x6700143B) WindowsWechat(0x63010200)"
+    header = {
+        "HOST": "mp.weixin.qq.com",
+        "User-Agent": user_agent,
+        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
+        "Accept-Encoding": "gzip, deflate, br",
+        "Accept-Language": "zh-CN,zh;q=0.8,en-US;q=0.6,en;q=0.5;q=0.4",
+        "Connection": "keep-alive"
+    }
+    logging.info("启动浏览器，打开微信公众号登录界面")
+    options = Options()
+    # options.add_argument('-headless')  # headless flag; leave commented out while debugging
+    options.add_argument('--user-agent=' + user_agent)
+
+    service = ChromeService(executable_path=r"C:\Windows\System32\chromedriver.exe")
+    driver = webdriver.Chrome(service=service, options=options)
+    article_urls = []
+    try:
+        # Open the login page and pause briefly before prompting for the scan.
+        driver.get('https://mp.weixin.qq.com/')
+        time.sleep(2)
+        logging.info("请拿手机扫码二维码登录公众号")
+        # Fixed 20-second window for the operator to scan the code with a phone.
+        time.sleep(20)
+
+        # Reload the page after login, then copy the browser cookies into the dict.
+        driver.get('https://mp.weixin.qq.com/')
+        for cookie_item in driver.get_cookies():
+            cookies[cookie_item['name']] = cookie_item['value']
+
+        if "slave_sid" not in cookies:
+            logging.error("登录公众号失败，获取cookie失败")
+            raise SystemExit(1)
+
+        # Request the home page without following redirects; the token is read from Location.
+        url = 'https://mp.weixin.qq.com'
+        response = requests.get(url=url, allow_redirects=False, cookies=cookies, timeout=request_timeout)
+        redirect_url = response.headers.get("Location")
+        if redirect_url:
+            print("重定向URL:", redirect_url)
+        token_match = re.search(r'token=(\d+)', redirect_url or '')
+        if token_match is None:
+            # Both queries below need the token; the original hit a NameError there when it was missing.
+            logging.error("未能从重定向URL中获取token")
+            raise SystemExit(1)
+        token = token_match.group(1)
+        print("获取到的token:", token)
+        logging.info("微信token:" + token)
+
+        # Accounts to crawl: the display name drives the search, the id is only recorded.
+        gzlist = [{"account_name": "长春教育八卦阵", "account_id": "jybg100"}]
+        for account in gzlist:
+            account_name = account["account_name"]
+            account_id = account["account_id"]
+            # Account search endpoint: needs the token, a random number and the account name.
+            search_url = 'https://mp.weixin.qq.com/cgi-bin/searchbiz?'
+            query_id = {
+                'action': 'search_biz',
+                'token': token,
+                'lang': 'zh_CN',
+                'f': 'json',
+                'ajax': '1',
+                'random': random.random(),
+                'query': account_name,
+                'begin': '0',
+                'count': '5'
+            }
+            search_response = requests.get(search_url, cookies=cookies, headers=header, params=query_id, timeout=request_timeout)
+            # Use the first hit; skip the account when the search returns nothing.
+            search_hits = search_response.json().get('list') or []
+            if not search_hits:
+                logging.warning("未搜索到公众号：%s", account_name)
+                continue
+            # The article-list request below is keyed by this fakeid.
+            fakeid = search_hits[0].get('fakeid')
+            logging.info("fakeid:%s", fakeid)
+            # Article list endpoint: needs the token, the target fakeid and a random number.
+            appmsg_url = 'https://mp.weixin.qq.com/cgi-bin/appmsg?'
+            query_id_data = {
+                'token': token,
+                'lang': 'zh_CN',
+                'f': 'json',
+                'ajax': '1',
+                'random': random.random(),
+                'action': 'list_ex',
+                'begin': '0',  # page offset; advances by 5 for each page
+                'count': '5',
+                'query': '',
+                'fakeid': fakeid,
+                'type': '9'
+            }
+            query_fakeid_response = requests.get(appmsg_url, cookies=cookies, headers=header, params=query_id_data, timeout=request_timeout)
+            # app_msg_list may be missing from the response (the original then crashed on None); treat it as empty.
+            fakeid_list = query_fakeid_response.json().get('app_msg_list') or []
+
+            for article in fakeid_list:
+                publish_time = datetime.datetime.fromtimestamp(int(article.get("update_time"))).strftime('%Y-%m-%d %H:%M:%S')
+                new_article = {
+                    'title': article.get('title'),
+                    'article_url': article.get('link'),
+                    'account_id': account_id,
+                    'account_name': account_name,
+                    'publish_time': publish_time,
+                    'collection_time': datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
+                }
+                # %s placeholder added: the original passed the dict as a format argument with nothing to fill.
+                logging.info("new_article: %s", new_article)
+                article_urls.append({"title": article.get('title'), "url": article.get('link'), "publish_time": publish_time})
+                time.sleep(1)
+
+        for x in article_urls:
+            print(x)
+    finally:
+        # Always close the browser, including on the early-exit paths above.
+        driver.quit()
+    print("所有文章爬取完成！")
--- a/dsLightRag/Test/T2_GetList.py
+++ b/dsLightRag/Test/T2_GetList.py
@ -0,0 +1,162 @@
+# 详解（一）Python + Selenium 批量采集微信公众号，搭建自己的微信公众号每日AI简报，告别信息焦虑
+# https://blog.csdn.net/k352733625/article/details/149222945
+
+# 微信爬爬猫---公众号文章抓取代码分析
+# https://blog.csdn.net/yajuanpi4899/article/details/121584268
+
+"""
+安装pdfkit库：
+pip3 install pdfkit -i http://pypi.douban.com/simple --trusted-host pypi.douban.com
+
+用法示例：
+import pdfkit
+pdfkit.from_url('公众号文章地址', 'out.pdf')
+"""
+import datetime
+import logging
+import random
+import re
+
+import requests
+
+"""
+# 查看selenium版本
+pip show selenium
+4.34.2
+
+# 查看Chrome浏览器版本
+chrome://version/
+138.0.7204.101 (正式版本) （64 位）
+
+# 下载驱动包
+https://googlechromelabs.github.io/chrome-for-testing/
+https://storage.googleapis.com/chrome-for-testing-public/138.0.7204.94/win64/chromedriver-win64.zip
+"""
+import time
+from selenium import webdriver
+from selenium.webdriver.chrome.options import Options
+from selenium.webdriver.chrome.service import Service as ChromeService
+
+if __name__ == '__main__':
+    # Log in to mp.weixin.qq.com by QR code in Chrome, reuse the session cookies with
+    # requests, then list the latest articles of every account in gzlist.
+    logging.basicConfig(level=logging.INFO)  # the default WARNING level would hide the prompts below
+    request_timeout = 30  # seconds; keeps a stalled HTTP call from hanging the script
+    cookies = {}  # session cookies copied from the browser, keyed by cookie name
+    # WeChat built-in browser User-Agent, shared by Chrome and the requests headers.
+    user_agent = "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.116 Safari/537.36 QBCore/4.0.1301.400 QQBrowser/9.0.2524.400 Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2875.116 Safari/537.36 NetType/WIFI MicroMessenger/7.0.20.1781(0x6700143B) WindowsWechat(0x63010200)"
+    header = {
+        "HOST": "mp.weixin.qq.com",
+        "User-Agent": user_agent,
+        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
+        "Accept-Encoding": "gzip, deflate, br",
+        "Accept-Language": "zh-CN,zh;q=0.8,en-US;q=0.6,en;q=0.5;q=0.4",
+        "Connection": "keep-alive"
+    }
+    logging.info("启动浏览器，打开微信公众号登录界面")
+    options = Options()
+    # options.add_argument('-headless')  # headless flag; leave commented out while debugging
+    options.add_argument('--user-agent=' + user_agent)
+
+    service = ChromeService(executable_path=r"C:\Windows\System32\chromedriver.exe")
+    driver = webdriver.Chrome(service=service, options=options)
+    article_urls = []
+    try:
+        # Open the login page and pause briefly before prompting for the scan.
+        driver.get('https://mp.weixin.qq.com/')
+        time.sleep(2)
+        logging.info("请拿手机扫码二维码登录公众号")
+        # Fixed 20-second window for the operator to scan the code with a phone.
+        time.sleep(20)
+
+        # Reload the page after login, then copy the browser cookies into the dict.
+        driver.get('https://mp.weixin.qq.com/')
+        for cookie_item in driver.get_cookies():
+            cookies[cookie_item['name']] = cookie_item['value']
+
+        if "slave_sid" not in cookies:
+            logging.error("登录公众号失败，获取cookie失败")
+            raise SystemExit(1)
+
+        # Request the home page without following redirects; the token is read from Location.
+        url = 'https://mp.weixin.qq.com'
+        response = requests.get(url=url, allow_redirects=False, cookies=cookies, timeout=request_timeout)
+        redirect_url = response.headers.get("Location")
+        if redirect_url:
+            print("重定向URL:", redirect_url)
+        token_match = re.search(r'token=(\d+)', redirect_url or '')
+        if token_match is None:
+            # Both queries below need the token; the original hit a NameError there when it was missing.
+            logging.error("未能从重定向URL中获取token")
+            raise SystemExit(1)
+        token = token_match.group(1)
+        print("获取到的token:", token)
+        logging.info("微信token:" + token)
+
+        # Accounts to crawl: the display name drives the search, the id is only recorded.
+        gzlist = [{"account_name": "长春教育八卦阵", "account_id": "jybg100"}]
+        for account in gzlist:
+            account_name = account["account_name"]
+            account_id = account["account_id"]
+            # Account search endpoint: needs the token, a random number and the account name.
+            search_url = 'https://mp.weixin.qq.com/cgi-bin/searchbiz?'
+            query_id = {
+                'action': 'search_biz',
+                'token': token,
+                'lang': 'zh_CN',
+                'f': 'json',
+                'ajax': '1',
+                'random': random.random(),
+                'query': account_name,
+                'begin': '0',
+                'count': '5'
+            }
+            search_response = requests.get(search_url, cookies=cookies, headers=header, params=query_id, timeout=request_timeout)
+            # Use the first hit; skip the account when the search returns nothing.
+            search_hits = search_response.json().get('list') or []
+            if not search_hits:
+                logging.warning("未搜索到公众号：%s", account_name)
+                continue
+            # The article-list request below is keyed by this fakeid.
+            fakeid = search_hits[0].get('fakeid')
+            logging.info("fakeid:%s", fakeid)
+            # Article list endpoint: needs the token, the target fakeid and a random number.
+            appmsg_url = 'https://mp.weixin.qq.com/cgi-bin/appmsg?'
+            query_id_data = {
+                'token': token,
+                'lang': 'zh_CN',
+                'f': 'json',
+                'ajax': '1',
+                'random': random.random(),
+                'action': 'list_ex',
+                'begin': '0',  # page offset; advances by 5 for each page
+                'count': '5',
+                'query': '',
+                'fakeid': fakeid,
+                'type': '9'
+            }
+            query_fakeid_response = requests.get(appmsg_url, cookies=cookies, headers=header, params=query_id_data, timeout=request_timeout)
+            # app_msg_list may be missing from the response (the original then crashed on None); treat it as empty.
+            fakeid_list = query_fakeid_response.json().get('app_msg_list') or []
+
+            for article in fakeid_list:
+                publish_time = datetime.datetime.fromtimestamp(int(article.get("update_time"))).strftime('%Y-%m-%d %H:%M:%S')
+                new_article = {
+                    'title': article.get('title'),
+                    'article_url': article.get('link'),
+                    'account_id': account_id,
+                    'account_name': account_name,
+                    'publish_time': publish_time,
+                    'collection_time': datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
+                }
+                # %s placeholder added: the original passed the dict as a format argument with nothing to fill.
+                logging.info("new_article: %s", new_article)
+                article_urls.append({"title": article.get('title'), "url": article.get('link'), "publish_time": publish_time})
+                time.sleep(1)
+
+        for x in article_urls:
+            print(x)
+    finally:
+        # Always close the browser, including on the early-exit paths above.
+        driver.quit()
+    print("所有文章爬取完成！")
--- a/dsLightRag/Test/TestCrawl.py
+++ b/dsLightRag/Test/TestCrawl.py
@ -1,270 +0,0 @@
-# 详解（一）Python + Selenium 批量采集微信公众号，搭建自己的微信公众号每日AI简报，告别信息焦虑
-# https://blog.csdn.net/k352733625/article/details/149222945
-
-# 微信爬爬猫---公众号文章抓取代码分析
-# https://blog.csdn.net/yajuanpi4899/article/details/121584268
-import datetime
-import logging
-import random
-import re
-import os
-
-import requests
-
-"""
-# 查看selenium版本
-pip show selenium
-4.34.2
-
-# 查看Chrome浏览器版本
-chrome://version/
-138.0.7204.101 (正式版本) （64 位）
-
-# 下载驱动包
-https://googlechromelabs.github.io/chrome-for-testing/
-https://storage.googleapis.com/chrome-for-testing-public/138.0.7204.94/win64/chromedriver-win64.zip
-"""
-import time
-from selenium import webdriver
-from selenium.webdriver.chrome.options import Options
-from selenium.webdriver.chrome.service import Service as ChromeService
-import json
-
-if __name__ == '__main__':
-    # 定义一个空的字典，存放cookies内容
-    cookies = {}
-    # 设置headers - 使用微信内置浏览器的User-Agent
-    header = {
-        "HOST": "mp.weixin.qq.com",
-        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.116 Safari/537.36 QBCore/4.0.1301.400 QQBrowser/9.0.2524.400 Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2875.116 Safari/537.36 NetType/WIFI MicroMessenger/7.0.20.1781(0x6700143B) WindowsWechat(0x63010200)",
-        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
-        "Accept-Encoding": "gzip, deflate, br",
-        "Accept-Language": "zh-CN,zh;q=0.8,en-US;q=0.6,en;q=0.5;q=0.4",
-        "Connection": "keep-alive"
-    }
-    # 用webdriver启动谷歌浏览器
-    logging.info("启动浏览器，打开微信公众号登录界面")
-    options = Options()
-    # options.add_argument('-headless')  # 无头参数，调试时可以注释掉
-    
-    # 设置微信内置浏览器的User-Agent
-    options.add_argument('--user-agent=Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.116 Safari/537.36 QBCore/4.0.1301.400 QQBrowser/9.0.2524.400 Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2875.116 Safari/537.36 NetType/WIFI MicroMessenger/7.0.20.1781(0x6700143B) WindowsWechat(0x63010200)')
-    
-    service = ChromeService(executable_path=r"C:\Windows\System32\chromedriver.exe")
-    driver = webdriver.Chrome(service=service, options=options)
-    # 打开微信公众号登录页面
-    driver.get('https://mp.weixin.qq.com/')
-    # 等待5秒钟
-    time.sleep(2)
-    # # 拿手机扫二维码！
-    logging.info("请拿手机扫码二维码登录公众号")
-    time.sleep(20)
-
-    # 重新载入公众号登录页，登录之后会显示公众号后台首页，从这个返回内容中获取cookies信息
-    driver.get('https://mp.weixin.qq.com/')
-    # 获取cookies
-    cookie_items = driver.get_cookies()
-    # 获取到的cookies是列表形式，将cookies转成json形式并存入本地名为cookie的文本中
-    for cookie_item in cookie_items:
-        cookies[cookie_item['name']] = cookie_item['value']
-
-    if "slave_sid" not in cookies:
-        logging.info("登录公众号失败，获取cookie失败")
-        exit()
-    # cookies = json.dumps(post)  # 注释掉这一行
-
-    # 方法3：使用requests库发送请求获取重定向URL
-    url = 'https://mp.weixin.qq.com'
-    response = requests.get(url=url, allow_redirects=False, cookies=cookies)
-    if 'Location' in response.headers:
-        redirect_url = response.headers.get("Location")
-        print("重定向URL:", redirect_url)
-        token_match = re.findall(r'token=(\d+)', redirect_url)
-        if token_match:
-            token = token_match[0]
-            print("获取到的token:", token)
-            logging.info("微信token:" + token)
-
-    article_urls = []
-    gzlist = [{"account_name": "长春教育八卦阵", "account_id": "jybg100"}]
-    for item in gzlist:
-        account_name = item["account_name"]
-        account_id = item["account_id"]
-        # 搜索微信公众号的接口地址
-        search_url = 'https://mp.weixin.qq.com/cgi-bin/searchbiz?'
-        # 搜索微信公众号接口需要传入的参数，有三个变量：微信公众号token、随机数random、搜索的微信公众号名字
-        query_id = {
-            'action': 'search_biz',
-            'token': token,
-            'lang': 'zh_CN',
-            'f': 'json',
-            'ajax': '1',
-            'random': random.random(),
-            'query': account_name,
-            'begin': '0',
-            'count': '5'
-        }
-        # 打开搜索微信公众号接口地址，需要传入相关参数信息如：cookies、params、headers
-        search_response = requests.get(search_url, cookies=cookies, headers=header, params=query_id)
-        # 取搜索结果中的第一个公众号
-        lists = search_response.json().get('list')[0]
-        # 获取这个公众号的fakeid，后面爬取公众号文章需要此字段
-        fakeid = lists.get('fakeid')
-        logging.info("fakeid:" + fakeid)
-        # 微信公众号文章接口地址
-        appmsg_url = 'https://mp.weixin.qq.com/cgi-bin/appmsg?'
-        # 搜索文章需要传入几个参数：登录的公众号token、要爬取文章的公众号fakeid、随机数random
-        query_id_data = {
-            'token': token,
-            'lang': 'zh_CN',
-            'f': 'json',
-            'ajax': '1',
-            'random': random.random(),
-            'action': 'list_ex',
-            'begin': '0',  # 不同页，此参数变化，变化规则为每页加5
-            'count': '5',
-            'query': '',
-            'fakeid': fakeid,
-            'type': '9'
-        }
-        # 打开搜索的微信公众号文章列表页
-        query_fakeid_response = requests.get(appmsg_url, cookies=cookies, headers=header, params=query_id_data)
-        fakeid_list = query_fakeid_response.json().get('app_msg_list')
-
-        for item in fakeid_list:
-            # 采集item示例
-            new_article = {
-                'title': item.get('title'),
-                'article_url': item.get('link'),
-                'account_id': account_id,
-                'account_name': account_name,
-                'publish_time': datetime.datetime.fromtimestamp(int(item.get("update_time"))).strftime('%Y-%m-%d %H:%M:%S'),
-                'collection_time': datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
-            }
-            print("new_article:", new_article)
-            logging.info("new_article:", new_article)
-            article_urls.append(item.get('link'))
-            time.sleep(1)
-
-    # 确保Logs目录存在
-    logs_dir = "./Test/Logs"
-    if not os.path.exists(logs_dir):
-        os.makedirs(logs_dir)
-
-    for article_url in article_urls:
-        print("正在爬取文章：" + article_url)
-        try:
-            # 使用requests直接获取文章内容，模拟微信环境
-            wechat_headers = {
-                "User-Agent": "Mozilla/5.0 (Linux; Android 10; MI 8 Build/QKQ1.190828.002; wv) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/83.0.4103.101 Mobile Safari/537.36 XWEB/1768 MMWEBSDK/20210302 MMWEBID/6253 MicroMessenger/8.0.2.1860(0x28000234) Process/toolsmp WeChat/arm64 Weixin NetType/WIFI Language/zh_CN ABI/arm64",
-                "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
-                "Accept-Encoding": "gzip, deflate",
-                "Accept-Language": "zh-CN,zh;q=0.9,en-US;q=0.8,en;q=0.7",
-                "X-Requested-With": "com.tencent.mm",
-                "Referer": "https://mp.weixin.qq.com/"
-            }
-            
-            # 使用selenium打开文章链接，设置请求头
-            driver.execute_cdp_cmd('Network.setExtraHTTPHeaders', {'headers': wechat_headers})
-            driver.get(article_url)
-            # 增加等待时间，确保页面完全加载
-            time.sleep(5)
-            
-            # 检查是否需要登录
-            if "请在微信客户端中打开链接" in driver.page_source or "请在微信中打开此链接" in driver.page_source:
-                print(f"文章需要在微信中打开，尝试使用requests直接获取：{article_url}")
-                # 尝试使用requests直接获取
-                response = requests.get(article_url, headers=wechat_headers, cookies=cookies)
-                if "请在微信客户端中打开链接" in response.text or "请在微信中打开此链接" in response.text:
-                    print(f"使用requests仍然无法获取，跳过此文章：{article_url}")
-                    continue
-                else:
-                    # 保存获取到的HTML内容
-                    filename = f"article_{article_url.split('sn=')[1][:10] if 'sn=' in article_url else 'unknown'}"
-                    save_path = f"{logs_dir}/{filename}.html"
-                    with open(save_path, "w", encoding="utf-8") as f:
-                        f.write(response.text)
-                    print(f"已保存文章HTML内容：{save_path}")
-                    continue
-                
-            # 使用更可靠的选择器查找标题和内容
-            try:
-                # 尝试多种可能的标题选择器
-                title_selectors = [
-                    '//h1[@class="rich_media_title"]', 
-                    '//h1[@id="activity-name"]',
-                    '//h2[@class="rich_media_title"]',
-                    '//div[@class="rich_media_content"]//h1',
-                    '//div[@id="js_article"]//h1'
-                ]
-                
-                title = None
-                for selector in title_selectors:
-                    try:
-                        title_element = driver.find_element('xpath', selector)
-                        title = title_element.text.strip()
-                        if title:
-                            break
-                    except:
-                        continue
-                
-                if not title:
-                    # 如果所有选择器都失败，尝试从页面标题获取
-                    title = driver.title.replace(" - 微信公众号", "").strip()
-                
-                # 尝试多种可能的内容选择器
-                content_selectors = [
-                    '//div[@class="rich_media_content"]',
-                    '//div[@id="js_content"]',
-                    '//div[@class="rich_media_wrp"]'
-                ]
-                
-                content = None
-                for selector in content_selectors:
-                    try:
-                        content_element = driver.find_element('xpath', selector)
-                        content = content_element.text.strip()
-                        if content:
-                            break
-                    except:
-                        continue
-                
-                if not content:
-                    # 如果无法获取内容，至少保存页面源码
-                    content = "无法提取正文内容，保存页面源码：\n" + driver.page_source
-                
-                # 创建文件名（使用标题，但去除不合法的文件名字符）
-                if not title:
-                    title = "未知标题_" + article_url.split("sn=")[1][:10] if "sn=" in article_url else "未知标题"
-                    
-                filename = re.sub(r'[\\/:*?"<>|]', '_', title)
-                
-                # 保存文章内容到文件
-                save_path = f"{logs_dir}/{filename}.txt"
-                with open(save_path, "w", encoding="utf-8") as f:
-                    f.write(f"标题：{title}\n\n")
-                    f.write(f"链接：{article_url}\n\n")
-                    f.write(f"内容：\n{content}")
-                    
-                print(f"文章《{title}》保存成功：{save_path}")
-                
-            except Exception as e:
-                print(f"提取文章内容失败：{str(e)}")
-                # 保存页面源码以便分析
-                error_filename = "error_" + article_url.split("sn=")[1][:10] if "sn=" in article_url else "error_page"
-                error_path = f"{logs_dir}/{error_filename}.html"
-                with open(error_path, "w", encoding="utf-8") as f:
-                    f.write(driver.page_source)
-                print(f"已保存页面源码到：{error_path}")
-            
-            # 避免频繁请求被封
-            time.sleep(random.uniform(3, 7))
-            
-        except Exception as e:
-            print(f"爬取文章失败：{article_url}，错误信息：{str(e)}")
-            continue
-    
-    # 关闭浏览器
-    driver.quit()
-    print("所有文章爬取完成！")
-