'commit'

1 week ago · 642d3af0ea
parent 76d0c09bdd
commit 642d3af0ea
1 changed file with 3 additions and 5 deletions
--- a/dsLightRag/WxGzh/T2_CollectArticle.py
+++ b/dsLightRag/WxGzh/T2_CollectArticle.py
@@ -89,7 +89,6 @@ if __name__ == '__main__':
        content = f.read()
    # 使用json还原为json对象
    cookies = json.loads(content)
-    # "expiry": 1787106233
    # 检查是否有过期时间
    expiry = cookies["expiry"]
    if expiry:
@@ -108,7 +107,7 @@ if __name__ == '__main__':
    logger.info(f"cookies的过期时间一般是4天，cookies过期时间：%s" % expiry_date)
    options = Options()
    options.add_argument('-headless')  # 无头参数，调试时可以注释掉
-    # 设置headers - 使用微信内置浏览器的User-Agent
+    # 设置headers
    header = {
        "HOST": "mp.weixin.qq.com",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.116 Safari/537.36 QBCore/4.0.1301.400 QQBrowser/9.0.2524.400 Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2875.116 Safari/537.36 NetType/WIFI MicroMessenger/7.0.20.1781(0x6700143B) WindowsWechat(0x63010200)",
@@ -121,8 +120,7 @@ if __name__ == '__main__':
    service = ChromeService(executable_path=r"C:\Windows\System32\chromedriver.exe")
    # 使用统一的初始化方式
    driver = init_wechat_browser()
-
-    # 方法3：使用requests库发送请求获取重定向URL
+    # 方法：使用requests库发送请求获取重定向URL
    url = 'https://mp.weixin.qq.com'
    response = requests.get(url=url, allow_redirects=False, cookies=cookies)
    if 'Location' in response.headers:
@@ -194,7 +192,7 @@ if __name__ == '__main__':
            article_title = item.get('title')
            publish_time = datetime.datetime.fromtimestamp(int(item.get("update_time")))

-            if '试卷' in article_title:  # 过滤掉试卷
+            if '试卷' in article_title:  # 过滤掉试卷,致知物理中有大量试卷，我做教育资讯的不关心试卷
                continue

            logger.info(f"正在处理文章: {article_title} ({publish_time})")