diff --git a/dsLightRag/WxGzh/T2_CollectArticle.py b/dsLightRag/WxGzh/T2_CollectArticle.py index de048fb6..d39b3539 100644 --- a/dsLightRag/WxGzh/T2_CollectArticle.py +++ b/dsLightRag/WxGzh/T2_CollectArticle.py @@ -89,7 +89,6 @@ if __name__ == '__main__': content = f.read() # 使用json还原为json对象 cookies = json.loads(content) - # "expiry": 1787106233 # 检查是否有过期时间 expiry = cookies["expiry"] if expiry: @@ -108,7 +107,7 @@ if __name__ == '__main__': logger.info(f"cookies的过期时间一般是4天,cookies过期时间:%s" % expiry_date) options = Options() options.add_argument('-headless') # 无头参数,调试时可以注释掉 - # 设置headers - 使用微信内置浏览器的User-Agent + # 设置headers header = { "HOST": "mp.weixin.qq.com", "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.116 Safari/537.36 QBCore/4.0.1301.400 QQBrowser/9.0.2524.400 Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2875.116 Safari/537.36 NetType/WIFI MicroMessenger/7.0.20.1781(0x6700143B) WindowsWechat(0x63010200)", @@ -121,8 +120,7 @@ if __name__ == '__main__': service = ChromeService(executable_path=r"C:\Windows\System32\chromedriver.exe") # 使用统一的初始化方式 driver = init_wechat_browser() - - # 方法3:使用requests库发送请求获取重定向URL + # 方法:使用requests库发送请求获取重定向URL url = 'https://mp.weixin.qq.com' response = requests.get(url=url, allow_redirects=False, cookies=cookies) if 'Location' in response.headers: @@ -194,7 +192,7 @@ if __name__ == '__main__': article_title = item.get('title') publish_time = datetime.datetime.fromtimestamp(int(item.get("update_time"))) - if '试卷' in article_title: # 过滤掉试卷 + if '试卷' in article_title: # 过滤掉试卷,致知物理中有大量试卷,我做教育资讯的不关心试卷 continue logger.info(f"正在处理文章: {article_title} ({publish_time})")