diff --git a/dsLightRag/Util/WxGzhUtil.py b/dsLightRag/Util/WxGzhUtil.py index d4bf9138..07abec69 100644 --- a/dsLightRag/Util/WxGzhUtil.py +++ b/dsLightRag/Util/WxGzhUtil.py @@ -1,6 +1,3 @@ -import datetime -import random -import requests from selenium import webdriver from selenium.webdriver.chrome.options import Options from selenium.webdriver.chrome.service import Service as ChromeService @@ -13,56 +10,6 @@ def init_wechat_browser(): service = ChromeService(executable_path=r"C:\Windows\System32\chromedriver.exe") return webdriver.Chrome(service=service, options=options) -def get_wechat_articles(account_name, account_id, token, cookies, header): - """获取指定公众号的文章列表""" - article_urls = [] - - # 搜索微信公众号的接口地址 - search_url = 'https://mp.weixin.qq.com/cgi-bin/searchbiz?' - query_id = { - 'action': 'search_biz', - 'token': token, - 'lang': 'zh_CN', - 'f': 'json', - 'ajax': '1', - 'random': random.random(), - 'query': account_name, - 'begin': '0', - 'count': '5' - } - - # 完整实现搜索和获取文章逻辑 - search_response = requests.get(search_url, cookies=cookies, headers=header, params=query_id) - lists = search_response.json().get('list')[0] - fakeid = lists.get('fakeid') - - # 微信公众号文章接口 - appmsg_url = 'https://mp.weixin.qq.com/cgi-bin/appmsg?' - query_id_data = { - 'token': token, - 'lang': 'zh_CN', - 'f': 'json', - 'ajax': '1', - 'random': random.random(), - 'action': 'list_ex', - 'begin': '0', - 'count': '5', - 'query': '', - 'fakeid': fakeid, - 'type': '9' - } - - query_fakeid_response = requests.get(appmsg_url, cookies=cookies, headers=header, params=query_id_data) - fakeid_list = query_fakeid_response.json().get('app_msg_list') - - for item in fakeid_list: - article_urls.append({ - 'title': item.get('title'), - 'url': item.get('link'), - 'publish_time': datetime.datetime.fromtimestamp(int(item.get("update_time"))).strftime('%Y-%m-%d %H:%M:%S') - }) - - return article_urls def get_article_content(url): """ diff --git a/dsLightRag/Util/__pycache__/WxGzhUtil.cpython-310.pyc b/dsLightRag/Util/__pycache__/WxGzhUtil.cpython-310.pyc index b9742d49..5075ff6c 100644 Binary files a/dsLightRag/Util/__pycache__/WxGzhUtil.cpython-310.pyc and b/dsLightRag/Util/__pycache__/WxGzhUtil.cpython-310.pyc differ diff --git a/dsLightRag/WxGzh/T2_GetArticleList.py b/dsLightRag/WxGzh/T2_GetArticleList.py index 8aff8387..153e43a6 100644 --- a/dsLightRag/WxGzh/T2_GetArticleList.py +++ b/dsLightRag/WxGzh/T2_GetArticleList.py @@ -5,6 +5,7 @@ # https://blog.csdn.net/yajuanpi4899/article/details/121584268 +import asyncio import datetime import json import logging @@ -12,9 +13,10 @@ import random import re import requests -import asyncio + from Util.PostgreSQLUtil import init_postgres_pool -from Util.WxGzhUtil import init_wechat_browser, get_wechat_articles +from Util.WxGzhUtil import init_wechat_browser, get_article_content + async def get_wechat_sources(): """从t_wechat_source表获取微信公众号列表""" @@ -109,7 +111,6 @@ if __name__ == '__main__': # 爬取文章 for item in gzlist: - article_urls = get_wechat_articles(item["account_name"], item["account_id"], token, cookies, header) account_name = item["account_name"] account_id = item["account_id"] # 搜索微信公众号的接口地址 @@ -155,27 +156,15 @@ if __name__ == '__main__': for item in fakeid_list: # 采集item示例 - new_article = { - 'title': item.get('title'), - 'article_url': item.get('link'), - 'account_id': account_id, - 'account_name': account_name, - 'publish_time': datetime.datetime.fromtimestamp(int(item.get("update_time"))).strftime( - '%Y-%m-%d %H:%M:%S'), - 'collection_time': datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S') - } - logging.info("new_article:", new_article) - article_urls.append({"title": item.get('title'), "url": item.get('link'), - "publish_time": datetime.datetime.fromtimestamp(int(item.get("update_time"))).strftime( - '%Y-%m-%d %H:%M:%S')}) + article_url = item.get('link') + article_title = item.get('title') + publish_time = datetime.datetime.fromtimestamp(int(item.get("update_time"))).strftime('%Y-%m-%d %H:%M:%S') + + # 直接获取并显示文章内容 + print(f"正在处理文章: {article_title} ({publish_time})") + content = get_article_content(article_url) + print(f"文章内容预览: {content[:200]}...") + time.sleep(1) - - for x in article_urls: - print(x) - # 将返回的地址写入到文件 - with open('article_urls.txt', 'w', encoding='utf-8') as f: - for record in article_urls: - f.write(record['title'] + " " + record['publish_time'] + " " + record['url'] + '\n') - # 关闭浏览器 driver.quit()