diff --git a/dsLightRag/Util/WxGzhUtil.py b/dsLightRag/Util/WxGzhUtil.py new file mode 100644 index 00000000..d4bf9138 --- /dev/null +++ b/dsLightRag/Util/WxGzhUtil.py @@ -0,0 +1,100 @@ +import datetime +import random +import requests +from selenium import webdriver +from selenium.webdriver.chrome.options import Options +from selenium.webdriver.chrome.service import Service as ChromeService +from selenium.webdriver.common.by import By + +def init_wechat_browser(): + """初始化微信爬虫浏览器实例""" + options = Options() + options.add_argument('-headless') + service = ChromeService(executable_path=r"C:\Windows\System32\chromedriver.exe") + return webdriver.Chrome(service=service, options=options) + +def get_wechat_articles(account_name, account_id, token, cookies, header): + """获取指定公众号的文章列表""" + article_urls = [] + + # 搜索微信公众号的接口地址 + search_url = 'https://mp.weixin.qq.com/cgi-bin/searchbiz?' + query_id = { + 'action': 'search_biz', + 'token': token, + 'lang': 'zh_CN', + 'f': 'json', + 'ajax': '1', + 'random': random.random(), + 'query': account_name, + 'begin': '0', + 'count': '5' + } + + # 完整实现搜索和获取文章逻辑 + search_response = requests.get(search_url, cookies=cookies, headers=header, params=query_id) + lists = search_response.json().get('list')[0] + fakeid = lists.get('fakeid') + + # 微信公众号文章接口 + appmsg_url = 'https://mp.weixin.qq.com/cgi-bin/appmsg?' + query_id_data = { + 'token': token, + 'lang': 'zh_CN', + 'f': 'json', + 'ajax': '1', + 'random': random.random(), + 'action': 'list_ex', + 'begin': '0', + 'count': '5', + 'query': '', + 'fakeid': fakeid, + 'type': '9' + } + + query_fakeid_response = requests.get(appmsg_url, cookies=cookies, headers=header, params=query_id_data) + fakeid_list = query_fakeid_response.json().get('app_msg_list') + + for item in fakeid_list: + article_urls.append({ + 'title': item.get('title'), + 'url': item.get('link'), + 'publish_time': datetime.datetime.fromtimestamp(int(item.get("update_time"))).strftime('%Y-%m-%d %H:%M:%S') + }) + + return article_urls + +def get_article_content(url): + """ + 获取微信公众号文章内容 + :param url: 文章URL + :return: 文章内容文本 + """ + options = Options() + options.add_argument('-headless') + service = ChromeService(executable_path=r"C:\Windows\System32\chromedriver.exe") + driver = webdriver.Chrome(service=service, options=options) + + try: + driver.get(url) + html_content = driver.find_element(By.CLASS_NAME, "rich_media").text + + # 处理内容,提取空行后的文本 + lines = html_content.split('\n') + content_after_empty_line = "" + found_empty_line = False + + for line in lines: + if not found_empty_line and line.strip() == "": + found_empty_line = True + continue + + if found_empty_line: + content_after_empty_line += line + "\n" + + if not found_empty_line: + content_after_empty_line = html_content + + return content_after_empty_line.replace("\n\n", "\n") + finally: + driver.quit() \ No newline at end of file diff --git a/dsLightRag/Util/__pycache__/WxGzhUtil.cpython-310.pyc b/dsLightRag/Util/__pycache__/WxGzhUtil.cpython-310.pyc new file mode 100644 index 00000000..b9742d49 Binary files /dev/null and b/dsLightRag/Util/__pycache__/WxGzhUtil.cpython-310.pyc differ diff --git a/dsLightRag/WxGzh/T2_GetArticleList.py b/dsLightRag/WxGzh/T2_GetArticleList.py index a9b9ae09..8aff8387 100644 --- a/dsLightRag/WxGzh/T2_GetArticleList.py +++ b/dsLightRag/WxGzh/T2_GetArticleList.py @@ -14,6 +14,7 @@ import re import requests import asyncio from Util.PostgreSQLUtil import init_postgres_pool +from Util.WxGzhUtil import init_wechat_browser, get_wechat_articles async def get_wechat_sources(): """从t_wechat_source表获取微信公众号列表""" @@ -78,7 +79,9 @@ if __name__ == '__main__': } service = ChromeService(executable_path=r"C:\Windows\System32\chromedriver.exe") - driver = webdriver.Chrome(service=service, options=options) + driver = webdriver.Chrome(service=service, options=options) # 删除这行 + # 使用统一的初始化方式 + driver = init_wechat_browser() # 方法3:使用requests库发送请求获取重定向URL url = 'https://mp.weixin.qq.com' @@ -93,15 +96,20 @@ if __name__ == '__main__': logging.info("微信token:" + token) article_urls = [] - # 替换硬编码的gzlist + # 初始化浏览器 + driver = init_wechat_browser() + + # 获取公众号列表 loop = asyncio.new_event_loop() asyncio.set_event_loop(loop) try: gzlist = loop.run_until_complete(get_wechat_sources()) finally: loop.close() - + + # 爬取文章 for item in gzlist: + article_urls = get_wechat_articles(item["account_name"], item["account_id"], token, cookies, header) account_name = item["account_name"] account_id = item["account_id"] # 搜索微信公众号的接口地址 @@ -171,4 +179,3 @@ if __name__ == '__main__': # 关闭浏览器 driver.quit() - print("所有文章爬取完成!") diff --git a/dsLightRag/WxGzh/Util/WxGzhUtil.py b/dsLightRag/WxGzh/Util/WxGzhUtil.py deleted file mode 100644 index 6b6d55b1..00000000 --- a/dsLightRag/WxGzh/Util/WxGzhUtil.py +++ /dev/null @@ -1,45 +0,0 @@ -from selenium import webdriver -from selenium.webdriver.chrome.options import Options -from selenium.webdriver.chrome.service import Service as ChromeService -from selenium.webdriver.common.by import By - -def get_article_content(url): - """ - 获取微信公众号文章内容 - :param url: 文章URL - :return: 文章内容文本 - """ - options = Options() - options.add_argument('-headless') - service = ChromeService(executable_path=r"C:\Windows\System32\chromedriver.exe") - driver = webdriver.Chrome(service=service, options=options) - - try: - driver.get(url) - html_content = driver.find_element(By.CLASS_NAME, "rich_media").text - - # 处理内容,提取空行后的文本 - lines = html_content.split('\n') - content_after_empty_line = "" - found_empty_line = False - - for line in lines: - if not found_empty_line and line.strip() == "": - found_empty_line = True - continue - - if found_empty_line: - content_after_empty_line += line + "\n" - - if not found_empty_line: - content_after_empty_line = html_content - - return content_after_empty_line.replace("\n\n", "\n") - finally: - driver.quit() - -if __name__ == '__main__': - # 示例用法 - url = 'http://mp.weixin.qq.com/s?__biz=MzA3MjQ1Mjg2MQ==&mid=2651526302&idx=1&sn=41f941e481be6a7ccd26ad734c8d7a13&chksm=84e1ab0cb396221a59bce5c4ee842c9326968daf4aea1c7d24e55ed8879789c3ef34a7ce5ed1#rd' - content = get_article_content(url) - print(content) diff --git a/dsLightRag/WxGzh/Util/__init__.py b/dsLightRag/WxGzh/Util/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/dsLightRag/WxGzh/article_urls.txt b/dsLightRag/WxGzh/article_urls.txt index 7e5eccc0..dc2077eb 100644 --- a/dsLightRag/WxGzh/article_urls.txt +++ b/dsLightRag/WxGzh/article_urls.txt @@ -1,9 +1,14 @@ -明日(16日)公布高中一批次录取结果,查询通道在此,请收好! 2025-07-15 08:38:04 http://mp.weixin.qq.com/s?__biz=MzU5OTQ0MzEzOA==&mid=2247546664&idx=1&sn=431c1f89b968ddff2165466ce20b2976&chksm=feb6a485c9c12d930da2340a813d24d5dd168688af162b4e7bdcd42d5c31d832ef9dc915b1dc#rd -长春2024-2025九上试卷合集(赠答案) 2025-07-15 08:38:04 http://mp.weixin.qq.com/s?__biz=MzU5OTQ0MzEzOA==&mid=2247546664&idx=2&sn=ffd3261777f03cdfcd78e7a899f25778&chksm=feb6a485c9c12d93a16db0ce8d9850cc843b22442b5d49850f93c99efbbc160855ff1a2314cf#rd -网传各初中2025中考最高分准吗? 2025-07-13 09:22:08 http://mp.weixin.qq.com/s?__biz=MzU5OTQ0MzEzOA==&mid=2247546662&idx=1&sn=5c8a0cc82f0aab69a600d06b6e63a57f&chksm=feb6a48bc9c12d9da8ed3b2a19d12fa275f83796201448996bdfa2204bf41f7d826430ae30be#rd -长春市2025年中考各批次录取最低控制线确定 2025-07-12 10:05:49 http://mp.weixin.qq.com/s?__biz=MzU5OTQ0MzEzOA==&mid=2247546644&idx=1&sn=ea3371033b95e7203e881947c980a8a4&chksm=feb6a4b9c9c12daf64cbd87239cd2fdc22a93e6e0d2555ce3c5b66bc0e96028bfc3565021201#rd -长春市2025年中考成绩将于7月12日12时公布 2025-07-11 15:13:54 http://mp.weixin.qq.com/s?__biz=MzU5OTQ0MzEzOA==&mid=2247546642&idx=1&sn=718e0fa8463273260dae093e0686b7e0&chksm=feb6a4bfc9c12da9ab59ff8d7da3caefbb44d8195bb694b31cd5b3cab8fc6505b20c61a0ae62#rd -长春2024-2025九上试卷合集(赠答案) 2025-07-09 10:56:48 http://mp.weixin.qq.com/s?__biz=MzU5OTQ0MzEzOA==&mid=2247546544&idx=1&sn=d07bf1b38403c0578ad67ae007ce6159&chksm=feb6a51dc9c12c0b9b90a2131a9ba913b92ed2eab3dcaa78fadccb6b231e4f5cb4247750f910#rd +长春中考上演“神仙打架”!省二力旺等五校过半考生超700分! 2025-07-14 18:36:34 http://mp.weixin.qq.com/s?__biz=MzA3MjQ1Mjg2MQ==&mid=2651526302&idx=1&sn=41f941e481be6a7ccd26ad734c8d7a13&chksm=84e1ab0cb396221a59bce5c4ee842c9326968daf4aea1c7d24e55ed8879789c3ef34a7ce5ed1#rd +独家专访赫行学校2025年中考“双黄蛋”!学霸靠啥杀出重围? 2025-07-14 18:36:34 http://mp.weixin.qq.com/s?__biz=MzA3MjQ1Mjg2MQ==&mid=2651526302&idx=2&sn=c7733f7c2c6331e51e55af695f99a43e&chksm=84e1ab0cb396221a7d185dcb99acc9dce45cc5c66c3eef42680a215b710bb9bfa9fd10da4419#rd +长春40所学校中考成绩曝光!700+成批涌现!谁是最大黑马? 2025-07-13 18:48:27 http://mp.weixin.qq.com/s?__biz=MzA3MjQ1Mjg2MQ==&mid=2651526287&idx=1&sn=1f314640ae6eec236b0e16271bd44362&chksm=84e1ab1db396220b73ae08898a026d887436501a6c42abe01d7fa4aef9063533fad89720d3b8#rd +喜报!长春外国语学校女子篮球队夺得冠军! 2025-07-13 18:48:27 http://mp.weixin.qq.com/s?__biz=MzA3MjQ1Mjg2MQ==&mid=2651526287&idx=2&sn=31651043acb6ecbf4232e92e635196b6&chksm=84e1ab1db396220b0810c3bdf332128b110d1902658f2556eaeff67cec084a8a068a5ae9a275#rd +“趣闯盛夏·探无界”!探秘一实验银河小学夏令营 2025-07-13 18:48:27 http://mp.weixin.qq.com/s?__biz=MzA3MjQ1Mjg2MQ==&mid=2651526287&idx=3&sn=8edf6ce8cebdaad55343b39639876c27&chksm=84e1ab1db396220b26b172b3b565f919f7ded4c2a5b78227294ea29a558a7666c33b8c1de660#rd +刚刚!2025年长春中考各批次控制线公布! 2025-07-12 10:04:32 http://mp.weixin.qq.com/s?__biz=MzA3MjQ1Mjg2MQ==&mid=2651526196&idx=1&sn=282e5e824410a9a92a83dd800cb58a7c&chksm=84e1aba6b39622b03fe6422032474c9696f83541d9ff9b8b6a9f0f099ce459da430f720d05e4#rd +重磅消息!师大附属实验学校(经开)校长有新任命! 2025-07-12 10:04:32 http://mp.weixin.qq.com/s?__biz=MzA3MjQ1Mjg2MQ==&mid=2651526196&idx=2&sn=9449c87935faf86ddcc5a674ea888913&chksm=84e1aba6b39622b03fd8413ff1e74b61f662ec8deb3887c2c5b8e5ad15470b15ae14b21e94ea#rd +市教育局最新发布!长春2025年中考成绩将于7月12日公布! 2025-07-11 15:22:13 http://mp.weixin.qq.com/s?__biz=MzA3MjQ1Mjg2MQ==&mid=2651526176&idx=1&sn=a5b4104d2fe74ace32ab31faf5f1c44c&chksm=84e1abb2b39622a40b0e0969e84fb00c753cc8ffefb8624726afa2a7352ea725c7f967bf25f5#rd +长春市第十九中学2025年职称评聘拟通过人员名单的公示!有你认识的老师吗? 2025-07-11 15:22:13 http://mp.weixin.qq.com/s?__biz=MzA3MjQ1Mjg2MQ==&mid=2651526176&idx=2&sn=693d13964e4be18718c0eb4fd13ba68f&chksm=84e1abb2b39622a442a9f7cea8ddc72820050b2896968f2d0ae283c7caca2dbe014a721feb2e#rd +高分喜报频传!长春这所小学靠啥成为“学霸制造机”? 2025-07-10 19:00:00 http://mp.weixin.qq.com/s?__biz=MzA3MjQ1Mjg2MQ==&mid=2651526158&idx=1&sn=a0af7f484d6a3300a9b7f3d787a2594d&chksm=84e1ab9cb396228a56420696eb09071ff829d58e8e31bd652f849f3cbd0ee276b0baad7a1e89#rd +蝉联冠军!吉大尚德游泳队斩获骄人成绩! 2025-07-10 19:00:00 http://mp.weixin.qq.com/s?__biz=MzA3MjQ1Mjg2MQ==&mid=2651526158&idx=2&sn=cabb682e99978bf2a58ff0e9e06dc53d&chksm=84e1ab9cb396228a5cd457cd7ee0728491e6b3dc34fbde02240624364cfa8a9e2c533052d2b4#rd 长春中考上演“神仙打架”!省二力旺等五校过半考生超700分! 2025-07-14 18:36:34 http://mp.weixin.qq.com/s?__biz=MzA3MjQ1Mjg2MQ==&mid=2651526302&idx=1&sn=41f941e481be6a7ccd26ad734c8d7a13&chksm=84e1ab0cb396221a59bce5c4ee842c9326968daf4aea1c7d24e55ed8879789c3ef34a7ce5ed1#rd 独家专访赫行学校2025年中考“双黄蛋”!学霸靠啥杀出重围? 2025-07-14 18:36:34 http://mp.weixin.qq.com/s?__biz=MzA3MjQ1Mjg2MQ==&mid=2651526302&idx=2&sn=c7733f7c2c6331e51e55af695f99a43e&chksm=84e1ab0cb396221a7d185dcb99acc9dce45cc5c66c3eef42680a215b710bb9bfa9fd10da4419#rd 长春40所学校中考成绩曝光!700+成批涌现!谁是最大黑马? 2025-07-13 18:48:27 http://mp.weixin.qq.com/s?__biz=MzA3MjQ1Mjg2MQ==&mid=2651526287&idx=1&sn=1f314640ae6eec236b0e16271bd44362&chksm=84e1ab1db396220b73ae08898a026d887436501a6c42abe01d7fa4aef9063533fad89720d3b8#rd