From 2ad3154fe87ddd18612f75a77ff603c68439edfd Mon Sep 17 00:00:00 2001 From: HuangHai <10402852@qq.com> Date: Tue, 15 Jul 2025 11:09:16 +0800 Subject: [PATCH] 'commit' --- dsLightRag/Util/WxGzhUtil.py | 100 ++++++++++++++++++ .../__pycache__/WxGzhUtil.cpython-310.pyc | Bin 0 -> 2483 bytes dsLightRag/WxGzh/T2_GetArticleList.py | 15 ++- dsLightRag/WxGzh/Util/WxGzhUtil.py | 45 -------- dsLightRag/WxGzh/Util/__init__.py | 0 dsLightRag/WxGzh/article_urls.txt | 17 +-- 6 files changed, 122 insertions(+), 55 deletions(-) create mode 100644 dsLightRag/Util/WxGzhUtil.py create mode 100644 dsLightRag/Util/__pycache__/WxGzhUtil.cpython-310.pyc delete mode 100644 dsLightRag/WxGzh/Util/WxGzhUtil.py delete mode 100644 dsLightRag/WxGzh/Util/__init__.py diff --git a/dsLightRag/Util/WxGzhUtil.py b/dsLightRag/Util/WxGzhUtil.py new file mode 100644 index 00000000..d4bf9138 --- /dev/null +++ b/dsLightRag/Util/WxGzhUtil.py @@ -0,0 +1,100 @@ +import datetime +import random +import requests +from selenium import webdriver +from selenium.webdriver.chrome.options import Options +from selenium.webdriver.chrome.service import Service as ChromeService +from selenium.webdriver.common.by import By + +def init_wechat_browser(): + """初始化微信爬虫浏览器实例""" + options = Options() + options.add_argument('-headless') + service = ChromeService(executable_path=r"C:\Windows\System32\chromedriver.exe") + return webdriver.Chrome(service=service, options=options) + +def get_wechat_articles(account_name, account_id, token, cookies, header): + """获取指定公众号的文章列表""" + article_urls = [] + + # 搜索微信公众号的接口地址 + search_url = 'https://mp.weixin.qq.com/cgi-bin/searchbiz?' + query_id = { + 'action': 'search_biz', + 'token': token, + 'lang': 'zh_CN', + 'f': 'json', + 'ajax': '1', + 'random': random.random(), + 'query': account_name, + 'begin': '0', + 'count': '5' + } + + # 完整实现搜索和获取文章逻辑 + search_response = requests.get(search_url, cookies=cookies, headers=header, params=query_id) + lists = search_response.json().get('list')[0] + fakeid = lists.get('fakeid') + + # 微信公众号文章接口 + appmsg_url = 'https://mp.weixin.qq.com/cgi-bin/appmsg?' + query_id_data = { + 'token': token, + 'lang': 'zh_CN', + 'f': 'json', + 'ajax': '1', + 'random': random.random(), + 'action': 'list_ex', + 'begin': '0', + 'count': '5', + 'query': '', + 'fakeid': fakeid, + 'type': '9' + } + + query_fakeid_response = requests.get(appmsg_url, cookies=cookies, headers=header, params=query_id_data) + fakeid_list = query_fakeid_response.json().get('app_msg_list') + + for item in fakeid_list: + article_urls.append({ + 'title': item.get('title'), + 'url': item.get('link'), + 'publish_time': datetime.datetime.fromtimestamp(int(item.get("update_time"))).strftime('%Y-%m-%d %H:%M:%S') + }) + + return article_urls + +def get_article_content(url): + """ + 获取微信公众号文章内容 + :param url: 文章URL + :return: 文章内容文本 + """ + options = Options() + options.add_argument('-headless') + service = ChromeService(executable_path=r"C:\Windows\System32\chromedriver.exe") + driver = webdriver.Chrome(service=service, options=options) + + try: + driver.get(url) + html_content = driver.find_element(By.CLASS_NAME, "rich_media").text + + # 处理内容,提取空行后的文本 + lines = html_content.split('\n') + content_after_empty_line = "" + found_empty_line = False + + for line in lines: + if not found_empty_line and line.strip() == "": + found_empty_line = True + continue + + if found_empty_line: + content_after_empty_line += line + "\n" + + if not found_empty_line: + content_after_empty_line = html_content + + return content_after_empty_line.replace("\n\n", "\n") + finally: + driver.quit() \ No newline at end of file diff --git a/dsLightRag/Util/__pycache__/WxGzhUtil.cpython-310.pyc b/dsLightRag/Util/__pycache__/WxGzhUtil.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b9742d49875cc692f0c4f8f3f838f7f92bca1af9 GIT binary patch literal 2483 zcmai0>x&#k5TEXO?(E|(m&Ew?l$^Ng-KkN;MU8rDMDmcF5?oG(x#_;$x!ZYMPp|i2 z76e~`81pS+h_d@hL&AL%5hQ-{pXjeR(CvkbgN><_eNg*c=HW0kn13}cO;MXg@>8Ftc+*O4e~&}G8HcPRtGK9TY; zi8IMAQub~bP+;*(TY*#`Khj+Tehcu+9s^$^!rjU43~ zG0tMx*6bbx_2KC+hF_f-esX?zb?4d-gBz!>-2D9V^>5DI{Oa`Z^RI_HUtYU<=2mI8 zO?}d#S$6B-LSyMv7?WfpTUy-8IE@ZHu@tmf64Ao*Il4(NAu$0%(B=MehkB{c+n2Dg zv*HxOPDUp)nGP<9icg5=vsO2vF&EVZWo>k1CJ!jAVy=!ET$#qoKrU%)5{6rkz1&zL z*{OuB!SDu4R%yV2I~umy{G{JndW(mhrBjy=4X`p{xX^ zG6#r-UF4z~VpAXlCXn$XcayuTNDE+ETF)e?XAzu3go!~+uJyG6#9m%EaW0m6Hb<*C zw{kr`uafYW8w}FDc_;>N_h#S$^i9TQtg%hZj02Et%=+vlVs1ua<8H+ zmX*bdvRKWlBa2gDqn6iR*50|NH<6otBcA}vdk093`v!=ct7hNgwvzV`(EE@feTUoz zo*cNS%WPRcmYa|px2xQ^`RTXAbLX#r^6_xz!tlc@*S`O3c<$R97e2Ut{^J{$`@_?p z-5h+q{YabhG;7SyN9o)K4L8GhZhd_&NTT_m70xb)@q9*o7PObc?Zd(?cn<_&ZnwRK zV`8B19vtkh?Il!2Wv;|#&Q32Vh)qaoRT=zm@J(=^IjhlMRa z_UNY21bU`kW_N-ac9$e~OLC7S_eyeyB-4`IFUft9JP1S>d@H4p>1pbLMNis;6NbD) zrDyS)D0frhbL#OhqT73B-kzO_W@pIs%xjIAV~v@`y3QVzcJwamh)UXB1`BPaTra7u zg~KSsB;%RTTa>f?Qrm+4XiS77O-sw7x&ly@T*mz<6?zDWb09D)a;`oJ9tJ9wUhIRm zJC;IHocRG|wTLa~G9Cs1aPTSHy<6#^qi)9wiI^DOIHOq##f%D59z839;ui6h-;3(_ z6Ayg##r{GpZpz#Jx;&at@iRgwKdK58WRG-iFC2ah?va0)gl_ga5Ca=J#uyc^H6DMP zs!jpw|6CidjP!n;s@0H#4dlRkCvtx`EcObR3uHQtA6Y_V1yS{32N+d}QI7smiLt80 zI7hM)_w?=Oa=mZnn#3P)fX`Kjm;=08>KVDwcZmhKHi-?D#_8F)3HaU*b;QmZP)o`K zjo53LH3(Fj_1WBJFG0O=h;=}FTI*SICOeWFJuA2R6=*xw0H2b(R0kT;7I(P~tzPa@ zaN3=QrvQG@5q5z;Lj6Y{4tKs&S`CG)X}G+`bg}r>$)m*zqrA)F@$^uA11i@qUQz4J zf}NNxiHsPN9l7bUFlc+wMnYd8m&soC>YLlxb?Y_NCRkOf)+CWVflWyA1dwKB1sak^ zJ5=@$CTn)nT{!yU;-Yu_#bd7k^ynslElWG095@S8VKbU``~bkW-VHfGXkJ>|Z7#wp znpv~b=26EBl9;$OSC)ktHcU68K2M*v2p@_q7 zRP@gK$0C&H(Z7qiqG8XC`f$;O|3h!oxS9N;p>Cjg_rDoUq9}>ymbcUus0OM~qTKvT i2qYSr!m9IZl#p(RJ}+^U?E~Uy5@MCn>tM&O9Qp%Tx!MB& literal 0 HcmV?d00001 diff --git a/dsLightRag/WxGzh/T2_GetArticleList.py b/dsLightRag/WxGzh/T2_GetArticleList.py index a9b9ae09..8aff8387 100644 --- a/dsLightRag/WxGzh/T2_GetArticleList.py +++ b/dsLightRag/WxGzh/T2_GetArticleList.py @@ -14,6 +14,7 @@ import re import requests import asyncio from Util.PostgreSQLUtil import init_postgres_pool +from Util.WxGzhUtil import init_wechat_browser, get_wechat_articles async def get_wechat_sources(): """从t_wechat_source表获取微信公众号列表""" @@ -78,7 +79,9 @@ if __name__ == '__main__': } service = ChromeService(executable_path=r"C:\Windows\System32\chromedriver.exe") - driver = webdriver.Chrome(service=service, options=options) + driver = webdriver.Chrome(service=service, options=options) # 删除这行 + # 使用统一的初始化方式 + driver = init_wechat_browser() # 方法3:使用requests库发送请求获取重定向URL url = 'https://mp.weixin.qq.com' @@ -93,15 +96,20 @@ if __name__ == '__main__': logging.info("微信token:" + token) article_urls = [] - # 替换硬编码的gzlist + # 初始化浏览器 + driver = init_wechat_browser() + + # 获取公众号列表 loop = asyncio.new_event_loop() asyncio.set_event_loop(loop) try: gzlist = loop.run_until_complete(get_wechat_sources()) finally: loop.close() - + + # 爬取文章 for item in gzlist: + article_urls = get_wechat_articles(item["account_name"], item["account_id"], token, cookies, header) account_name = item["account_name"] account_id = item["account_id"] # 搜索微信公众号的接口地址 @@ -171,4 +179,3 @@ if __name__ == '__main__': # 关闭浏览器 driver.quit() - print("所有文章爬取完成!") diff --git a/dsLightRag/WxGzh/Util/WxGzhUtil.py b/dsLightRag/WxGzh/Util/WxGzhUtil.py deleted file mode 100644 index 6b6d55b1..00000000 --- a/dsLightRag/WxGzh/Util/WxGzhUtil.py +++ /dev/null @@ -1,45 +0,0 @@ -from selenium import webdriver -from selenium.webdriver.chrome.options import Options -from selenium.webdriver.chrome.service import Service as ChromeService -from selenium.webdriver.common.by import By - -def get_article_content(url): - """ - 获取微信公众号文章内容 - :param url: 文章URL - :return: 文章内容文本 - """ - options = Options() - options.add_argument('-headless') - service = ChromeService(executable_path=r"C:\Windows\System32\chromedriver.exe") - driver = webdriver.Chrome(service=service, options=options) - - try: - driver.get(url) - html_content = driver.find_element(By.CLASS_NAME, "rich_media").text - - # 处理内容,提取空行后的文本 - lines = html_content.split('\n') - content_after_empty_line = "" - found_empty_line = False - - for line in lines: - if not found_empty_line and line.strip() == "": - found_empty_line = True - continue - - if found_empty_line: - content_after_empty_line += line + "\n" - - if not found_empty_line: - content_after_empty_line = html_content - - return content_after_empty_line.replace("\n\n", "\n") - finally: - driver.quit() - -if __name__ == '__main__': - # 示例用法 - url = 'http://mp.weixin.qq.com/s?__biz=MzA3MjQ1Mjg2MQ==&mid=2651526302&idx=1&sn=41f941e481be6a7ccd26ad734c8d7a13&chksm=84e1ab0cb396221a59bce5c4ee842c9326968daf4aea1c7d24e55ed8879789c3ef34a7ce5ed1#rd' - content = get_article_content(url) - print(content) diff --git a/dsLightRag/WxGzh/Util/__init__.py b/dsLightRag/WxGzh/Util/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/dsLightRag/WxGzh/article_urls.txt b/dsLightRag/WxGzh/article_urls.txt index 7e5eccc0..dc2077eb 100644 --- a/dsLightRag/WxGzh/article_urls.txt +++ b/dsLightRag/WxGzh/article_urls.txt @@ -1,9 +1,14 @@ -明日(16日)公布高中一批次录取结果,查询通道在此,请收好! 2025-07-15 08:38:04 http://mp.weixin.qq.com/s?__biz=MzU5OTQ0MzEzOA==&mid=2247546664&idx=1&sn=431c1f89b968ddff2165466ce20b2976&chksm=feb6a485c9c12d930da2340a813d24d5dd168688af162b4e7bdcd42d5c31d832ef9dc915b1dc#rd -长春2024-2025九上试卷合集(赠答案) 2025-07-15 08:38:04 http://mp.weixin.qq.com/s?__biz=MzU5OTQ0MzEzOA==&mid=2247546664&idx=2&sn=ffd3261777f03cdfcd78e7a899f25778&chksm=feb6a485c9c12d93a16db0ce8d9850cc843b22442b5d49850f93c99efbbc160855ff1a2314cf#rd -网传各初中2025中考最高分准吗? 2025-07-13 09:22:08 http://mp.weixin.qq.com/s?__biz=MzU5OTQ0MzEzOA==&mid=2247546662&idx=1&sn=5c8a0cc82f0aab69a600d06b6e63a57f&chksm=feb6a48bc9c12d9da8ed3b2a19d12fa275f83796201448996bdfa2204bf41f7d826430ae30be#rd -长春市2025年中考各批次录取最低控制线确定 2025-07-12 10:05:49 http://mp.weixin.qq.com/s?__biz=MzU5OTQ0MzEzOA==&mid=2247546644&idx=1&sn=ea3371033b95e7203e881947c980a8a4&chksm=feb6a4b9c9c12daf64cbd87239cd2fdc22a93e6e0d2555ce3c5b66bc0e96028bfc3565021201#rd -长春市2025年中考成绩将于7月12日12时公布 2025-07-11 15:13:54 http://mp.weixin.qq.com/s?__biz=MzU5OTQ0MzEzOA==&mid=2247546642&idx=1&sn=718e0fa8463273260dae093e0686b7e0&chksm=feb6a4bfc9c12da9ab59ff8d7da3caefbb44d8195bb694b31cd5b3cab8fc6505b20c61a0ae62#rd -长春2024-2025九上试卷合集(赠答案) 2025-07-09 10:56:48 http://mp.weixin.qq.com/s?__biz=MzU5OTQ0MzEzOA==&mid=2247546544&idx=1&sn=d07bf1b38403c0578ad67ae007ce6159&chksm=feb6a51dc9c12c0b9b90a2131a9ba913b92ed2eab3dcaa78fadccb6b231e4f5cb4247750f910#rd +长春中考上演“神仙打架”!省二力旺等五校过半考生超700分! 2025-07-14 18:36:34 http://mp.weixin.qq.com/s?__biz=MzA3MjQ1Mjg2MQ==&mid=2651526302&idx=1&sn=41f941e481be6a7ccd26ad734c8d7a13&chksm=84e1ab0cb396221a59bce5c4ee842c9326968daf4aea1c7d24e55ed8879789c3ef34a7ce5ed1#rd +独家专访赫行学校2025年中考“双黄蛋”!学霸靠啥杀出重围? 2025-07-14 18:36:34 http://mp.weixin.qq.com/s?__biz=MzA3MjQ1Mjg2MQ==&mid=2651526302&idx=2&sn=c7733f7c2c6331e51e55af695f99a43e&chksm=84e1ab0cb396221a7d185dcb99acc9dce45cc5c66c3eef42680a215b710bb9bfa9fd10da4419#rd +长春40所学校中考成绩曝光!700+成批涌现!谁是最大黑马? 2025-07-13 18:48:27 http://mp.weixin.qq.com/s?__biz=MzA3MjQ1Mjg2MQ==&mid=2651526287&idx=1&sn=1f314640ae6eec236b0e16271bd44362&chksm=84e1ab1db396220b73ae08898a026d887436501a6c42abe01d7fa4aef9063533fad89720d3b8#rd +喜报!长春外国语学校女子篮球队夺得冠军! 2025-07-13 18:48:27 http://mp.weixin.qq.com/s?__biz=MzA3MjQ1Mjg2MQ==&mid=2651526287&idx=2&sn=31651043acb6ecbf4232e92e635196b6&chksm=84e1ab1db396220b0810c3bdf332128b110d1902658f2556eaeff67cec084a8a068a5ae9a275#rd +“趣闯盛夏·探无界”!探秘一实验银河小学夏令营 2025-07-13 18:48:27 http://mp.weixin.qq.com/s?__biz=MzA3MjQ1Mjg2MQ==&mid=2651526287&idx=3&sn=8edf6ce8cebdaad55343b39639876c27&chksm=84e1ab1db396220b26b172b3b565f919f7ded4c2a5b78227294ea29a558a7666c33b8c1de660#rd +刚刚!2025年长春中考各批次控制线公布! 2025-07-12 10:04:32 http://mp.weixin.qq.com/s?__biz=MzA3MjQ1Mjg2MQ==&mid=2651526196&idx=1&sn=282e5e824410a9a92a83dd800cb58a7c&chksm=84e1aba6b39622b03fe6422032474c9696f83541d9ff9b8b6a9f0f099ce459da430f720d05e4#rd +重磅消息!师大附属实验学校(经开)校长有新任命! 2025-07-12 10:04:32 http://mp.weixin.qq.com/s?__biz=MzA3MjQ1Mjg2MQ==&mid=2651526196&idx=2&sn=9449c87935faf86ddcc5a674ea888913&chksm=84e1aba6b39622b03fd8413ff1e74b61f662ec8deb3887c2c5b8e5ad15470b15ae14b21e94ea#rd +市教育局最新发布!长春2025年中考成绩将于7月12日公布! 2025-07-11 15:22:13 http://mp.weixin.qq.com/s?__biz=MzA3MjQ1Mjg2MQ==&mid=2651526176&idx=1&sn=a5b4104d2fe74ace32ab31faf5f1c44c&chksm=84e1abb2b39622a40b0e0969e84fb00c753cc8ffefb8624726afa2a7352ea725c7f967bf25f5#rd +长春市第十九中学2025年职称评聘拟通过人员名单的公示!有你认识的老师吗? 2025-07-11 15:22:13 http://mp.weixin.qq.com/s?__biz=MzA3MjQ1Mjg2MQ==&mid=2651526176&idx=2&sn=693d13964e4be18718c0eb4fd13ba68f&chksm=84e1abb2b39622a442a9f7cea8ddc72820050b2896968f2d0ae283c7caca2dbe014a721feb2e#rd +高分喜报频传!长春这所小学靠啥成为“学霸制造机”? 2025-07-10 19:00:00 http://mp.weixin.qq.com/s?__biz=MzA3MjQ1Mjg2MQ==&mid=2651526158&idx=1&sn=a0af7f484d6a3300a9b7f3d787a2594d&chksm=84e1ab9cb396228a56420696eb09071ff829d58e8e31bd652f849f3cbd0ee276b0baad7a1e89#rd +蝉联冠军!吉大尚德游泳队斩获骄人成绩! 2025-07-10 19:00:00 http://mp.weixin.qq.com/s?__biz=MzA3MjQ1Mjg2MQ==&mid=2651526158&idx=2&sn=cabb682e99978bf2a58ff0e9e06dc53d&chksm=84e1ab9cb396228a5cd457cd7ee0728491e6b3dc34fbde02240624364cfa8a9e2c533052d2b4#rd 长春中考上演“神仙打架”!省二力旺等五校过半考生超700分! 2025-07-14 18:36:34 http://mp.weixin.qq.com/s?__biz=MzA3MjQ1Mjg2MQ==&mid=2651526302&idx=1&sn=41f941e481be6a7ccd26ad734c8d7a13&chksm=84e1ab0cb396221a59bce5c4ee842c9326968daf4aea1c7d24e55ed8879789c3ef34a7ce5ed1#rd 独家专访赫行学校2025年中考“双黄蛋”!学霸靠啥杀出重围? 2025-07-14 18:36:34 http://mp.weixin.qq.com/s?__biz=MzA3MjQ1Mjg2MQ==&mid=2651526302&idx=2&sn=c7733f7c2c6331e51e55af695f99a43e&chksm=84e1ab0cb396221a7d185dcb99acc9dce45cc5c66c3eef42680a215b710bb9bfa9fd10da4419#rd 长春40所学校中考成绩曝光!700+成批涌现!谁是最大黑马? 2025-07-13 18:48:27 http://mp.weixin.qq.com/s?__biz=MzA3MjQ1Mjg2MQ==&mid=2651526287&idx=1&sn=1f314640ae6eec236b0e16271bd44362&chksm=84e1ab1db396220b73ae08898a026d887436501a6c42abe01d7fa4aef9063533fad89720d3b8#rd