'commit'

1 week ago · 2ad3154fe8
parent af3f8098c4
commit 2ad3154fe8
6 changed files with 122 additions and 55 deletions
--- a/dsLightRag/Util/WxGzhUtil.py
+++ b/dsLightRag/Util/WxGzhUtil.py
@ -0,0 +1,100 @@
+import datetime
+import random
+import requests
+from selenium import webdriver
+from selenium.webdriver.chrome.options import Options
+from selenium.webdriver.chrome.service import Service as ChromeService
+from selenium.webdriver.common.by import By
+
+def init_wechat_browser(driver_path=r"C:\Windows\System32\chromedriver.exe"):
+    """Create a headless Chrome WebDriver for the WeChat crawler.
+
+    :param driver_path: Path to the chromedriver executable. The default is
+        the path this function previously hard-coded, so callers that pass
+        no argument get the same driver as before.
+    :return: A started ``webdriver.Chrome`` instance. This function never
+        quits it; the caller is responsible for calling ``quit()``.
+    """
+    options = Options()
+    options.add_argument('-headless')
+    service = ChromeService(executable_path=driver_path)
+    return webdriver.Chrome(service=service, options=options)
+
+def get_wechat_articles(account_name, account_id, token, cookies, header):
+    """Fetch the first page of articles for one WeChat official account.
+
+    Searches for the account by name through the ``searchbiz`` endpoint,
+    takes the ``fakeid`` of the first hit, then requests that account's
+    article list from the ``appmsg`` endpoint (``begin=0``, ``count=5``).
+
+    :param account_name: Account name, sent as the search ``query``.
+    :param account_id: Not used by this function; kept so existing callers
+        keep working.
+    :param token: Value sent as the ``token`` query parameter of both requests.
+    :param cookies: Cookies passed to ``requests.get``.
+    :param header: HTTP headers passed to ``requests.get``.
+    :return: List of dicts with the keys ``title``, ``url`` and
+        ``publish_time`` (formatted ``'%Y-%m-%d %H:%M:%S'`` from the item's
+        ``update_time``). The list is empty when the search yields no hit or
+        a response carries no article list.
+    """
+    # Step 1: resolve the account name to the fakeid used by the article API.
+    search_url = 'https://mp.weixin.qq.com/cgi-bin/searchbiz?'
+    search_params = {
+        'action': 'search_biz',
+        'token': token,
+        'lang': 'zh_CN',
+        'f': 'json',
+        'ajax': '1',
+        'random': random.random(),
+        'query': account_name,
+        'begin': '0',
+        'count': '5',
+    }
+    search_response = requests.get(search_url, cookies=cookies, headers=header, params=search_params)
+    # A missing or empty 'list' previously crashed on the [0] lookup
+    # (TypeError / IndexError); treat it as "no articles" instead.
+    matches = search_response.json().get('list') or []
+    if not matches:
+        return []
+    fakeid = matches[0].get('fakeid')
+
+    # Step 2: list the articles published by that account.
+    appmsg_url = 'https://mp.weixin.qq.com/cgi-bin/appmsg?'
+    list_params = {
+        'token': token,
+        'lang': 'zh_CN',
+        'f': 'json',
+        'ajax': '1',
+        'random': random.random(),
+        'action': 'list_ex',
+        'begin': '0',
+        'count': '5',
+        'query': '',
+        'fakeid': fakeid,
+        'type': '9',
+    }
+    list_response = requests.get(appmsg_url, cookies=cookies, headers=header, params=list_params)
+    # 'app_msg_list' may be absent; iterate an empty list rather than None.
+    messages = list_response.json().get('app_msg_list') or []
+
+    return [
+        {
+            'title': item.get('title'),
+            'url': item.get('link'),
+            'publish_time': datetime.datetime.fromtimestamp(int(item.get("update_time"))).strftime('%Y-%m-%d %H:%M:%S'),
+        }
+        for item in messages
+    ]
+
+def get_article_content(url):
+    """
+    Fetch the text of a WeChat official-account article.
+
+    Opens the page in a headless Chrome instance and reads the text of the
+    element with class ``rich_media``. Everything up to and including the
+    first blank line is dropped; if the text has no blank line it is kept
+    whole. Doubled newlines are then collapsed in a single replace pass.
+
+    :param url: Article URL
+    :return: Article text
+    """
+    # Use the shared browser factory instead of duplicating its setup here.
+    driver = init_wechat_browser()
+
+    try:
+        driver.get(url)
+        page_text = driver.find_element(By.CLASS_NAME, "rich_media").text
+
+        lines = page_text.split('\n')
+        # Index of the first blank (whitespace-only) line, or None if absent.
+        first_blank = next(
+            (index for index, line in enumerate(lines) if line.strip() == ""),
+            None,
+        )
+
+        if first_blank is None:
+            body = page_text
+        else:
+            # Every kept line gets a trailing newline, as the original loop did.
+            body = "".join(line + "\n" for line in lines[first_blank + 1:])
+
+        return body.replace("\n\n", "\n")
+    finally:
+        # Always release the browser process, even when the page fails to load.
+        driver.quit()
--- a/dsLightRag/Util/pycache/WxGzhUtil.cpython-310.pyc
+++ b/dsLightRag/Util/pycache/WxGzhUtil.cpython-310.pyc
--- a/dsLightRag/WxGzh/T2_GetArticleList.py
+++ b/dsLightRag/WxGzh/T2_GetArticleList.py
@ -14,6 +14,7 @@ import re
 import requests
 import asyncio
 from Util.PostgreSQLUtil import init_postgres_pool
+from Util.WxGzhUtil import init_wechat_browser, get_wechat_articles

 async def get_wechat_sources():
    """从t_wechat_source表获取微信公众号列表"""
@ -78,7 +79,9 @@ if __name__ == '__main__':
    }

    service = ChromeService(executable_path=r"C:\Windows\System32\chromedriver.exe")
-    driver = webdriver.Chrome(service=service, options=options)
+    driver = webdriver.Chrome(service=service, options=options)  # 删除这行
+    # 使用统一的初始化方式
+    driver = init_wechat_browser()

    # 方法3：使用requests库发送请求获取重定向URL
    url = 'https://mp.weixin.qq.com'
@ -93,7 +96,10 @@ if __name__ == '__main__':
            logging.info("微信token:" + token)

    article_urls = []
-    # 替换硬编码的gzlist
+    # 初始化浏览器
+    driver = init_wechat_browser()
+    
+    # 获取公众号列表
    loop = asyncio.new_event_loop()
    asyncio.set_event_loop(loop)
    try:
@ -101,7 +107,9 @@ if __name__ == '__main__':
    finally:
        loop.close()
    
+    # 爬取文章
    for item in gzlist:
+        article_urls = get_wechat_articles(item["account_name"], item["account_id"], token, cookies, header)
        account_name = item["account_name"]
        account_id = item["account_id"]
        # 搜索微信公众号的接口地址
@ -171,4 +179,3 @@ if __name__ == '__main__':

        # 关闭浏览器
    driver.quit()
-    print("所有文章爬取完成！")
--- a/dsLightRag/WxGzh/Util/WxGzhUtil.py
+++ b/dsLightRag/WxGzh/Util/WxGzhUtil.py
@ -1,45 +0,0 @@
-from selenium import webdriver
-from selenium.webdriver.chrome.options import Options
-from selenium.webdriver.chrome.service import Service as ChromeService
-from selenium.webdriver.common.by import By
-
-def get_article_content(url):
-    """
-    获取微信公众号文章内容
-    :param url: 文章URL
-    :return: 文章内容文本
-    """
-    options = Options()
-    options.add_argument('-headless')
-    service = ChromeService(executable_path=r"C:\Windows\System32\chromedriver.exe")
-    driver = webdriver.Chrome(service=service, options=options)
-    
-    try:
-        driver.get(url)
-        html_content = driver.find_element(By.CLASS_NAME, "rich_media").text
-        
-        # 处理内容，提取空行后的文本
-        lines = html_content.split('\n')
-        content_after_empty_line = ""
-        found_empty_line = False
-
-        for line in lines:
-            if not found_empty_line and line.strip() == "":
-                found_empty_line = True
-                continue
-
-            if found_empty_line:
-                content_after_empty_line += line + "\n"
-
-        if not found_empty_line:
-            content_after_empty_line = html_content
-
-        return content_after_empty_line.replace("\n\n", "\n")
-    finally:
-        driver.quit()
-
-if __name__ == '__main__':
-    # 示例用法
-    url = 'http://mp.weixin.qq.com/s?__biz=MzA3MjQ1Mjg2MQ==&mid=2651526302&idx=1&sn=41f941e481be6a7ccd26ad734c8d7a13&chksm=84e1ab0cb396221a59bce5c4ee842c9326968daf4aea1c7d24e55ed8879789c3ef34a7ce5ed1#rd'
-    content = get_article_content(url)
-    print(content)
--- a/dsLightRag/WxGzh/Util/init.py
+++ b/dsLightRag/WxGzh/Util/init.py
--- a/dsLightRag/WxGzh/article_urls.txt
+++ b/dsLightRag/WxGzh/article_urls.txt
@ -1,9 +1,14 @@
-明日（16日）公布高中一批次录取结果，查询通道在此，请收好！ 2025-07-15 08:38:04 http://mp.weixin.qq.com/s?__biz=MzU5OTQ0MzEzOA==&mid=2247546664&idx=1&sn=431c1f89b968ddff2165466ce20b2976&chksm=feb6a485c9c12d930da2340a813d24d5dd168688af162b4e7bdcd42d5c31d832ef9dc915b1dc#rd
-长春2024-2025九上试卷合集（赠答案） 2025-07-15 08:38:04 http://mp.weixin.qq.com/s?__biz=MzU5OTQ0MzEzOA==&mid=2247546664&idx=2&sn=ffd3261777f03cdfcd78e7a899f25778&chksm=feb6a485c9c12d93a16db0ce8d9850cc843b22442b5d49850f93c99efbbc160855ff1a2314cf#rd
-网传各初中2025中考最高分准吗？ 2025-07-13 09:22:08 http://mp.weixin.qq.com/s?__biz=MzU5OTQ0MzEzOA==&mid=2247546662&idx=1&sn=5c8a0cc82f0aab69a600d06b6e63a57f&chksm=feb6a48bc9c12d9da8ed3b2a19d12fa275f83796201448996bdfa2204bf41f7d826430ae30be#rd
-长春市2025年中考各批次录取最低控制线确定 2025-07-12 10:05:49 http://mp.weixin.qq.com/s?__biz=MzU5OTQ0MzEzOA==&mid=2247546644&idx=1&sn=ea3371033b95e7203e881947c980a8a4&chksm=feb6a4b9c9c12daf64cbd87239cd2fdc22a93e6e0d2555ce3c5b66bc0e96028bfc3565021201#rd
-长春市2025年中考成绩将于7月12日12时公布 2025-07-11 15:13:54 http://mp.weixin.qq.com/s?__biz=MzU5OTQ0MzEzOA==&mid=2247546642&idx=1&sn=718e0fa8463273260dae093e0686b7e0&chksm=feb6a4bfc9c12da9ab59ff8d7da3caefbb44d8195bb694b31cd5b3cab8fc6505b20c61a0ae62#rd
-长春2024-2025九上试卷合集（赠答案） 2025-07-09 10:56:48 http://mp.weixin.qq.com/s?__biz=MzU5OTQ0MzEzOA==&mid=2247546544&idx=1&sn=d07bf1b38403c0578ad67ae007ce6159&chksm=feb6a51dc9c12c0b9b90a2131a9ba913b92ed2eab3dcaa78fadccb6b231e4f5cb4247750f910#rd
+长春中考上演“神仙打架”！省二力旺等五校过半考生超700分！ 2025-07-14 18:36:34 http://mp.weixin.qq.com/s?__biz=MzA3MjQ1Mjg2MQ==&mid=2651526302&idx=1&sn=41f941e481be6a7ccd26ad734c8d7a13&chksm=84e1ab0cb396221a59bce5c4ee842c9326968daf4aea1c7d24e55ed8879789c3ef34a7ce5ed1#rd
+独家专访赫行学校2025年中考“双黄蛋”！学霸靠啥杀出重围？ 2025-07-14 18:36:34 http://mp.weixin.qq.com/s?__biz=MzA3MjQ1Mjg2MQ==&mid=2651526302&idx=2&sn=c7733f7c2c6331e51e55af695f99a43e&chksm=84e1ab0cb396221a7d185dcb99acc9dce45cc5c66c3eef42680a215b710bb9bfa9fd10da4419#rd
+长春40所学校中考成绩曝光！700+成批涌现！谁是最大黑马？ 2025-07-13 18:48:27 http://mp.weixin.qq.com/s?__biz=MzA3MjQ1Mjg2MQ==&mid=2651526287&idx=1&sn=1f314640ae6eec236b0e16271bd44362&chksm=84e1ab1db396220b73ae08898a026d887436501a6c42abe01d7fa4aef9063533fad89720d3b8#rd
+喜报！长春外国语学校女子篮球队夺得冠军！ 2025-07-13 18:48:27 http://mp.weixin.qq.com/s?__biz=MzA3MjQ1Mjg2MQ==&mid=2651526287&idx=2&sn=31651043acb6ecbf4232e92e635196b6&chksm=84e1ab1db396220b0810c3bdf332128b110d1902658f2556eaeff67cec084a8a068a5ae9a275#rd
+“趣闯盛夏·探无界”！探秘一实验银河小学夏令营 2025-07-13 18:48:27 http://mp.weixin.qq.com/s?__biz=MzA3MjQ1Mjg2MQ==&mid=2651526287&idx=3&sn=8edf6ce8cebdaad55343b39639876c27&chksm=84e1ab1db396220b26b172b3b565f919f7ded4c2a5b78227294ea29a558a7666c33b8c1de660#rd
+刚刚！2025年长春中考各批次控制线公布！ 2025-07-12 10:04:32 http://mp.weixin.qq.com/s?__biz=MzA3MjQ1Mjg2MQ==&mid=2651526196&idx=1&sn=282e5e824410a9a92a83dd800cb58a7c&chksm=84e1aba6b39622b03fe6422032474c9696f83541d9ff9b8b6a9f0f099ce459da430f720d05e4#rd
+重磅消息！师大附属实验学校（经开）校长有新任命！ 2025-07-12 10:04:32 http://mp.weixin.qq.com/s?__biz=MzA3MjQ1Mjg2MQ==&mid=2651526196&idx=2&sn=9449c87935faf86ddcc5a674ea888913&chksm=84e1aba6b39622b03fd8413ff1e74b61f662ec8deb3887c2c5b8e5ad15470b15ae14b21e94ea#rd
+市教育局最新发布！长春2025年中考成绩将于7月12日公布！ 2025-07-11 15:22:13 http://mp.weixin.qq.com/s?__biz=MzA3MjQ1Mjg2MQ==&mid=2651526176&idx=1&sn=a5b4104d2fe74ace32ab31faf5f1c44c&chksm=84e1abb2b39622a40b0e0969e84fb00c753cc8ffefb8624726afa2a7352ea725c7f967bf25f5#rd
+长春市第十九中学2025年职称评聘拟通过人员名单的公示！有你认识的老师吗？ 2025-07-11 15:22:13 http://mp.weixin.qq.com/s?__biz=MzA3MjQ1Mjg2MQ==&mid=2651526176&idx=2&sn=693d13964e4be18718c0eb4fd13ba68f&chksm=84e1abb2b39622a442a9f7cea8ddc72820050b2896968f2d0ae283c7caca2dbe014a721feb2e#rd
+高分喜报频传！长春这所小学靠啥成为“学霸制造机”？ 2025-07-10 19:00:00 http://mp.weixin.qq.com/s?__biz=MzA3MjQ1Mjg2MQ==&mid=2651526158&idx=1&sn=a0af7f484d6a3300a9b7f3d787a2594d&chksm=84e1ab9cb396228a56420696eb09071ff829d58e8e31bd652f849f3cbd0ee276b0baad7a1e89#rd
+蝉联冠军！吉大尚德游泳队斩获骄人成绩！ 2025-07-10 19:00:00 http://mp.weixin.qq.com/s?__biz=MzA3MjQ1Mjg2MQ==&mid=2651526158&idx=2&sn=cabb682e99978bf2a58ff0e9e06dc53d&chksm=84e1ab9cb396228a5cd457cd7ee0728491e6b3dc34fbde02240624364cfa8a9e2c533052d2b4#rd
 长春中考上演“神仙打架”！省二力旺等五校过半考生超700分！ 2025-07-14 18:36:34 http://mp.weixin.qq.com/s?__biz=MzA3MjQ1Mjg2MQ==&mid=2651526302&idx=1&sn=41f941e481be6a7ccd26ad734c8d7a13&chksm=84e1ab0cb396221a59bce5c4ee842c9326968daf4aea1c7d24e55ed8879789c3ef34a7ce5ed1#rd
 独家专访赫行学校2025年中考“双黄蛋”！学霸靠啥杀出重围？ 2025-07-14 18:36:34 http://mp.weixin.qq.com/s?__biz=MzA3MjQ1Mjg2MQ==&mid=2651526302&idx=2&sn=c7733f7c2c6331e51e55af695f99a43e&chksm=84e1ab0cb396221a7d185dcb99acc9dce45cc5c66c3eef42680a215b710bb9bfa9fd10da4419#rd
 长春40所学校中考成绩曝光！700+成批涌现！谁是最大黑马？ 2025-07-13 18:48:27 http://mp.weixin.qq.com/s?__biz=MzA3MjQ1Mjg2MQ==&mid=2651526287&idx=1&sn=1f314640ae6eec236b0e16271bd44362&chksm=84e1ab1db396220b73ae08898a026d887436501a6c42abe01d7fa4aef9063533fad89720d3b8#rd