'commit'

1 week ago · 24de098979
parent 2ad3154fe8
commit 24de098979
3 changed files with 13 additions and 77 deletions
--- a/dsLightRag/Util/WxGzhUtil.py
+++ b/dsLightRag/Util/WxGzhUtil.py
@ -1,6 +1,3 @@
 import datetime
 import random
 import requests
 from selenium import webdriver
 from selenium.webdriver.chrome.options import Options
 from selenium.webdriver.chrome.service import Service as ChromeService
@ -13,56 +10,6 @@ def init_wechat_browser():
    service = ChromeService(executable_path=r"C:\Windows\System32\chromedriver.exe")
    return webdriver.Chrome(service=service, options=options)
 def get_wechat_articles(account_name, account_id, token, cookies, header):
    """Return the latest articles of a WeChat official account.

    Searches the WeChat MP backend for ``account_name``, takes the ``fakeid``
    of the first search hit, and then requests that account's article list.

    Args:
        account_name: Account name used as the search query.
        account_id: Not used by this function; kept so existing callers
            keep working.
        token: Session token sent as the ``token`` query parameter.
        cookies: Cookies passed to both HTTP requests.
        header: HTTP headers passed to both HTTP requests.

    Returns:
        A list of dicts with the keys ``title``, ``url`` and
        ``publish_time`` (local time formatted as ``%Y-%m-%d %H:%M:%S``).
        The list is empty when the search returns no account or the
        account response carries no article list.
    """
    # Endpoint used to search for an official account by name.
    search_url = 'https://mp.weixin.qq.com/cgi-bin/searchbiz?'
    search_params = {
        'action': 'search_biz',
        'token': token,
        'lang': 'zh_CN',
        'f': 'json',
        'ajax': '1',
        'random': random.random(),
        'query': account_name,
        'begin': '0',
        'count': '5',
    }
    search_response = requests.get(search_url, cookies=cookies, headers=header, params=search_params)

    # 'list' can be absent or empty; indexing [0] unconditionally would raise
    # TypeError/IndexError, so treat both cases as "no account found".
    accounts = search_response.json().get('list') or []
    if not accounts:
        return []
    fakeid = accounts[0].get('fakeid')

    # Endpoint that lists the articles published by the account.
    appmsg_url = 'https://mp.weixin.qq.com/cgi-bin/appmsg?'
    article_params = {
        'token': token,
        'lang': 'zh_CN',
        'f': 'json',
        'ajax': '1',
        'random': random.random(),
        'action': 'list_ex',
        'begin': '0',
        'count': '5',
        'query': '',
        'fakeid': fakeid,
        'type': '9',
    }
    article_response = requests.get(appmsg_url, cookies=cookies, headers=header, params=article_params)

    # A response without 'app_msg_list' yields None; fall back to an empty
    # list so the caller gets [] instead of a TypeError while iterating.
    articles = article_response.json().get('app_msg_list') or []
    return [
        {
            'title': item.get('title'),
            'url': item.get('link'),
            'publish_time': datetime.datetime.fromtimestamp(
                int(item.get("update_time"))
            ).strftime('%Y-%m-%d %H:%M:%S'),
        }
        for item in articles
    ]
 def get_article_content(url):
    """
--- a/dsLightRag/Util/pycache/WxGzhUtil.cpython-310.pyc
+++ b/dsLightRag/Util/pycache/WxGzhUtil.cpython-310.pyc
--- a/dsLightRag/WxGzh/T2_GetArticleList.py
+++ b/dsLightRag/WxGzh/T2_GetArticleList.py
@ -5,6 +5,7 @@
 # https://blog.csdn.net/yajuanpi4899/article/details/121584268
 import asyncio
 import datetime
 import json
 import logging
@ -12,9 +13,10 @@ import random
 import re
 import requests
-import asyncio
+
 from Util.PostgreSQLUtil import init_postgres_pool
-from Util.WxGzhUtil import init_wechat_browser, get_wechat_articles
+from Util.WxGzhUtil import init_wechat_browser, get_article_content
 async def get_wechat_sources():
    """从t_wechat_source表获取微信公众号列表"""
@ -109,7 +111,6 @@ if __name__ == '__main__':
    # 爬取文章
    for item in gzlist:
        article_urls = get_wechat_articles(item["account_name"], item["account_id"], token, cookies, header)
        account_name = item["account_name"]
        account_id = item["account_id"]
        # 搜索微信公众号的接口地址
@ -155,27 +156,15 @@ if __name__ == '__main__':
        for item in fakeid_list:
            # 采集item示例
-            new_article = {
+            article_url = item.get('link')
-                'title': item.get('title'),
+            article_title = item.get('title')
-                'article_url': item.get('link'),
+            publish_time = datetime.datetime.fromtimestamp(int(item.get("update_time"))).strftime('%Y-%m-%d %H:%M:%S')
-                'account_id': account_id,
+            
-                'account_name': account_name,
+            # 直接获取并显示文章内容
-                'publish_time': datetime.datetime.fromtimestamp(int(item.get("update_time"))).strftime(
+            print(f"正在处理文章: {article_title} ({publish_time})")
-                    '%Y-%m-%d %H:%M:%S'),
+            content = get_article_content(article_url)
-                'collection_time': datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
+            print(f"文章内容预览: {content[:200]}...")
-            }
+            
            logging.info("new_article:", new_article)
            article_urls.append({"title": item.get('title'), "url": item.get('link'),
                                 "publish_time": datetime.datetime.fromtimestamp(int(item.get("update_time"))).strftime(
                                     '%Y-%m-%d %H:%M:%S')})
            time.sleep(1)
    for x in article_urls:
        print(x)
    # 将返回的地址写入到文件
    with open('article_urls.txt', 'w', encoding='utf-8') as f:
        for record in article_urls:
            f.write(record['title'] + " " + record['publish_time'] + " " + record['url'] + '\n')
        # 关闭浏览器
    driver.quit()