From 24de098979caeea9e6c0add92c234cbbc2126229 Mon Sep 17 00:00:00 2001
From: HuangHai <10402852@qq.com>
Date: Tue, 15 Jul 2025 11:15:40 +0800
Subject: [PATCH] 'commit'

---
 dsLightRag/Util/WxGzhUtil.py                  | 53 ------------------
 .../__pycache__/WxGzhUtil.cpython-310.pyc     | Bin 2483 -> 1364 bytes
 dsLightRag/WxGzh/T2_GetArticleList.py         | 37 +++++-------
 3 files changed, 13 insertions(+), 77 deletions(-)

diff --git a/dsLightRag/Util/WxGzhUtil.py b/dsLightRag/Util/WxGzhUtil.py
index d4bf9138..07abec69 100644
--- a/dsLightRag/Util/WxGzhUtil.py
+++ b/dsLightRag/Util/WxGzhUtil.py
@@ -1,6 +1,3 @@
-import datetime
-import random
-import requests
 from selenium import webdriver
 from selenium.webdriver.chrome.options import Options
 from selenium.webdriver.chrome.service import Service as ChromeService
@@ -13,56 +10,6 @@ def init_wechat_browser():
     service = ChromeService(executable_path=r"C:\Windows\System32\chromedriver.exe")
     return webdriver.Chrome(service=service, options=options)
 
-def get_wechat_articles(account_name, account_id, token, cookies, header):
-    """Fetch the article list of the given WeChat official account."""
-    article_urls = []
-
-    # Endpoint for searching WeChat official accounts
-    search_url = 'https://mp.weixin.qq.com/cgi-bin/searchbiz?'
-    query_id = {
-        'action': 'search_biz',
-        'token': token,
-        'lang': 'zh_CN',
-        'f': 'json',
-        'ajax': '1',
-        'random': random.random(),
-        'query': account_name,
-        'begin': '0',
-        'count': '5'
-    }
-
-    # Full search-and-fetch logic
-    search_response = requests.get(search_url, cookies=cookies, headers=header, params=query_id)
-    lists = search_response.json().get('list')[0]
-    fakeid = lists.get('fakeid')
-
-    # WeChat official-account article list endpoint
-    appmsg_url = 'https://mp.weixin.qq.com/cgi-bin/appmsg?'
-    query_id_data = {
-        'token': token,
-        'lang': 'zh_CN',
-        'f': 'json',
-        'ajax': '1',
-        'random': random.random(),
-        'action': 'list_ex',
-        'begin': '0',
-        'count': '5',
-        'query': '',
-        'fakeid': fakeid,
-        'type': '9'
-    }
-
-    query_fakeid_response = requests.get(appmsg_url, cookies=cookies, headers=header, params=query_id_data)
-    fakeid_list = query_fakeid_response.json().get('app_msg_list')
-
-    for item in fakeid_list:
-        article_urls.append({
-            'title': item.get('title'),
-            'url': item.get('link'),
-            'publish_time': datetime.datetime.fromtimestamp(int(item.get("update_time"))).strftime('%Y-%m-%d %H:%M:%S')
-        })
-
-    return article_urls
 
 def get_article_content(url):
     """
diff --git a/dsLightRag/Util/__pycache__/WxGzhUtil.cpython-310.pyc b/dsLightRag/Util/__pycache__/WxGzhUtil.cpython-310.pyc
index b9742d49875cc692f0c4f8f3f838f7f92bca1af9..5075ff6ce5e093e8119550083361d00de35b186a 100644
GIT binary patch
delta 198
[base85 delta payload not reproduced]
delta 1323
[base85 delta payload not reproduced]
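The diffstat indicates the article-list logic leaves dsLightRag/Util/WxGzhUtil.py and is absorbed into dsLightRag/WxGzh/T2_GetArticleList.py, but that hunk is not included above. For reference, below is a minimal sketch of how a caller could fetch an account's article list directly with requests, reusing the same searchbiz/appmsg endpoints and parameters that the removed get_wechat_articles() helper used. The function name fetch_article_list and its signature are hypothetical, not the project's actual code, and the token, cookies, and headers are assumed to come from an authenticated mp.weixin.qq.com session (for example one established through the Selenium login in WxGzhUtil.init_wechat_browser).

# Hypothetical sketch, not the project's T2_GetArticleList.py: list an official
# account's recent articles via the mp.weixin.qq.com searchbiz/appmsg endpoints.
import datetime
import random

import requests

SEARCH_URL = 'https://mp.weixin.qq.com/cgi-bin/searchbiz?'
APPMSG_URL = 'https://mp.weixin.qq.com/cgi-bin/appmsg?'


def fetch_article_list(account_name, token, cookies, headers, count=5):
    """Return [{'title', 'url', 'publish_time'}, ...] for the given account."""
    # 1) Resolve the account's fakeid via the searchbiz endpoint.
    search_params = {
        'action': 'search_biz', 'token': token, 'lang': 'zh_CN', 'f': 'json',
        'ajax': '1', 'random': random.random(), 'query': account_name,
        'begin': '0', 'count': '5',
    }
    resp = requests.get(SEARCH_URL, cookies=cookies, headers=headers,
                        params=search_params)
    fakeid = resp.json()['list'][0]['fakeid']

    # 2) Fetch the account's published articles via the appmsg endpoint.
    list_params = {
        'token': token, 'lang': 'zh_CN', 'f': 'json', 'ajax': '1',
        'random': random.random(), 'action': 'list_ex', 'begin': '0',
        'count': str(count), 'query': '', 'fakeid': fakeid, 'type': '9',
    }
    resp = requests.get(APPMSG_URL, cookies=cookies, headers=headers,
                        params=list_params)

    articles = []
    for item in resp.json().get('app_msg_list', []):
        articles.append({
            'title': item.get('title'),
            'url': item.get('link'),
            'publish_time': datetime.datetime.fromtimestamp(
                int(item.get('update_time'))).strftime('%Y-%m-%d %H:%M:%S'),
        })
    return articles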