HuangHai 4 days ago
commit 9bda432e86

@ -1,12 +1,10 @@
<?xml version="1.0" encoding="UTF-8"?>
<module type="PYTHON_MODULE" version="4">
<component name="NewModuleRootManager">
<content url="file://$MODULE_DIR$" />
<orderEntry type="jdk" jdkName="D:\anaconda3\envs\py310" jdkType="Python SDK" />
<orderEntry type="sourceFolder" forTests="false" />
</component>
<module version="4">
<component name="PyDocumentationSettings">
<option name="format" value="PLAIN" />
<option name="myDocStringFormat" value="Plain" />
</component>
<component name="TemplatesService">
<option name="TEMPLATE_CONFIGURATION" value="Jinja2" />
</component>
</module>

@ -3,5 +3,5 @@
<component name="Black">
<option name="sdkName" value="D:\anaconda3\envs\lightrag" />
</component>
<component name="ProjectRootManager" version="2" project-jdk-name="Python 3.10 (5)" project-jdk-type="Python SDK" />
<component name="ProjectRootManager" version="2" project-jdk-name="Python 3.10" project-jdk-type="Python SDK" />
</project>

@ -537,5 +537,18 @@
"create_time": 1752818371,
"update_time": 1752818371,
"_id": "hybrid:keywords:37a1721ae1b1e0a8858a85a94a139d9a"
},
"hybrid:keywords:6d5e4e5a7a264a947d2e6e9cb8befec7": {
"return": "{\"high_level_keywords\": [\"\\u51e0\\u4f55\\u8bc1\\u660e\", \"\\u4e09\\u89d2\\u5f62\", \"\\u5185\\u90e8\\u70b9\", \"\\u89d2\\u5ea6\\u5173\\u7cfb\"], \"low_level_keywords\": [\"\\u4e09\\u89d2\\u5f62ABC\", \"\\u70b9P\", \"\\u2220BPC\", \"\\u2220A\"]}",
"cache_type": "keywords",
"chunk_id": null,
"embedding": null,
"embedding_shape": null,
"embedding_min": null,
"embedding_max": null,
"original_prompt": "求证在三角形ABC中P为其内部任意一点。请证明∠BPC > ∠A。",
"create_time": 1752821457,
"update_time": 1752821457,
"_id": "hybrid:keywords:6d5e4e5a7a264a947d2e6e9cb8befec7"
}
}

@ -1,78 +0,0 @@
# 详解Python + Selenium 批量采集微信公众号搭建自己的微信公众号每日AI简报告别信息焦虑
# https://blog.csdn.net/k352733625/article/details/149222945
# 微信爬爬猫---公众号文章抓取代码分析
# https://blog.csdn.net/yajuanpi4899/article/details/121584268
import json
import logging
from torch.distributed.elastic.timer import expires
"""
# 查看selenium版本
pip show selenium
4.34.2
# 查看Chrome浏览器版本
chrome://version/
138.0.7204.101 (正式版本) 64
# 下载驱动包
https://googlechromelabs.github.io/chrome-for-testing/
https://storage.googleapis.com/chrome-for-testing-public/138.0.7204.94/win64/chromedriver-win64.zip
"""
import time
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service as ChromeService
if __name__ == '__main__':
# 定义一个空的字典存放cookies内容
cookies = {}
# 设置headers - 使用微信内置浏览器的User-Agent
header = {
"HOST": "mp.weixin.qq.com",
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.116 Safari/537.36 QBCore/4.0.1301.400 QQBrowser/9.0.2524.400 Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2875.116 Safari/537.36 NetType/WIFI MicroMessenger/7.0.20.1781(0x6700143B) WindowsWechat(0x63010200)",
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
"Accept-Encoding": "gzip, deflate, br",
"Accept-Language": "zh-CN,zh;q=0.8,en-US;q=0.6,en;q=0.5;q=0.4",
"Connection": "keep-alive"
}
# 用webdriver启动谷歌浏览器
logging.info("启动浏览器,打开微信公众号登录界面")
options = Options()
service = ChromeService(executable_path=r"C:\Windows\System32\chromedriver.exe")
driver = webdriver.Chrome(service=service, options=options)
# 打开微信公众号登录页面
driver.get('https://mp.weixin.qq.com/')
# 等待5秒钟
time.sleep(2)
# # 拿手机扫二维码!
logging.info("请拿手机扫码二维码登录公众号")
time.sleep(20)
# 重新载入公众号登录页登录之后会显示公众号后台首页从这个返回内容中获取cookies信息
driver.get('https://mp.weixin.qq.com/')
# 获取cookies
cookie_items = driver.get_cookies()
expiry=-1
# 获取到的cookies是列表形式将cookies转成json形式并存入本地名为cookie的文本中
for cookie_item in cookie_items:
cookies[cookie_item['name']] = cookie_item['value']
if('expiry' in cookie_item and cookie_item['expiry'] > expiry):
expiry = cookie_item['expiry']
if "slave_sid" not in cookies:
logging.info("登录公众号失败获取cookie失败")
exit()
# 将cookies写入文件
cookies["expiry"] = expiry
with open('cookies.txt', mode='w', encoding="utf-8") as f:
f.write(json.dumps(cookies, indent=4, ensure_ascii=False))
# 关闭浏览器
driver.quit()
# 输出提示
print("成功获取了cookies内容")

@ -1,251 +0,0 @@
# 详解Python + Selenium 批量采集微信公众号搭建自己的微信公众号每日AI简报告别信息焦虑
# https://blog.csdn.net/k352733625/article/details/149222945
# 微信爬爬猫---公众号文章抓取代码分析
# https://blog.csdn.net/yajuanpi4899/article/details/121584268
"""
# 查看selenium版本
pip show selenium
4.34.2
# 查看Chrome浏览器版本
chrome://version/
138.0.7204.101 (正式版本) 64
# 下载驱动包
https://googlechromelabs.github.io/chrome-for-testing/
https://storage.googleapis.com/chrome-for-testing-public/138.0.7204.94/win64/chromedriver-win64.zip
"""
import asyncio
import datetime
import json
import logging
import random
import re
import time
import requests
from selenium.webdriver.chrome.options import Options
from Util.LightRagUtil import initialize_pg_rag, initialize_rag
from Util.PostgreSQLUtil import init_postgres_pool
from Util.WxGzhUtil import init_wechat_browser, get_article_content
# 删除重复的日志配置,只保留以下内容
logger = logging.getLogger('WxGzh')
logger.setLevel(logging.INFO)
# 确保只添加一个handler
if not logger.handlers:
handler = logging.StreamHandler()
handler.setFormatter(logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s'))
logger.addHandler(handler)
# 添加微信请求头
header = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/138.0.0.0 Safari/537.36',
'Referer': 'https://mp.weixin.qq.com/'
}
async def get_wechat_sources():
"""从t_wechat_source表获取微信公众号列表"""
try:
pool = await init_postgres_pool()
async with pool.acquire() as conn:
rows = await conn.fetch('SELECT * FROM t_wechat_source')
return [dict(row) for row in rows]
finally:
await pool.close()
async def is_article_exist(pool, article_url):
"""检查文章URL是否已存在数据库中"""
try:
async with pool.acquire() as conn:
row = await conn.fetchrow('''
SELECT 1
FROM t_wechat_articles
WHERE url = $1 LIMIT 1
''', article_url)
return row is not None
except Exception as e:
logging.error(f"检查文章存在性失败: {e}")
return False # 出错时默认返回False避免影响正常流程
async def save_article_to_db(pool, article_title, account_name, article_url, publish_time, content, source_id):
# 在这里调用 lightrag进行知识库构建
WORKING_DIR = f"../Topic/ChangChun"
docx_name = f"{account_name}_{article_title}" # 组合来源和标题作为文档名
logger.info(f"开始处理文档: {docx_name}")
try:
# 注意默认设置使用NetworkX
rag = await initialize_rag(WORKING_DIR)
await rag.ainsert(content)
logger.info(f"索引完成: {docx_name}")
except Exception as e:
print(f"An error occurred: {e}")
finally:
await rag.finalize_storages()
try:
async with pool.acquire() as conn:
await conn.execute('''
INSERT INTO t_wechat_articles
(title, source, url, publish_time, content, source_id)
VALUES ($1, $2, $3, $4, $5, $6)
''', article_title, account_name, article_url,
publish_time, content, source_id)
logger.info(f"保存文档到知识库成功: {docx_name}")
except Exception as e:
logging.error(f"保存文章失败: {e}")
async def initialize_wechat_session():
"""初始化微信会话获取cookies和token"""
with open('cookies.txt', 'r', encoding='utf-8') as f:
content = f.read()
cookies = json.loads(content)
global driver # 添加这行
expiry = cookies["expiry"]
if expiry:
current_timestamp = time.time()
if current_timestamp > expiry:
logger.error("Cookie已过期")
exit()
del cookies["expiry"]
options = Options()
options.add_argument('-headless')
driver = init_wechat_browser()
url = 'https://mp.weixin.qq.com'
response = requests.get(url=url, allow_redirects=False, cookies=cookies)
if 'Location' in response.headers:
redirect_url = response.headers.get("Location")
token_match = re.findall(r'token=(\d+)', redirect_url)
if token_match:
token = token_match[0]
return cookies, token
return None, None
async def get_wechat_account_list(cookies, token, account_name):
"""获取指定公众号的fakeid"""
search_url = 'https://mp.weixin.qq.com/cgi-bin/searchbiz?'
query_id = {
'action': 'search_biz',
'token': token,
'lang': 'zh_CN',
'f': 'json',
'ajax': '1',
'random': random.random(),
'query': account_name,
'begin': '0',
'count': '5'
}
search_response = requests.get(search_url, cookies=cookies, headers=header, params=query_id)
lists = search_response.json().get('list')[0]
return lists.get('fakeid')
async def get_article_list(cookies, token, fakeid):
"""获取公众号文章列表"""
appmsg_url = 'https://mp.weixin.qq.com/cgi-bin/appmsg?'
query_id_data = {
'token': token,
'lang': 'zh_CN',
'f': 'json',
'ajax': '1',
'random': random.random(),
'action': 'list_ex',
'begin': '0',
'count': '5',
'query': '',
'fakeid': fakeid,
'type': '9'
}
query_fakeid_response = requests.get(appmsg_url, cookies=cookies, headers=header, params=query_id_data)
return query_fakeid_response.json().get('app_msg_list')
async def process_single_article(article_info, account_info, cookies, token):
"""处理单篇文章"""
article_url = article_info.get('link')
article_title = article_info.get('title')
publish_time = datetime.datetime.fromtimestamp(int(article_info.get("update_time")))
if '试卷' in article_title:
return False
try:
pool = await init_postgres_pool()
# 先检查文章是否已存在
if await is_article_exist(pool, article_url):
logger.info(f'文章已存在,跳过保存: {account_info["account_name"]}-{article_title}')
return False
content = get_article_content(article_url)
await save_article_to_db(pool, article_title, account_info["account_name"],
article_url, publish_time, content, account_info["id"])
return True
except Exception as e:
logger.error(f"处理文章时出错: {e}")
return False
finally:
if 'pool' in locals():
await pool.close()
async def process_wechat_account(account_info, cookies, token):
"""处理单个公众号的所有文章"""
cnt = 0
fakeid = await get_wechat_account_list(cookies, token, account_info["account_name"])
articles = await get_article_list(cookies, token, fakeid)
for article in articles:
success = await process_single_article(article, account_info, cookies, token)
if success:
cnt += 1
time.sleep(1)
logger.info(f"成功获取公众号: {account_info['account_name']} {cnt}篇文章。")
return cnt
async def main():
"""主函数"""
while True:
try:
logger.info("开始执行微信公众号文章采集任务")
cookies, token = await initialize_wechat_session()
if not cookies or not token:
logger.error("初始化微信会话失败")
continue
account_list = await get_wechat_sources()
for account in account_list:
await process_wechat_account(account, cookies, token)
logger.info("本次采集任务完成等待30分钟后再次执行")
await asyncio.sleep(30 * 60) # 30分钟
except Exception as e:
logger.error(f"主循环发生错误: {e}")
await asyncio.sleep(30 * 60) # 出错后也等待30分钟
finally:
if 'driver' in globals():
driver.quit()
if __name__ == '__main__':
loop = asyncio.new_event_loop()
asyncio.set_event_loop(loop)
try:
loop.run_until_complete(main())
finally:
loop.close()

@ -1,17 +0,0 @@
{
"_clsk": "2gtve8|1752546228205|1|1|mp.weixin.qq.com/weheat-agent/payload/record",
"xid": "16332bed01be1055e236ad45b33af8df",
"data_bizuin": "3514353238",
"slave_user": "gh_4f88a4e194da",
"slave_sid": "QzBRX1FWTXNMaEdJYnc4ODBaM3FJU3RRbjVJNFE2N2IzMXFyVGlRQ0V5YklvNGFOc3NBWHdjV2J5OVg5U0JBVXdfdGhSU3lObXRheG1TdFUyXzVFcTFYS3E1NTh2aTlnSlBOOUluMUljUnBkYktjeUJDM216WVJNYzJKQkx2eW9Ib1duUk1yWXI3RndTa2dK",
"rand_info": "CAESIFwUSYus3XR5tFa1+b5ytJeuGAQS02d07zNBJNfi+Ftk",
"data_ticket": "9gQ088/vC7+jqxfFxBKS2aRx/JjmzJt+8HyuDLJtQBgpVej1hfSG1A0FQKWBbHQh",
"bizuin": "3514353238",
"mm_lang": "zh_CN",
"slave_bizuin": "3514353238",
"uuid": "8c5dc8e06af66d00a4b8e8596c8662eb",
"ua_id": "y1HZNMSzYCWuaUJDAAAAAApPVJ0a_arX_A5zqoUh6P8=",
"wxuin": "52546211515015",
"_clck": "msq32d|1|fxm|0",
"expiry": 1787106233
}

Binary file not shown.
Loading…
Cancel
Save