@@ -4,21 +4,36 @@
# 微信爬爬猫 --- analysis of the official-account article scraping code
# Reference: https://blog.csdn.net/yajuanpi4899/article/details/121584268
"""
# Check the installed selenium version
pip show selenium
4.34.2

# Check the Chrome browser version
chrome://version/
138.0.7204.101 (official build) (64-bit)

# Download the matching chromedriver package
https://googlechromelabs.github.io/chrome-for-testing/
https://storage.googleapis.com/chrome-for-testing-public/138.0.7204.94/win64/chromedriver-win64.zip
"""

import asyncio
import datetime
import json
import logging
import random
import re
import time

from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service as ChromeService
import requests

from Util.LightRagUtil import initialize_pg_rag
from Util.PostgreSQLUtil import init_postgres_pool
from Util.WxGzhUtil import init_wechat_browser, get_article_content

# Removed the duplicate logging configuration; keep only the following
logger = logging.getLogger('WxGzh')
logger.setLevel(logging.INFO)

# Make sure only one handler is added
@@ -27,6 +42,11 @@ if not logger.handlers:
    handler.setFormatter(logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s'))
    logger.addHandler(handler)

# WeChat request headers
header = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/138.0.0.0 Safari/537.36',
    'Referer': 'https://mp.weixin.qq.com/'
}


async def get_wechat_sources():
    """Fetch the list of WeChat official accounts from the t_wechat_source table"""
@@ -39,24 +59,6 @@ async def get_wechat_sources():
    await pool.close()

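# Most of get_wechat_sources is elided by the hunk above. As a rough asyncpg sketch of the
# documented behaviour (the column names are assumptions based on how the result rows are
# used later in this file; this helper is illustrative, not the project's implementation):
async def _demo_get_wechat_sources():
    pool = await init_postgres_pool()
    try:
        async with pool.acquire() as conn:
            rows = await conn.fetch('SELECT id, account_name, account_id FROM t_wechat_source')
            return [dict(r) for r in rows]
    finally:
        await pool.close()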

async def is_article_exist(pool, article_url):
    """Check whether the article URL already exists in the database"""
    try:
@@ -72,12 +74,23 @@ async def is_article_exist(pool, article_url):
        return False  # On error, default to False so the normal flow is not blocked
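
# The body of is_article_exist is not reproduced in this hunk. A generic asyncpg existence
# check could look like the sketch below; the table name t_wechat_articles is hypothetical
# and used only for illustration (the url column matches the INSERT statement further down).
async def _demo_is_article_exist(pool, article_url):
    async with pool.acquire() as conn:
        row = await conn.fetchrow(
            'SELECT 1 FROM t_wechat_articles WHERE url = $1 LIMIT 1', article_url)
        return row is not None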


async def save_article_to_db(pool, article_title, account_name, article_url, publish_time, content, id):
    # First check whether the article already exists
    if await is_article_exist(pool, article_url):
        logger.info(f"Article already exists, skipping save: {article_url}")
        return

async def save_article_to_db(pool, article_title, account_name, article_url, publish_time, content, source_id):
    # Call LightRAG here to build the knowledge base
    workspace = 'ChangChun'
    # With the PG backend this directory is unused, but the current project code requires it, so set something.
    WORKING_DIR = "./output"
    docx_name = f"{account_name}_{article_title}"  # combine source and title as the document name
    logger.info(f"Start processing document: {docx_name}")
    rag = None  # so the finally block is safe if initialization fails
    try:
        rag = await initialize_pg_rag(WORKING_DIR=WORKING_DIR, workspace=workspace)
        await rag.ainsert(input=content, file_paths=[docx_name])
    finally:
        if rag:
            await rag.finalize_storages()
        if pool:
            await pool.close()
    logger.info(f"Document saved to the knowledge base: {docx_name}")
    try:
        async with pool.acquire() as conn:
            await conn.execute('''
@@ -85,139 +98,155 @@ async def save_article_to_db(pool, article_title, account_name, article_url, pub
                (title, source, url, publish_time, content, source_id)
                VALUES ($1, $2, $3, $4, $5, $6)
            ''', article_title, account_name, article_url,
                publish_time, content, id)
                publish_time, content, source_id)
    except Exception as e:
        logging.error(f"Failed to save article: {e}")


if __name__ == '__main__':
# Load the cookies from the file cookies.txt
async def initialize_wechat_session():
    """Initialize the WeChat session and obtain cookies and token"""
    with open('cookies.txt', 'r', encoding='utf-8') as f:
        content = f.read()
    # Restore the JSON object
    cookies = json.loads(content)
    # "expiry": 1787106233
    # Check whether there is an expiry time
    global driver  # make the driver visible at module level
    expiry = cookies["expiry"]
    if expiry:
        # Convert the expiry timestamp into a readable time
        expiry_time = time.localtime(expiry)
        expiry_date = time.strftime("%Y-%m-%d %H:%M:%S", expiry_time)

        # Current timestamp
        current_timestamp = time.time()
        # Check whether the cookie has already expired
        if current_timestamp > expiry:
            logger.error("Cookie has expired")
            exit()
        # Remove the expiry attribute
        del cookies["expiry"]
        logger.info("Cookies usually expire after about 4 days; cookie expiry time: %s" % expiry_date)
    options = Options()
    options.add_argument('-headless')  # headless mode, comment out while debugging
    # Set headers - use the User-Agent of the WeChat built-in browser
    header = {
        "HOST": "mp.weixin.qq.com",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.116 Safari/537.36 QBCore/4.0.1301.400 QQBrowser/9.0.2524.400 Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2875.116 Safari/537.36 NetType/WIFI MicroMessenger/7.0.20.1781(0x6700143B) WindowsWechat(0x63010200)",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
        "Accept-Encoding": "gzip, deflate, br",
        "Accept-Language": "zh-CN,zh;q=0.8,en-US;q=0.6,en;q=0.5;q=0.4",
        "Connection": "keep-alive"
    }

    service = ChromeService(executable_path=r"C:\Windows\System32\chromedriver.exe")
    # Use the unified initialization helper
    options = Options()
    options.add_argument('-headless')
    driver = init_wechat_browser()

    # Method 3: use the requests library to obtain the redirect URL
    url = 'https://mp.weixin.qq.com'
    response = requests.get(url=url, allow_redirects=False, cookies=cookies)

    if 'Location' in response.headers:
        redirect_url = response.headers.get("Location")
        logger.info("Redirect URL: %s" % redirect_url)
        token_match = re.findall(r'token=(\d+)', redirect_url)
        if token_match:
            token = token_match[0]
            logger.info("Token obtained: %s" % token)
            return cookies, token

    return None, None


async def get_wechat_account_list(cookies, token, account_name):
    """Get the fakeid of the given official account"""
    search_url = 'https://mp.weixin.qq.com/cgi-bin/searchbiz?'
    query_id = {
        'action': 'search_biz',
        'token': token,
        'lang': 'zh_CN',
        'f': 'json',
        'ajax': '1',
        'random': random.random(),
        'query': account_name,
        'begin': '0',
        'count': '5'
    }

    search_response = requests.get(search_url, cookies=cookies, headers=header, params=query_id)
    lists = search_response.json().get('list')[0]
    return lists.get('fakeid')
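
# Sketch, not part of the original flow: the line above indexes straight into list[0], so an
# unknown account name raises an IndexError. A defensive variant could guard the lookup first
# (the helper name is hypothetical):
def _pick_first_fakeid(search_json):
    accounts = search_json.get('list') or []
    if not accounts:
        raise ValueError('searchbiz returned no account for this query')
    return accounts[0].get('fakeid')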


async def get_article_list(cookies, token, fakeid):
    """Get the article list of an official account"""
    appmsg_url = 'https://mp.weixin.qq.com/cgi-bin/appmsg?'
    query_id_data = {
        'token': token,
        'lang': 'zh_CN',
        'f': 'json',
        'ajax': '1',
        'random': random.random(),
        'action': 'list_ex',
        'begin': '0',
        'count': '5',
        'query': '',
        'fakeid': fakeid,
        'type': '9'
    }

    query_fakeid_response = requests.get(appmsg_url, cookies=cookies, headers=header, params=query_id_data)
    return query_fakeid_response.json().get('app_msg_list')
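
# get_article_list only fetches the first page. As the comment in the legacy code further down
# notes, the 'begin' parameter advances by 5 per page. A paging sketch (illustration only;
# the helper name and page count are assumptions):
def _demo_fetch_article_pages(cookies, token, fakeid, pages=3):
    collected = []
    for page in range(pages):
        params = {
            'token': token, 'lang': 'zh_CN', 'f': 'json', 'ajax': '1',
            'random': random.random(), 'action': 'list_ex',
            'begin': str(page * 5),  # 0, 5, 10, ... five articles per page
            'count': '5', 'query': '', 'fakeid': fakeid, 'type': '9'
        }
        resp = requests.get('https://mp.weixin.qq.com/cgi-bin/appmsg?',
                            cookies=cookies, headers=header, params=params)
        collected.extend(resp.json().get('app_msg_list') or [])
        time.sleep(1)  # pace the requests
    return collected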

article_urls = []


# Get the official-account list
async def process_single_article(article_info, account_info, cookies, token):
    """Process a single article"""
    article_url = article_info.get('link')
    article_title = article_info.get('title')
    publish_time = datetime.datetime.fromtimestamp(int(article_info.get("update_time")))

    if '试卷' in article_title:
        return False

    try:
        pool = await init_postgres_pool()
        # First check whether the article already exists
        if await is_article_exist(pool, article_url):
            logger.info(f'Article already exists, skipping save: {account_info["account_name"]}-{article_title}')
            return False
        content = get_article_content(article_url)
        await save_article_to_db(pool, article_title, account_info["account_name"],
                                 article_url, publish_time, content, account_info["id"])
        return True
    except Exception as e:
        logger.error(f"Error while processing article: {e}")
        return False
    finally:
        if 'pool' in locals():
            await pool.close()


async def process_wechat_account(account_info, cookies, token):
    """Process all articles of a single official account"""
    cnt = 0
    fakeid = await get_wechat_account_list(cookies, token, account_info["account_name"])
    articles = await get_article_list(cookies, token, fakeid)

    for article in articles:
        success = await process_single_article(article, account_info, cookies, token)
        if success:
            cnt += 1
        time.sleep(1)

    logger.info(f"Fetched {cnt} articles from official account {account_info['account_name']}.")
    return cnt


async def main():
    """Main loop"""
    while True:
        try:
            logger.info("Starting the WeChat official-account article collection task")
            cookies, token = await initialize_wechat_session()
            if not cookies or not token:
                logger.error("Failed to initialize the WeChat session")
                continue

            account_list = await get_wechat_sources()
            for account in account_list:
                await process_wechat_account(account, cookies, token)

            logger.info("Collection finished; waiting 30 minutes before the next run")
            await asyncio.sleep(30 * 60)  # 30 minutes
        except Exception as e:
            logger.error(f"Error in the main loop: {e}")
            await asyncio.sleep(30 * 60)  # also wait 30 minutes after an error
        finally:
            if 'driver' in globals():
                driver.quit()


if __name__ == '__main__':
    loop = asyncio.new_event_loop()
    asyncio.set_event_loop(loop)
    try:
        gzlist = loop.run_until_complete(get_wechat_sources())
        loop.run_until_complete(main())
    finally:
        loop.close()
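
# Note (sketch): on Python 3.7+ the explicit event-loop management above can be replaced with
# a single call, e.g.
#     if __name__ == '__main__':
#         asyncio.run(main())
# asyncio.run creates, runs and closes the loop itself.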

    # Crawl the articles
    for item in gzlist:
        account_name = item["account_name"]
        account_id = item["account_id"]
        id = item["id"]
        # Endpoint for searching WeChat official accounts
        search_url = 'https://mp.weixin.qq.com/cgi-bin/searchbiz?'
        # Parameters for the account-search endpoint; three of them vary: the login token, the random number and the account name to search for
        query_id = {
            'action': 'search_biz',
            'token': token,
            'lang': 'zh_CN',
            'f': 'json',
            'ajax': '1',
            'random': random.random(),
            'query': account_name,
            'begin': '0',
            'count': '5'
        }
        # Call the account-search endpoint, passing cookies, params and headers
        search_response = requests.get(search_url, cookies=cookies, headers=header, params=query_id)
        # Take the first official account from the search result
        lists = search_response.json().get('list')[0]
        # Get this account's fakeid; it is required later when crawling the articles
        fakeid = lists.get('fakeid')
        logging.info("fakeid:" + fakeid)
        # Endpoint for the official-account article list
        appmsg_url = 'https://mp.weixin.qq.com/cgi-bin/appmsg?'
        # Parameters for the article-list endpoint: the login token, the fakeid of the target account and the random number
        query_id_data = {
            'token': token,
            'lang': 'zh_CN',
            'f': 'json',
            'ajax': '1',
            'random': random.random(),
            'action': 'list_ex',
            'begin': '0',  # changes per page, increasing by 5 for each page
            'count': '5',
            'query': '',
            'fakeid': fakeid,
            'type': '9'
        }
        # Request the article-list page of the searched official account
        query_fakeid_response = requests.get(appmsg_url, cookies=cookies, headers=header, params=query_id_data)
        fakeid_list = query_fakeid_response.json().get('app_msg_list')

        for item in fakeid_list:
            article_url = item.get('link')
            article_title = item.get('title')
            publish_time = datetime.datetime.fromtimestamp(int(item.get("update_time")))

            if '试卷' in article_title:  # filter out exam papers (试卷)
                continue

            logger.info(f"Processing article: {article_title} ({publish_time})")
            content = get_article_content(article_url)

            loop = asyncio.new_event_loop()
            asyncio.set_event_loop(loop)
            try:
                pool = loop.run_until_complete(init_postgres_pool())
                loop.run_until_complete(
                    save_article_to_db(pool, article_title, account_name, article_url, publish_time, content,
                                       id))
            finally:
                loop.run_until_complete(pool.close())
                loop.close()

            time.sleep(1)
    # Close the browser
    driver.quit()