main
HuangHai 1 week ago
parent 2ad3154fe8
commit 24de098979

@ -1,6 +1,3 @@
import datetime
import random
import requests
from selenium import webdriver from selenium import webdriver
from selenium.webdriver.chrome.options import Options from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service as ChromeService from selenium.webdriver.chrome.service import Service as ChromeService
@ -13,56 +10,6 @@ def init_wechat_browser():
service = ChromeService(executable_path=r"C:\Windows\System32\chromedriver.exe") service = ChromeService(executable_path=r"C:\Windows\System32\chromedriver.exe")
return webdriver.Chrome(service=service, options=options) return webdriver.Chrome(service=service, options=options)
def get_wechat_articles(account_name, account_id, token, cookies, header, timeout=30):
    """Return the article list of a WeChat official account.

    The account is first looked up by name through the ``searchbiz``
    endpoint; the ``fakeid`` of the first search hit is then used to query
    the ``appmsg`` endpoint for that account's articles.

    Args:
        account_name: Name sent as the ``query`` parameter of the search.
        account_id: Accepted for interface compatibility; not used here.
        token: Value sent as the ``token`` parameter of both requests.
        cookies: Cookies forwarded to ``requests.get``.
        header: HTTP headers forwarded to ``requests.get``.
        timeout: Seconds passed as ``timeout`` to each ``requests.get`` call
            so a stalled connection cannot block the caller forever.

    Returns:
        A list of dicts with the keys ``title``, ``url`` and
        ``publish_time`` (``update_time`` rendered as
        ``%Y-%m-%d %H:%M:%S`` via ``datetime.fromtimestamp``). The list is
        empty when the search yields no account or the article response
        carries no ``app_msg_list``.
    """
    # Endpoint used to search for an official account by name.
    search_url = 'https://mp.weixin.qq.com/cgi-bin/searchbiz?'
    search_params = {
        'action': 'search_biz',
        'token': token,
        'lang': 'zh_CN',
        'f': 'json',
        'ajax': '1',
        'random': random.random(),
        'query': account_name,
        'begin': '0',
        'count': '5',
    }
    search_response = requests.get(
        search_url,
        cookies=cookies,
        headers=header,
        params=search_params,
        timeout=timeout,
    )
    # 'list' may be absent or empty (e.g. no match); avoid indexing into it
    # blindly, which previously raised TypeError/IndexError.
    accounts = search_response.json().get('list') or []
    if not accounts:
        return []
    fakeid = accounts[0].get('fakeid')

    # Endpoint that lists the articles of the account identified by fakeid.
    appmsg_url = 'https://mp.weixin.qq.com/cgi-bin/appmsg?'
    article_params = {
        'token': token,
        'lang': 'zh_CN',
        'f': 'json',
        'ajax': '1',
        'random': random.random(),
        'action': 'list_ex',
        'begin': '0',
        'count': '5',
        'query': '',
        'fakeid': fakeid,
        'type': '9',
    }
    article_response = requests.get(
        appmsg_url,
        cookies=cookies,
        headers=header,
        params=article_params,
        timeout=timeout,
    )
    # A response without 'app_msg_list' is treated as "no articles" instead
    # of failing on iteration over None.
    items = article_response.json().get('app_msg_list') or []
    return [
        {
            'title': item.get('title'),
            'url': item.get('link'),
            'publish_time': datetime.datetime.fromtimestamp(
                int(item.get("update_time"))
            ).strftime('%Y-%m-%d %H:%M:%S'),
        }
        for item in items
    ]
def get_article_content(url): def get_article_content(url):
""" """

@ -5,6 +5,7 @@
# https://blog.csdn.net/yajuanpi4899/article/details/121584268 # https://blog.csdn.net/yajuanpi4899/article/details/121584268
import asyncio
import datetime import datetime
import json import json
import logging import logging
@ -12,9 +13,10 @@ import random
import re import re
import requests import requests
import asyncio
from Util.PostgreSQLUtil import init_postgres_pool from Util.PostgreSQLUtil import init_postgres_pool
from Util.WxGzhUtil import init_wechat_browser, get_wechat_articles from Util.WxGzhUtil import init_wechat_browser, get_article_content
async def get_wechat_sources(): async def get_wechat_sources():
"""从t_wechat_source表获取微信公众号列表""" """从t_wechat_source表获取微信公众号列表"""
@ -109,7 +111,6 @@ if __name__ == '__main__':
# 爬取文章 # 爬取文章
for item in gzlist: for item in gzlist:
article_urls = get_wechat_articles(item["account_name"], item["account_id"], token, cookies, header)
account_name = item["account_name"] account_name = item["account_name"]
account_id = item["account_id"] account_id = item["account_id"]
# 搜索微信公众号的接口地址 # 搜索微信公众号的接口地址
@ -155,27 +156,15 @@ if __name__ == '__main__':
for item in fakeid_list: for item in fakeid_list:
# 采集item示例 # 采集item示例
new_article = { article_url = item.get('link')
'title': item.get('title'), article_title = item.get('title')
'article_url': item.get('link'), publish_time = datetime.datetime.fromtimestamp(int(item.get("update_time"))).strftime('%Y-%m-%d %H:%M:%S')
'account_id': account_id,
'account_name': account_name, # 直接获取并显示文章内容
'publish_time': datetime.datetime.fromtimestamp(int(item.get("update_time"))).strftime( print(f"正在处理文章: {article_title} ({publish_time})")
'%Y-%m-%d %H:%M:%S'), content = get_article_content(article_url)
'collection_time': datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S') print(f"文章内容预览: {content[:200]}...")
}
logging.info("new_article:", new_article)
article_urls.append({"title": item.get('title'), "url": item.get('link'),
"publish_time": datetime.datetime.fromtimestamp(int(item.get("update_time"))).strftime(
'%Y-%m-%d %H:%M:%S')})
time.sleep(1) time.sleep(1)
for x in article_urls:
print(x)
# 将返回的地址写入到文件
with open('article_urls.txt', 'w', encoding='utf-8') as f:
for record in article_urls:
f.write(record['title'] + " " + record['publish_time'] + " " + record['url'] + '\n')
# 关闭浏览器 # 关闭浏览器
driver.quit() driver.quit()

Loading…
Cancel
Save