import datetime
import random

import requests
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service as ChromeService
from selenium.webdriver.common.by import By

# Location of the chromedriver binary used when no explicit path is given.
CHROMEDRIVER_PATH = r"C:\Windows\System32\chromedriver.exe"

# Seconds to wait for the WeChat backend before giving up on a request.
# Without a timeout, requests.get() may block forever on a stalled connection.
REQUEST_TIMEOUT = 10


def init_wechat_browser(driver_path=CHROMEDRIVER_PATH):
    """Create a headless Chrome WebDriver instance for the WeChat crawler.

    :param driver_path: path to the chromedriver executable; defaults to
        ``CHROMEDRIVER_PATH``.
    :return: a new ``webdriver.Chrome`` instance. The caller is responsible
        for calling ``quit()`` on it.
    """
    options = Options()
    options.add_argument('-headless')
    service = ChromeService(executable_path=driver_path)
    return webdriver.Chrome(service=service, options=options)


def _format_publish_time(timestamp):
    """Format an epoch timestamp as ``YYYY-mm-dd HH:MM:SS`` in local time.

    Returns ``None`` when the timestamp is missing, so that one incomplete
    entry does not abort the whole listing.
    """
    if timestamp is None:
        return None
    moment = datetime.datetime.fromtimestamp(int(timestamp))
    return moment.strftime('%Y-%m-%d %H:%M:%S')


def get_wechat_articles(account_name, account_id, token, cookies, header):
    """Fetch the latest articles of a WeChat official account.

    The account is first looked up by name through the ``searchbiz``
    endpoint; the ``fakeid`` of the first hit is then used to query the
    ``appmsg`` endpoint for up to five articles.

    :param account_name: official account name used as the search query.
    :param account_id: accepted for interface compatibility; not used by
        this function.
    :param token: session token sent with both requests.
    :param cookies: cookies passed to ``requests.get``.
    :param header: HTTP headers passed to ``requests.get``.
    :return: list of dicts with the keys ``title``, ``url`` and
        ``publish_time``. The list is empty when the search returns no
        account or the article response carries no ``app_msg_list``.
    :raises requests.RequestException: on network failure or timeout.
    """
    # Endpoint for searching official accounts.
    search_url = 'https://mp.weixin.qq.com/cgi-bin/searchbiz?'
    search_params = {
        'action': 'search_biz',
        'token': token,
        'lang': 'zh_CN',
        'f': 'json',
        'ajax': '1',
        'random': random.random(),
        'query': account_name,
        'begin': '0',
        'count': '5',
    }
    search_response = requests.get(
        search_url,
        cookies=cookies,
        headers=header,
        params=search_params,
        timeout=REQUEST_TIMEOUT,
    )
    # 'list' may be absent or empty (e.g. no matching account); previously
    # this crashed with TypeError / IndexError.
    accounts = search_response.json().get('list') or []
    if not accounts:
        return []
    fakeid = accounts[0].get('fakeid')

    # Endpoint for listing the articles of an official account.
    appmsg_url = 'https://mp.weixin.qq.com/cgi-bin/appmsg?'
    appmsg_params = {
        'token': token,
        'lang': 'zh_CN',
        'f': 'json',
        'ajax': '1',
        'random': random.random(),
        'action': 'list_ex',
        'begin': '0',
        'count': '5',
        'query': '',
        'fakeid': fakeid,
        'type': '9',
    }
    appmsg_response = requests.get(
        appmsg_url,
        cookies=cookies,
        headers=header,
        params=appmsg_params,
        timeout=REQUEST_TIMEOUT,
    )
    # 'app_msg_list' may be missing from the response; treat that as
    # "no articles" instead of iterating over None.
    entries = appmsg_response.json().get('app_msg_list') or []

    return [
        {
            'title': entry.get('title'),
            'url': entry.get('link'),
            'publish_time': _format_publish_time(entry.get("update_time")),
        }
        for entry in entries
    ]


def _text_after_first_blank_line(text):
    """Return the part of *text* that follows its first blank line.

    Every kept line is terminated with a newline. When *text* contains no
    blank line it is used unchanged. In both cases each doubled newline is
    then collapsed into a single one.
    """
    lines = text.split('\n')
    for index, line in enumerate(lines):
        if line.strip() == "":
            body = "".join(kept + "\n" for kept in lines[index + 1:])
            break
    else:
        body = text
    return body.replace("\n\n", "\n")


def get_article_content(url):
    """Fetch the text content of a WeChat official account article.

    The page is loaded in a headless Chrome instance and the text of the
    element with class ``rich_media`` is read. Only the text after the first
    blank line is returned; if there is no blank line the full text is
    returned.

    :param url: article URL.
    :return: article text.

    Exceptions raised by selenium while loading the page or locating the
    element propagate to the caller; the browser is always shut down.
    """
    # Reuse the shared factory instead of repeating the driver setup.
    driver = init_wechat_browser()
    try:
        driver.get(url)
        page_text = driver.find_element(By.CLASS_NAME, "rich_media").text
        return _text_after_first_blank_line(page_text)
    finally:
        driver.quit()