# dsProject/dsLightRag/Util/WxGzhUtil.py

import datetime
import random
import requests
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service as ChromeService
from selenium.webdriver.common.by import By

def init_wechat_browser():
    """Create and return a headless Chrome WebDriver instance.

    The driver is launched through the chromedriver executable at the fixed
    path configured below. The caller owns the returned driver and is
    responsible for calling ``quit()`` on it.

    :return: a ``webdriver.Chrome`` instance started with the headless flag.
    """
    chrome_options = Options()
    # Run without a visible browser window.
    chrome_options.add_argument('-headless')
    driver_service = ChromeService(
        executable_path=r"C:\Windows\System32\chromedriver.exe",
    )
    browser = webdriver.Chrome(service=driver_service, options=chrome_options)
    return browser

def get_wechat_articles(account_name, account_id, token, cookies, header, timeout=10):
    """Return the latest articles of a WeChat official account.

    Two requests are sent to mp.weixin.qq.com: ``searchbiz`` looks the account
    up by name, and the ``fakeid`` of the first search hit is then passed to
    ``appmsg`` to list that account's articles (up to 5 per the ``count``
    parameter).

    :param account_name: account name used as the search query.
    :param account_id: accepted for interface compatibility; not used here.
    :param token: value sent as the ``token`` query parameter on both requests.
    :param cookies: cookies forwarded to ``requests.get``.
    :param header: HTTP headers forwarded to ``requests.get``.
    :param timeout: seconds forwarded to ``requests.get`` so that a stalled
        connection cannot block the caller indefinitely.
    :return: list of dicts with the keys ``title``, ``url`` and
        ``publish_time``. ``publish_time`` is the item's ``update_time``
        formatted as ``'%Y-%m-%d %H:%M:%S'`` in local time, or ``None`` when
        the item carries no ``update_time``. The list is empty when the search
        returns no account or the article response has no ``app_msg_list``.
    """
    # Account search endpoint.
    search_url = 'https://mp.weixin.qq.com/cgi-bin/searchbiz?'
    search_params = {
        'action': 'search_biz',
        'token': token,
        'lang': 'zh_CN',
        'f': 'json',
        'ajax': '1',
        'random': random.random(),
        'query': account_name,
        'begin': '0',
        'count': '5',
    }
    search_response = requests.get(
        search_url,
        cookies=cookies,
        headers=header,
        params=search_params,
        timeout=timeout,
    )

    # NOTE(review): a response without a usable 'list' (no match, or possibly
    # an API-level error such as an expired token -- confirm) is reported as
    # "no articles" instead of failing on the [0] lookup.
    accounts = search_response.json().get('list') or []
    if not accounts:
        return []
    fakeid = accounts[0].get('fakeid')

    # Article listing endpoint for the account found above.
    appmsg_url = 'https://mp.weixin.qq.com/cgi-bin/appmsg?'
    listing_params = {
        'token': token,
        'lang': 'zh_CN',
        'f': 'json',
        'ajax': '1',
        'random': random.random(),
        'action': 'list_ex',
        'begin': '0',
        'count': '5',
        'query': '',
        'fakeid': fakeid,
        'type': '9',
    }
    listing_response = requests.get(
        appmsg_url,
        cookies=cookies,
        headers=header,
        params=listing_params,
        timeout=timeout,
    )
    # A missing 'app_msg_list' is treated as an empty listing.
    app_msgs = listing_response.json().get('app_msg_list') or []

    articles = []
    for item in app_msgs:
        update_time = item.get('update_time')
        publish_time = None
        if update_time is not None:
            publish_time = datetime.datetime.fromtimestamp(
                int(update_time)
            ).strftime('%Y-%m-%d %H:%M:%S')
        articles.append({
            'title': item.get('title'),
            'url': item.get('link'),
            'publish_time': publish_time,
        })

    return articles

def _text_after_first_blank_line(text):
    """Return the part of *text* that follows its first blank line.

    Every line after the first whitespace-only line is kept and terminated
    with a newline. When *text* has no blank line it is used unchanged.
    Finally each ``"\\n\\n"`` pair is replaced by a single ``"\\n"`` (one pass).
    """
    lines = text.split('\n')
    first_blank = next(
        (index for index, line in enumerate(lines) if line.strip() == ""),
        None,
    )
    if first_blank is None:
        body = text
    else:
        body = "".join(line + "\n" for line in lines[first_blank + 1:])
    return body.replace("\n\n", "\n")


def get_article_content(url):
    """Fetch the text of a WeChat official-account article.

    The page is loaded in a headless Chrome instance created by
    ``init_wechat_browser`` and the visible text of the element with class
    ``rich_media`` is read. Everything up to and including the first blank
    line of that text is dropped before the result is returned.

    The browser is always closed, including when loading the page or locating
    the element raises; such exceptions propagate to the caller.

    :param url: article URL.
    :return: article text as a string.
    """
    # Reuse the shared browser factory instead of repeating its setup here.
    driver = init_wechat_browser()

    try:
        driver.get(url)
        page_text = driver.find_element(By.CLASS_NAME, "rich_media").text
        return _text_after_first_blank_line(page_text)
    finally:
        driver.quit()
'commit' 1 week ago			`import datetime`
			`import random`
			`import requests`
			`from selenium import webdriver`
			`from selenium.webdriver.chrome.options import Options`
			`from selenium.webdriver.chrome.service import Service as ChromeService`
			`from selenium.webdriver.common.by import By`

			`def init_wechat_browser():`
			`"""初始化微信爬虫浏览器实例"""`
			`options = Options()`
			`options.add_argument('-headless')`
			`service = ChromeService(executable_path=r"C:\Windows\System32\chromedriver.exe")`
			`return webdriver.Chrome(service=service, options=options)`

			`def get_wechat_articles(account_name, account_id, token, cookies, header):`
			`"""获取指定公众号的文章列表"""`
			`article_urls = []`

			`# 搜索微信公众号的接口地址`
			`search_url = 'https://mp.weixin.qq.com/cgi-bin/searchbiz?'`
			`query_id = {`
			`'action': 'search_biz',`
			`'token': token,`
			`'lang': 'zh_CN',`
			`'f': 'json',`
			`'ajax': '1',`
			`'random': random.random(),`
			`'query': account_name,`
			`'begin': '0',`
			`'count': '5'`
			`}`

			`# 完整实现搜索和获取文章逻辑`
			`search_response = requests.get(search_url, cookies=cookies, headers=header, params=query_id)`
			`lists = search_response.json().get('list')[0]`
			`fakeid = lists.get('fakeid')`

			`# 微信公众号文章接口`
			`appmsg_url = 'https://mp.weixin.qq.com/cgi-bin/appmsg?'`
			`query_id_data = {`
			`'token': token,`
			`'lang': 'zh_CN',`
			`'f': 'json',`
			`'ajax': '1',`
			`'random': random.random(),`
			`'action': 'list_ex',`
			`'begin': '0',`
			`'count': '5',`
			`'query': '',`
			`'fakeid': fakeid,`
			`'type': '9'`
			`}`

			`query_fakeid_response = requests.get(appmsg_url, cookies=cookies, headers=header, params=query_id_data)`
			`fakeid_list = query_fakeid_response.json().get('app_msg_list')`

			`for item in fakeid_list:`
			`article_urls.append({`
			`'title': item.get('title'),`
			`'url': item.get('link'),`
			`'publish_time': datetime.datetime.fromtimestamp(int(item.get("update_time"))).strftime('%Y-%m-%d %H:%M:%S')`
			`})`

			`return article_urls`

			`def get_article_content(url):`
			`"""`
			`获取微信公众号文章内容`
			`:param url: 文章URL`
			`:return: 文章内容文本`
			`"""`
			`options = Options()`
			`options.add_argument('-headless')`
			`service = ChromeService(executable_path=r"C:\Windows\System32\chromedriver.exe")`
			`driver = webdriver.Chrome(service=service, options=options)`

			`try:`
			`driver.get(url)`
			`html_content = driver.find_element(By.CLASS_NAME, "rich_media").text`

			`# 处理内容，提取空行后的文本`
			`lines = html_content.split('\n')`
			`content_after_empty_line = ""`
			`found_empty_line = False`

			`for line in lines:`
			`if not found_empty_line and line.strip() == "":`
			`found_empty_line = True`
			`continue`

			`if found_empty_line:`
			`content_after_empty_line += line + "\n"`

			`if not found_empty_line:`
			`content_after_empty_line = html_content`

			`return content_after_empty_line.replace("\n\n", "\n")`
			`finally:`
			`driver.quit()`