# dsProject/dsLightRag/Util/WxGzhUtil.py

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service as ChromeService
from selenium.webdriver.common.by import By

def init_wechat_browser(
    driver_path: str = r"C:\Windows\System32\chromedriver.exe",
    *,
    headless: bool = True,
) -> "webdriver.Chrome":
    """Create the Chrome WebDriver instance used by the WeChat scraper.

    :param driver_path: Path to the chromedriver executable. The default is
        the location this module has always used, so calling the function
        without arguments behaves exactly as before.
    :param headless: When true (the default), the ``-headless`` argument is
        passed to Chrome.
    :return: A new ``webdriver.Chrome`` instance. This function does not close
        it; the caller is expected to call ``quit()`` when finished.
    """
    options = Options()
    if headless:
        options.add_argument('-headless')
    service = ChromeService(executable_path=driver_path)
    return webdriver.Chrome(service=service, options=options)


def _text_after_first_blank_line(text: str) -> str:
    """Return the part of *text* that follows its first blank line.

    A line counts as blank when it is empty or whitespace-only. Every line
    after that first blank line is kept and terminated with a newline. If
    *text* contains no blank line at all, it is returned unchanged.
    """
    lines = text.split('\n')
    for index, line in enumerate(lines):
        if line.strip() == "":
            # Only the first blank line acts as the separator; later blank
            # lines are ordinary content and are kept here.
            return "".join(f"{kept}\n" for kept in lines[index + 1:])
    return text


def get_article_content(url: str) -> str:
    """Fetch the text content of a WeChat official-account article.

    The page is loaded in a browser created by ``init_wechat_browser``, the
    text of the element with class ``rich_media`` is read, everything up to
    and including the first blank line is dropped, and each ``"\\n\\n"`` pair
    is then replaced by a single newline.

    :param url: URL of the article.
    :return: The article text after the first blank line, or the whole
        element text when it contains no blank line.
    :raises: Any exception raised by Selenium while loading the page or
        locating the ``rich_media`` element propagates to the caller; the
        browser is closed in either case.
    """
    # Reuse the module's browser factory rather than duplicating its setup.
    driver = init_wechat_browser()
    try:
        driver.get(url)
        page_text = driver.find_element(By.CLASS_NAME, "rich_media").text
        # NOTE(review): a single replace pass leaves one empty line wherever
        # the text had two or more consecutive empty lines; kept as-is to
        # preserve the existing output.
        return _text_after_first_blank_line(page_text).replace("\n\n", "\n")
    finally:
        # Always shut the browser down, even when scraping fails.
        driver.quit()