# dsProject/dsLightRag/WxGzh/Util/WxGzhUtil.py

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service as ChromeService
from selenium.webdriver.common.by import By

# Default chromedriver location; callers on other machines can override it
# through the ``driver_path`` argument of get_article_content().
_DEFAULT_CHROMEDRIVER_PATH = r"C:\Windows\System32\chromedriver.exe"


def _text_after_first_blank_line(text):
    """Return the part of *text* that follows its first blank line.

    A line counts as blank when it is empty or whitespace-only. Every line
    after the first blank one is kept and terminated with a newline. When
    *text* contains no blank line at all, it is returned unchanged.
    """
    lines = text.split('\n')
    for index, line in enumerate(lines):
        if not line.strip():
            # Drop everything up to and including the first blank line.
            return "".join(kept + "\n" for kept in lines[index + 1:])
    return text


def get_article_content(url, driver_path=_DEFAULT_CHROMEDRIVER_PATH):
    """
    Fetch the text of a WeChat official-account article.

    The page is loaded in a headless Chrome session, the text of the element
    with CSS class ``rich_media`` is read, and everything up to and including
    the first blank line of that text is discarded.

    :param url: URL of the article.
    :param driver_path: Path of the chromedriver executable used to start
        Chrome. Defaults to the location this module has always used.
    :return: The article text, with each ``"\\n\\n"`` pair replaced by a
        single newline (one pass, so longer runs of newlines are shortened
        but not necessarily removed).
    :raises selenium.common.exceptions.NoSuchElementException: if the page
        has no element with class ``rich_media`` (raised by ``find_element``).
    """
    options = Options()
    options.add_argument('-headless')
    service = ChromeService(executable_path=driver_path)
    driver = webdriver.Chrome(service=service, options=options)

    try:
        driver.get(url)
        page_text = driver.find_element(By.CLASS_NAME, "rich_media").text

        # Keep only the text that follows the first blank line.
        body_text = _text_after_first_blank_line(page_text)
        return body_text.replace("\n\n", "\n")
    finally:
        # Always shut the browser down, even if loading or lookup failed.
        driver.quit()

if __name__ == '__main__':
    # Example usage: fetch one sample article and print its extracted text.
    sample_url = 'http://mp.weixin.qq.com/s?__biz=MzA3MjQ1Mjg2MQ==&mid=2651526302&idx=1&sn=41f941e481be6a7ccd26ad734c8d7a13&chksm=84e1ab0cb396221a59bce5c4ee842c9326968daf4aea1c7d24e55ed8879789c3ef34a7ce5ed1#rd'
    print(get_article_content(sample_url))
'commit' 1 week ago			`from selenium import webdriver`
			`from selenium.webdriver.chrome.options import Options`
			`from selenium.webdriver.chrome.service import Service as ChromeService`
			`from selenium.webdriver.common.by import By`

			`def get_article_content(url):`
			`"""`
			`获取微信公众号文章内容`
			`:param url: 文章URL`
			`:return: 文章内容文本`
			`"""`
			`options = Options()`
			`options.add_argument('-headless')`
			`service = ChromeService(executable_path=r"C:\Windows\System32\chromedriver.exe")`
			`driver = webdriver.Chrome(service=service, options=options)`

			`try:`
			`driver.get(url)`
			`html_content = driver.find_element(By.CLASS_NAME, "rich_media").text`

			`# 处理内容，提取空行后的文本`
			`lines = html_content.split('\n')`
			`content_after_empty_line = ""`
			`found_empty_line = False`

			`for line in lines:`
			`if not found_empty_line and line.strip() == "":`
			`found_empty_line = True`
			`continue`

			`if found_empty_line:`
			`content_after_empty_line += line + "\n"`

			`if not found_empty_line:`
			`content_after_empty_line = html_content`

			`return content_after_empty_line.replace("\n\n", "\n")`
			`finally:`
			`driver.quit()`

			`if __name__ == '__main__':`
			`# 示例用法`
			`url = 'http://mp.weixin.qq.com/s?__biz=MzA3MjQ1Mjg2MQ==&mid=2651526302&idx=1&sn=41f941e481be6a7ccd26ad734c8d7a13&chksm=84e1ab0cb396221a59bce5c4ee842c9326968daf4aea1c7d24e55ed8879789c3ef34a7ce5ed1#rd'`
			`content = get_article_content(url)`
			`print(content)`