From af3f8098c4567cd48bf630a6c535fe4d313a2576 Mon Sep 17 00:00:00 2001 From: HuangHai <10402852@qq.com> Date: Tue, 15 Jul 2025 11:04:40 +0800 Subject: [PATCH] 'commit' --- dsLightRag/WxGzh/T3_GetArticle.py | 41 --------------------------- dsLightRag/WxGzh/Util/WxGzhUtil.py | 45 ++++++++++++++++++++++++++++++ dsLightRag/WxGzh/Util/__init__.py | 0 3 files changed, 45 insertions(+), 41 deletions(-) delete mode 100644 dsLightRag/WxGzh/T3_GetArticle.py create mode 100644 dsLightRag/WxGzh/Util/WxGzhUtil.py create mode 100644 dsLightRag/WxGzh/Util/__init__.py diff --git a/dsLightRag/WxGzh/T3_GetArticle.py b/dsLightRag/WxGzh/T3_GetArticle.py deleted file mode 100644 index 2f9c9e27..00000000 --- a/dsLightRag/WxGzh/T3_GetArticle.py +++ /dev/null @@ -1,41 +0,0 @@ -from selenium import webdriver -from selenium.webdriver.chrome.options import Options -from selenium.webdriver.chrome.service import Service as ChromeService -from selenium.webdriver.common.by import By - -url = 'http://mp.weixin.qq.com/s?__biz=MzA3MjQ1Mjg2MQ==&mid=2651526302&idx=1&sn=41f941e481be6a7ccd26ad734c8d7a13&chksm=84e1ab0cb396221a59bce5c4ee842c9326968daf4aea1c7d24e55ed8879789c3ef34a7ce5ed1#rd' - -options = Options() -options.add_argument('-headless') # 无头参数,调试时可以注释掉 -service = ChromeService(executable_path=r"C:\Windows\System32\chromedriver.exe") -driver = webdriver.Chrome(service=service, options=options) -driver.get(url) -# 可以只要txt -html_content = driver.find_element(By.CLASS_NAME, "rich_media").text -# 第一行是标题,分离出来 -title = html_content.split('\n')[0] -print(title) - -# 按行遍历html_content,当发现空行时,删除空行前面的内容,只保留后面的内容 -lines = html_content.split('\n') -content_after_empty_line = "" -found_empty_line = False - -for line in lines: - if not found_empty_line and line.strip() == "": - # 找到第一个空行 - found_empty_line = True - continue - - if found_empty_line: - # 空行后的内容添加到结果中 - content_after_empty_line += line + "\n" - -# 如果没有找到空行,保留原始内容 -if not found_empty_line: - content_after_empty_line = html_content - -content_after_empty_line = content_after_empty_line.replace("\n\n", "\n") -print(content_after_empty_line) -# 关闭浏览器 -driver.quit() diff --git a/dsLightRag/WxGzh/Util/WxGzhUtil.py b/dsLightRag/WxGzh/Util/WxGzhUtil.py new file mode 100644 index 00000000..6b6d55b1 --- /dev/null +++ b/dsLightRag/WxGzh/Util/WxGzhUtil.py @@ -0,0 +1,45 @@ +from selenium import webdriver +from selenium.webdriver.chrome.options import Options +from selenium.webdriver.chrome.service import Service as ChromeService +from selenium.webdriver.common.by import By + +def get_article_content(url): + """ + 获取微信公众号文章内容 + :param url: 文章URL + :return: 文章内容文本 + """ + options = Options() + options.add_argument('-headless') + service = ChromeService(executable_path=r"C:\Windows\System32\chromedriver.exe") + driver = webdriver.Chrome(service=service, options=options) + + try: + driver.get(url) + html_content = driver.find_element(By.CLASS_NAME, "rich_media").text + + # 处理内容,提取空行后的文本 + lines = html_content.split('\n') + content_after_empty_line = "" + found_empty_line = False + + for line in lines: + if not found_empty_line and line.strip() == "": + found_empty_line = True + continue + + if found_empty_line: + content_after_empty_line += line + "\n" + + if not found_empty_line: + content_after_empty_line = html_content + + return content_after_empty_line.replace("\n\n", "\n") + finally: + driver.quit() + +if __name__ == '__main__': + # 示例用法 + url = 'http://mp.weixin.qq.com/s?__biz=MzA3MjQ1Mjg2MQ==&mid=2651526302&idx=1&sn=41f941e481be6a7ccd26ad734c8d7a13&chksm=84e1ab0cb396221a59bce5c4ee842c9326968daf4aea1c7d24e55ed8879789c3ef34a7ce5ed1#rd' + content = get_article_content(url) + print(content) diff --git a/dsLightRag/WxGzh/Util/__init__.py b/dsLightRag/WxGzh/Util/__init__.py new file mode 100644 index 00000000..e69de29b