'commit'

1 week ago · af3f8098c4
parent 352d2d71a4
commit af3f8098c4
3 changed files with 45 additions and 41 deletions
--- a/dsLightRag/WxGzh/T3_GetArticle.py
+++ b/dsLightRag/WxGzh/T3_GetArticle.py
@@ -1,41 +0,0 @@
-from selenium import webdriver
-from selenium.webdriver.chrome.options import Options
-from selenium.webdriver.chrome.service import Service as ChromeService
-from selenium.webdriver.common.by import By
-
-url = 'http://mp.weixin.qq.com/s?__biz=MzA3MjQ1Mjg2MQ==&mid=2651526302&idx=1&sn=41f941e481be6a7ccd26ad734c8d7a13&chksm=84e1ab0cb396221a59bce5c4ee842c9326968daf4aea1c7d24e55ed8879789c3ef34a7ce5ed1#rd'
-
-options = Options()
-options.add_argument('-headless')  # headless flag; can be commented out while debugging
-service = ChromeService(executable_path=r"C:\Windows\System32\chromedriver.exe")
-driver = webdriver.Chrome(service=service, options=options)
-driver.get(url)
-# Plain text is enough: take the text of the "rich_media" element
-html_content = driver.find_element(By.CLASS_NAME, "rich_media").text
-# The first line is the title; split it off and print it
-title = html_content.split('\n')[0]
-print(title)
-
-# Walk html_content line by line; once a blank line is found, drop everything before it and keep only what follows
-lines = html_content.split('\n')
-content_after_empty_line = ""
-found_empty_line = False
-
-for line in lines:
-    if not found_empty_line and line.strip() == "":
-        # Found the first blank line; skip it and start collecting from the next line
-        found_empty_line = True
-        continue
-
-    if found_empty_line:
-        # Lines after the first blank line are appended to the result
-        content_after_empty_line += line + "\n"
-
-# If no blank line was found, keep the original content
-if not found_empty_line:
-    content_after_empty_line = html_content
-
-content_after_empty_line = content_after_empty_line.replace("\n\n", "\n")
-print(content_after_empty_line)
-# Close the browser
-driver.quit()
--- a/dsLightRag/WxGzh/Util/WxGzhUtil.py
+++ b/dsLightRag/WxGzh/Util/WxGzhUtil.py
@@ -0,0 +1,45 @@
+from selenium import webdriver
+from selenium.webdriver.chrome.options import Options
+from selenium.webdriver.chrome.service import Service as ChromeService
+from selenium.webdriver.common.by import By
+
+def get_article_content(url, driver_path=r"C:\Windows\System32\chromedriver.exe"):
+    """
+    Fetch the text of a WeChat official-account article.
+
+    Opens the page in headless Chrome, reads the text of the element with
+    class ``rich_media`` and returns it after trimming: everything up to and
+    including the first blank line is dropped (the text is kept whole when
+    there is no blank line), then doubled newlines are squeezed in one pass.
+
+    :param url: article URL
+    :param driver_path: path to the chromedriver executable; the default is
+        the location this function previously hard-coded
+    :return: article text
+    """
+    options = Options()
+    options.add_argument('-headless')
+    service = ChromeService(executable_path=driver_path)
+    driver = webdriver.Chrome(service=service, options=options)
+
+    try:
+        driver.get(url)
+        page_text = driver.find_element(By.CLASS_NAME, "rich_media").text
+        return _trim_article_text(page_text)
+    finally:
+        # Always shut the browser down, even if loading or the lookup fails.
+        driver.quit()
+
+
+def _trim_article_text(text):
+    """Drop everything up to the first blank line, then squeeze doubled newlines."""
+    lines = text.split('\n')
+    for index, line in enumerate(lines):
+        if not line.strip():
+            # Keep only what follows the first blank line, one "\n" per line.
+            body = "".join(rest + "\n" for rest in lines[index + 1:])
+            break
+    else:
+        # No blank line anywhere: keep the text unchanged.
+        body = text
+    # Single pass only: each "\n\n" pair becomes "\n".
+    return body.replace("\n\n", "\n")
+
+if __name__ == '__main__':
+    # Example usage: fetch one sample article and print its text.
+    sample_url = 'http://mp.weixin.qq.com/s?__biz=MzA3MjQ1Mjg2MQ==&mid=2651526302&idx=1&sn=41f941e481be6a7ccd26ad734c8d7a13&chksm=84e1ab0cb396221a59bce5c4ee842c9326968daf4aea1c7d24e55ed8879789c3ef34a7ce5ed1#rd'
+    print(get_article_content(sample_url))
--- a/dsLightRag/WxGzh/Util/__init__.py
+++ b/dsLightRag/WxGzh/Util/__init__.py