main
HuangHai 1 week ago
parent 352d2d71a4
commit af3f8098c4

@ -1,41 +0,0 @@
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service as ChromeService
from selenium.webdriver.common.by import By
# Standalone script: open a WeChat official-account article in headless
# Chrome, print the first line of the article container's text, then print
# the text that follows the first blank line.
url = 'http://mp.weixin.qq.com/s?__biz=MzA3MjQ1Mjg2MQ==&mid=2651526302&idx=1&sn=41f941e481be6a7ccd26ad734c8d7a13&chksm=84e1ab0cb396221a59bce5c4ee842c9326968daf4aea1c7d24e55ed8879789c3ef34a7ce5ed1#rd'
options = Options()
options.add_argument('-headless')  # headless flag; comment out while debugging
service = ChromeService(executable_path=r"C:\Windows\System32\chromedriver.exe")
driver = webdriver.Chrome(service=service, options=options)
try:
    driver.get(url)
    # Only the rendered text of the article container is needed.
    html_content = driver.find_element(By.CLASS_NAME, "rich_media").text
finally:
    # Always shut the browser down, even if loading or lookup failed;
    # otherwise the Chrome process is left running.
    driver.quit()

lines = html_content.split('\n')

# The first line is treated as the title.
title = lines[0]
print(title)

# Drop everything up to and including the first blank (whitespace-only)
# line and keep what follows. If there is no blank line, keep the text as is.
first_blank = next(
    (i for i, line in enumerate(lines) if line.strip() == ""), None
)
if first_blank is None:
    content_after_empty_line = html_content
else:
    content_after_empty_line = "".join(
        line + "\n" for line in lines[first_blank + 1:]
    )

# A single str.replace pass only halves runs of newlines (three in a row
# still leave an empty line), so repeat until no empty line remains.
while "\n\n" in content_after_empty_line:
    content_after_empty_line = content_after_empty_line.replace("\n\n", "\n")
print(content_after_empty_line)

@ -0,0 +1,45 @@
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service as ChromeService
from selenium.webdriver.common.by import By
def _extract_body_text(text):
    """Return the part of *text* that follows its first blank line.

    A line counts as blank when it is empty or whitespace-only. Every kept
    line is terminated with a newline. When *text* has no blank line it is
    used unchanged. In both cases runs of consecutive newlines are collapsed
    to a single newline.
    """
    lines = text.split('\n')
    first_blank = next(
        (i for i, line in enumerate(lines) if line.strip() == ""), None
    )
    if first_blank is None:
        body = text
    else:
        body = "".join(line + "\n" for line in lines[first_blank + 1:])
    # A single str.replace pass only halves runs of newlines (three in a row
    # still leave an empty line), so repeat until no empty line remains.
    while "\n\n" in body:
        body = body.replace("\n\n", "\n")
    return body


def get_article_content(url, driver_path=r"C:\Windows\System32\chromedriver.exe"):
    """Fetch the text of a WeChat official-account article.

    Opens *url* in headless Chrome, reads the rendered text of the element
    with class ``rich_media`` and returns the portion after the first blank
    line (presumably the title/header block precedes it), with empty lines
    removed.

    :param url: article URL
    :param driver_path: path to the chromedriver executable; defaults to the
        location this script originally hard-coded
    :return: article body text
    """
    options = Options()
    options.add_argument('-headless')
    service = ChromeService(executable_path=driver_path)
    driver = webdriver.Chrome(service=service, options=options)
    try:
        driver.get(url)
        page_text = driver.find_element(By.CLASS_NAME, "rich_media").text
    finally:
        # Always shut the browser down, even if loading or lookup failed.
        driver.quit()
    return _extract_body_text(page_text)
if __name__ == '__main__':
    # Example usage: fetch one article and print its extracted text.
    url = 'http://mp.weixin.qq.com/s?__biz=MzA3MjQ1Mjg2MQ==&mid=2651526302&idx=1&sn=41f941e481be6a7ccd26ad734c8d7a13&chksm=84e1ab0cb396221a59bce5c4ee842c9326968daf4aea1c7d24e55ed8879789c3ef34a7ce5ed1#rd'
    print(get_article_content(url))
Loading…
Cancel
Save