parent
352d2d71a4
commit
af3f8098c4
@ -0,0 +1,45 @@
|
||||
from selenium import webdriver
|
||||
from selenium.webdriver.chrome.options import Options
|
||||
from selenium.webdriver.chrome.service import Service as ChromeService
|
||||
from selenium.webdriver.common.by import By
|
||||
|
||||
def _text_after_first_blank_line(text: str) -> str:
    """Return the part of *text* that follows its first blank line.

    A line counts as blank when it is empty or contains only whitespace.
    Every line after that first blank line is kept (later blank lines
    included) and terminated with a newline character. When *text* has no
    blank line at all it is returned unchanged.
    """
    lines = text.split('\n')
    for index, line in enumerate(lines):
        if line.strip() == "":
            # Only the first blank line acts as the separator; it is dropped
            # together with everything before it.
            return "".join(remaining + "\n" for remaining in lines[index + 1:])
    return text


def get_article_content(
    url: str,
    *,
    driver_path: str = r"C:\Windows\System32\chromedriver.exe",
) -> str:
    """Fetch the text content of a WeChat official-account article.

    The page is loaded in a headless Chrome session, and the visible text of
    the first element whose class is ``rich_media`` is read. Everything up to
    and including the first blank line of that text is discarded, and each
    pair of consecutive newlines is then replaced by a single newline (one
    pass only, so longer runs of newlines are shortened but not fully
    collapsed).

    :param url: URL of the article to load.
    :param driver_path: Path to the chromedriver executable. Defaults to the
        chromedriver.exe inside the Windows System32 directory; pass another
        path on machines where the driver lives elsewhere.
    :return: The extracted article text.
    :raises: Errors raised by Selenium (for example when the page has no
        ``rich_media`` element) propagate to the caller; the browser session
        is closed in either case.
    """
    options = Options()
    options.add_argument('-headless')
    service = ChromeService(executable_path=driver_path)
    driver = webdriver.Chrome(service=service, options=options)

    try:
        driver.get(url)
        page_text = driver.find_element(By.CLASS_NAME, "rich_media").text
        body_text = _text_after_first_blank_line(page_text)
        return body_text.replace("\n\n", "\n")
    finally:
        # Always shut the browser down, even when loading or lookup fails,
        # so no Chrome/chromedriver process is left behind.
        driver.quit()
|
||||
|
||||
if __name__ == '__main__':
    # Example usage: fetch one sample article and print its extracted text.
    sample_url = 'http://mp.weixin.qq.com/s?__biz=MzA3MjQ1Mjg2MQ==&mid=2651526302&idx=1&sn=41f941e481be6a7ccd26ad734c8d7a13&chksm=84e1ab0cb396221a59bce5c4ee842c9326968daf4aea1c7d24e55ed8879789c3ef34a7ce5ed1#rd'
    print(get_article_content(sample_url))
|
Loading…
Reference in new issue