You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
47 lines
1.6 KiB
47 lines
1.6 KiB
from selenium import webdriver
|
|
from selenium.webdriver.chrome.options import Options
|
|
from selenium.webdriver.chrome.service import Service as ChromeService
|
|
from selenium.webdriver.common.by import By
|
|
|
|
def init_wechat_browser(driver_path=r"C:\Windows\System32\chromedriver.exe"):
    """Create a headless Chrome WebDriver for the WeChat article scraper.

    :param driver_path: Path to the chromedriver executable. The default is
        the Windows location this function has always used, so existing
        zero-argument callers behave exactly as before.
    :return: A new ``webdriver.Chrome`` instance. The caller owns it and
        should call ``quit()`` when finished.
    """
    options = Options()
    # NOTE(review): the switch is passed with a single dash; Chrome's
    # documented spelling is "--headless" - confirm the installed Chrome
    # honours this form.
    options.add_argument('-headless')
    service = ChromeService(executable_path=driver_path)
    return webdriver.Chrome(service=service, options=options)
|
|
|
|
|
|
def get_article_content(url):
    """Fetch the text of a WeChat official-account article.

    Opens ``url`` in a headless Chrome session, reads the rendered text of
    the first element whose class is ``rich_media`` and returns the part
    that follows the first blank line (presumably the title/author header
    sits above that line - verify against real pages). When the text has no
    blank line at all, the whole text is used.

    Exceptions raised by Selenium while loading the page or locating the
    element propagate to the caller; the browser is closed in every case.

    :param url: Article URL.
    :return: The extracted article text.
    """
    # Reuse the shared browser factory instead of repeating its set-up here.
    driver = init_wechat_browser()
    try:
        driver.get(url)
        page_text = driver.find_element(By.CLASS_NAME, "rich_media").text
    finally:
        driver.quit()

    body = _text_after_first_blank_line(page_text)
    # A single replace pass turns each pair of newlines into one; runs of
    # three or more newlines therefore still leave a blank line behind.
    return body.replace("\n\n", "\n")


def _text_after_first_blank_line(text):
    """Return the lines after the first blank line, each ending in a newline.

    A line counts as blank when it is empty or whitespace-only. If ``text``
    contains no blank line it is returned unchanged.
    """
    lines = text.split('\n')
    for index, line in enumerate(lines):
        if not line.strip():
            return "".join(f"{rest}\n" for rest in lines[index + 1:])
    return text