You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.
from selenium import webdriver
from selenium . webdriver . chrome . options import Options
from selenium . webdriver . chrome . service import Service as ChromeService
from selenium . webdriver . common . by import By
# WeChat article to scrape.
url = 'http://mp.weixin.qq.com/s?__biz=MzA3MjQ1Mjg2MQ==&mid=2651526302&idx=1&sn=41f941e481be6a7ccd26ad734c8d7a13&chksm=84e1ab0cb396221a59bce5c4ee842c9326968daf4aea1c7d24e55ed8879789c3ef34a7ce5ed1#rd'


def extract_title(text: str) -> str:
    """Return the first line of *text*, which the script treats as the title."""
    return text.split('\n', 1)[0]


def extract_body(text: str) -> str:
    """Return the part of *text* that follows its first blank line.

    Lines are scanned in order; the first line that is empty or contains only
    whitespace marks the end of the header. Every line after it is kept and
    terminated with a newline. If no blank line exists, *text* is used
    unchanged. In both cases each non-overlapping pair of consecutive
    newlines in the result is then collapsed into a single newline (one pass).
    """
    lines = text.split('\n')
    for index, line in enumerate(lines):
        if not line.strip():
            # First blank line found: keep only what comes after it.
            body = "".join(f"{kept}\n" for kept in lines[index + 1:])
            break
    else:
        # No blank line at all: keep the original content.
        body = text
    return body.replace("\n\n", "\n")


def main() -> None:
    """Load the article in Chrome, then print its title and its body text."""
    options = Options()
    # Headless flag; comment it out while debugging to watch the browser.
    options.add_argument('-headless')
    service = ChromeService(executable_path=r"C:\Windows\System32\chromedriver.exe")
    driver = webdriver.Chrome(service=service, options=options)
    try:
        driver.get(url)
        # Only the rendered text of the article container is needed.
        html_content = driver.find_element(By.CLASS_NAME, "rich_media").text
    finally:
        # Always close the browser, even if loading or lookup failed.
        driver.quit()

    print(extract_title(html_content))
    print(extract_body(html_content))


if __name__ == "__main__":
    main()