|
|
@ -7,20 +7,10 @@ import datetime
|
|
|
|
import logging
|
|
|
|
import logging
|
|
|
|
import random
|
|
|
|
import random
|
|
|
|
import re
|
|
|
|
import re
|
|
|
|
|
|
|
|
import os
|
|
|
|
|
|
|
|
|
|
|
|
import requests
|
|
|
|
import requests
|
|
|
|
|
|
|
|
|
|
|
|
# 1、安装Firefox软件【最新】
|
|
|
|
|
|
|
|
# https://www.firefox.com.cn/download/#product-desktop-release
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# 2、下载geckodriver驱动【最新】
|
|
|
|
|
|
|
|
# https://splinter-docs-zh-cn.readthedocs.io/zh/latest/drivers/firefox.html
|
|
|
|
|
|
|
|
# https://github.com/mozilla/geckodriver/releases
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# 3、Python爬虫实战系列:微信公众号文章爬取的5种技术方案总结及代码示例!
|
|
|
|
|
|
|
|
# 方案5:微信公众号后台引用链接方式爬取
|
|
|
|
|
|
|
|
# https://blog.csdn.net/Python_trys/article/details/146506009
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
"""
|
|
|
|
"""
|
|
|
|
# 查看selenium版本
|
|
|
|
# 查看selenium版本
|
|
|
|
pip show selenium
|
|
|
|
pip show selenium
|
|
|
@ -43,17 +33,25 @@ import json
|
|
|
|
if __name__ == '__main__':
|
|
|
|
if __name__ == '__main__':
|
|
|
|
# Cookie store for later requests calls; starts empty.
cookies = {}

# User-Agent of the WeChat desktop embedded browser. Defined once so the
# requests headers and the Chrome option below cannot drift apart.
wechat_ua = (
    "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) "
    "Chrome/53.0.2785.116 Safari/537.36 QBCore/4.0.1301.400 QQBrowser/9.0.2524.400 "
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) "
    "Chrome/53.0.2875.116 Safari/537.36 NetType/WIFI "
    "MicroMessenger/7.0.20.1781(0x6700143B) WindowsWechat(0x63010200)"
)

# Request headers presenting the client as the WeChat embedded browser.
header = {
    "HOST": "mp.weixin.qq.com",
    "User-Agent": wechat_ua,
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
    # "br" is deliberately not advertised: requests can only decode Brotli
    # when an optional brotli package is installed, otherwise the body
    # would come back undecoded.
    "Accept-Encoding": "gzip, deflate",
    # Each language range carries at most one q weight.
    "Accept-Language": "zh-CN,zh;q=0.8,en-US;q=0.6,en;q=0.4",
    "Connection": "keep-alive",
}

# Start Chrome through webdriver.
logging.info("启动浏览器,打开微信公众号登录界面")
options = Options()
# Headless mode is intentionally left off here; add the '-headless'
# argument to the options to run without a visible window.
options.add_argument(f'--user-agent={wechat_ua}')

service = ChromeService(executable_path=r"C:\Windows\System32\chromedriver.exe")
driver = webdriver.Chrome(service=service, options=options)
# Open the WeChat Official Accounts login page.
driver.get('https://mp.weixin.qq.com/')
|
|
|
|
# 等待5秒钟
|
|
|
|
# 等待5秒钟
|
|
|
@ -143,10 +141,130 @@ if __name__ == '__main__':
|
|
|
|
'publish_time': datetime.datetime.fromtimestamp(int(item.get("update_time"))).strftime('%Y-%m-%d %H:%M:%S'),
|
|
|
|
'publish_time': datetime.datetime.fromtimestamp(int(item.get("update_time"))).strftime('%Y-%m-%d %H:%M:%S'),
|
|
|
|
'collection_time': datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
|
|
|
|
'collection_time': datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
print("new_article:", new_article)
|
|
|
|
# logging applies %-formatting lazily, so the record needs a placeholder;
# passing the dict as a bare extra argument makes the handler report a
# formatting error instead of emitting the message.
logging.info("new_article: %s", new_article)
|
|
|
|
article_urls.append(item.get('link'))
|
|
|
|
article_urls.append(item.get('link'))
|
|
|
|
time.sleep(1)
|
|
|
|
time.sleep(1)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Make sure the output directory exists before any article is written.
logs_dir = "./Test/Logs"
os.makedirs(logs_dir, exist_ok=True)

# Headers imitating the WeChat Android in-app browser. Built once and reused
# for both the Selenium session and the requests fallback.
wechat_headers = {
    "User-Agent": "Mozilla/5.0 (Linux; Android 10; MI 8 Build/QKQ1.190828.002; wv) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/83.0.4103.101 Mobile Safari/537.36 XWEB/1768 MMWEBSDK/20210302 MMWEBID/6253 MicroMessenger/8.0.2.1860(0x28000234) Process/toolsmp WeChat/arm64 Weixin NetType/WIFI Language/zh_CN ABI/arm64",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
    "Accept-Encoding": "gzip, deflate",
    "Accept-Language": "zh-CN,zh;q=0.9,en-US;q=0.8,en;q=0.7",
    "X-Requested-With": "com.tencent.mm",
    "Referer": "https://mp.weixin.qq.com/",
}

# XPath candidates, tried in order, for the article title and body.
title_xpaths = [
    '//h1[@class="rich_media_title"]',
    '//h1[@id="activity-name"]',
    '//h2[@class="rich_media_title"]',
    '//div[@class="rich_media_content"]//h1',
    '//div[@id="js_article"]//h1',
]
content_xpaths = [
    '//div[@class="rich_media_content"]',
    '//div[@id="js_content"]',
    '//div[@class="rich_media_wrp"]',
]

# Phrases shown by the "open this link inside WeChat" interstitial page.
wechat_only_markers = ("请在微信客户端中打开链接", "请在微信中打开此链接")


def _article_key(url):
    """Return the first 10 characters after ``sn=`` in *url*, or None if absent."""
    if "sn=" not in url:
        return None
    return url.split("sn=")[1][:10]


def _is_wechat_only(html):
    """Return True when *html* contains one of the WeChat-only interstitial phrases."""
    return any(marker in html for marker in wechat_only_markers)


def _first_text(browser, xpaths):
    """Return the stripped text of the first XPath in *xpaths* that yields non-empty text.

    Lookup failures for one selector are ignored so the next one is tried;
    returns None when no selector produces text.
    """
    for xpath in xpaths:
        try:
            text = browser.find_element('xpath', xpath).text.strip()
        except Exception:
            # Not a bare ``except``: Ctrl-C / SystemExit must still propagate.
            continue
        if text:
            return text
    return None


def _safe_filename(title, max_len=100):
    """Replace characters that are invalid in file names and cap the length."""
    cleaned = re.sub(r'[\\/:*?"<>|\x00-\x1f]', '_', title)
    return cleaned[:max_len]


def _save_article(browser, article_url, out_dir, headers, cookie_jar):
    """Fetch one article and write it under *out_dir*.

    The page is loaded in the browser first. If the WeChat-only interstitial
    is shown, the URL is retried with requests and the raw HTML is saved (or
    the article is skipped when the retry hits the same interstitial).
    Otherwise title and body text are extracted and saved as a .txt file;
    if that extraction fails, the page source is saved as an error .html.
    """
    # Apply the WeChat headers to the browser session before navigating.
    browser.execute_cdp_cmd('Network.setExtraHTTPHeaders', {'headers': headers})
    browser.get(article_url)
    # Fixed wait to give the page time to finish loading.
    time.sleep(5)

    if _is_wechat_only(browser.page_source):
        print(f"文章需要在微信中打开,尝试使用requests直接获取:{article_url}")
        # A timeout keeps one stalled connection from hanging the whole run.
        response = requests.get(article_url, headers=headers, cookies=cookie_jar, timeout=30)
        if _is_wechat_only(response.text):
            print(f"使用requests仍然无法获取,跳过此文章:{article_url}")
            return
        filename = f"article_{_article_key(article_url) or 'unknown'}"
        save_path = f"{out_dir}/{filename}.html"
        with open(save_path, "w", encoding="utf-8") as f:
            f.write(response.text)
        print(f"已保存文章HTML内容:{save_path}")
        return

    try:
        title = _first_text(browser, title_xpaths)
        if not title:
            # Every selector failed: fall back to the browser window title.
            title = browser.title.replace(" - 微信公众号", "").strip()

        content = _first_text(browser, content_xpaths)
        if not content:
            # Keep the raw page so nothing is lost when extraction fails.
            content = "无法提取正文内容,保存页面源码:\n" + browser.page_source

        if not title:
            key = _article_key(article_url)
            title = "未知标题_" + key if key else "未知标题"

        save_path = f"{out_dir}/{_safe_filename(title)}.txt"
        with open(save_path, "w", encoding="utf-8") as f:
            f.write(f"标题:{title}\n\n")
            f.write(f"链接:{article_url}\n\n")
            f.write(f"内容:\n{content}")

        print(f"文章《{title}》保存成功:{save_path}")
    except Exception as e:
        print(f"提取文章内容失败:{str(e)}")
        # Save the page source so the failure can be analysed afterwards.
        key = _article_key(article_url)
        error_filename = "error_" + key if key else "error_page"
        error_path = f"{out_dir}/{error_filename}.html"
        with open(error_path, "w", encoding="utf-8") as f:
            f.write(browser.page_source)
        print(f"已保存页面源码到:{error_path}")


try:
    for article_url in article_urls:
        print("正在爬取文章:" + article_url)
        try:
            _save_article(driver, article_url, logs_dir, wechat_headers, cookies)
        except Exception as e:
            print(f"爬取文章失败:{article_url},错误信息:{str(e)}")
        # Randomised pause after every article, including skipped and failed
        # ones, so requests are never issued back-to-back.
        time.sleep(random.uniform(3, 7))
finally:
    # Always close the browser, even when the run is interrupted.
    driver.quit()
print("所有文章爬取完成!")
|
|
|
|
|
|
|
|
|
|
|
|