# NOTE(review): removed repository-hosting web-page residue that was pasted
# into this file (topic-limit hint, "271 lines / 13 KiB", ambiguous-Unicode
# warning). It was not Python and made the file unparseable.
# 详解Python + Selenium 批量采集微信公众号搭建自己的微信公众号每日AI简报告别信息焦虑
# https://blog.csdn.net/k352733625/article/details/149222945
# 微信爬爬猫---公众号文章抓取代码分析
# https://blog.csdn.net/yajuanpi4899/article/details/121584268
import datetime
import logging
import random
import re
import os
import requests
"""
# 查看selenium版本
pip show selenium
4.34.2
# 查看Chrome浏览器版本
chrome://version/
138.0.7204.101 (正式版本) 64 位)
# 下载驱动包
https://googlechromelabs.github.io/chrome-for-testing/
https://storage.googleapis.com/chrome-for-testing-public/138.0.7204.94/win64/chromedriver-win64.zip
"""
import time
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service as ChromeService
import json
if __name__ == '__main__':
    # WeChat Official Account crawler:
    #   1. log in to mp.weixin.qq.com via Selenium (QR-code scan) and capture cookies
    #   2. recover the admin `token` from the post-login redirect URL
    #   3. use the searchbiz / appmsg JSON endpoints to list articles of target accounts
    #   4. fetch each article page and save title/body under ./Test/Logs
    #
    # FIX: configure logging — without this every logging.info() below is
    # silently dropped (root logger defaults to WARNING).
    logging.basicConfig(level=logging.INFO)

    # Login cookies collected from the Selenium session (name -> value).
    cookies = {}

    # Request headers for the mp.weixin.qq.com JSON endpoints — the
    # WeChat built-in browser User-Agent avoids being rejected as a plain bot.
    header = {
        "HOST": "mp.weixin.qq.com",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.116 Safari/537.36 QBCore/4.0.1301.400 QQBrowser/9.0.2524.400 Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2875.116 Safari/537.36 NetType/WIFI MicroMessenger/7.0.20.1781(0x6700143B) WindowsWechat(0x63010200)",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
        "Accept-Encoding": "gzip, deflate, br",
        "Accept-Language": "zh-CN,zh;q=0.8,en-US;q=0.6,en;q=0.5;q=0.4",
        "Connection": "keep-alive"
    }

    # Launch Chrome and open the Official Account login page.
    logging.info("启动浏览器,打开微信公众号登录界面")
    options = Options()
    # options.add_argument('-headless')  # headless mode; keep commented out while debugging
    # Spoof the WeChat built-in browser User-Agent for the Selenium session too.
    options.add_argument('--user-agent=Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.116 Safari/537.36 QBCore/4.0.1301.400 QQBrowser/9.0.2524.400 Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2875.116 Safari/537.36 NetType/WIFI MicroMessenger/7.0.20.1781(0x6700143B) WindowsWechat(0x63010200)')
    service = ChromeService(executable_path=r"C:\Windows\System32\chromedriver.exe")
    driver = webdriver.Chrome(service=service, options=options)
    # FIX: try/finally guarantees the browser is closed even when the
    # script aborts with an exception (the original leaked the Chrome process).
    try:
        # Open the Official Account login page.
        driver.get('https://mp.weixin.qq.com/')
        time.sleep(2)
        # Scan the QR code with the phone to log in; allow 20 s for the scan.
        logging.info("请拿手机扫码二维码登录公众号")
        time.sleep(20)
        # Reload after login — now we land on the admin dashboard and the
        # session cookies are available from the driver.
        driver.get('https://mp.weixin.qq.com/')
        # get_cookies() returns a list of dicts; flatten into name -> value.
        for cookie_item in driver.get_cookies():
            cookies[cookie_item['name']] = cookie_item['value']
        # "slave_sid" is only present after a successful login.
        if "slave_sid" not in cookies:
            logging.info("登录公众号失败获取cookie失败")
            raise SystemExit(1)

        # Recover the admin token: an authenticated request to the root URL
        # redirects to .../home?...&token=NNNN — read it from the Location header.
        url = 'https://mp.weixin.qq.com'
        response = requests.get(url=url, allow_redirects=False, cookies=cookies)
        token = None
        if 'Location' in response.headers:
            redirect_url = response.headers.get("Location")
            print("重定向URL:", redirect_url)
            token_match = re.findall(r'token=(\d+)', redirect_url)
            if token_match:
                token = token_match[0]
                print("获取到的token:", token)
                logging.info("微信token:" + token)
        # FIX: the original fell through with `token` undefined (NameError later)
        # when the redirect or the token= parameter was missing.
        if token is None:
            logging.info("获取token失败")
            raise SystemExit(1)

        article_urls = []
        # Target accounts: display name + account id.
        gzlist = [{"account_name": "长春教育八卦阵", "account_id": "jybg100"}]
        for item in gzlist:
            account_name = item["account_name"]
            account_id = item["account_id"]
            # Account-search endpoint; needs token, a random number and the query.
            search_url = 'https://mp.weixin.qq.com/cgi-bin/searchbiz?'
            query_id = {
                'action': 'search_biz',
                'token': token,
                'lang': 'zh_CN',
                'f': 'json',
                'ajax': '1',
                'random': random.random(),
                'query': account_name,
                'begin': '0',
                'count': '5'
            }
            search_response = requests.get(search_url, cookies=cookies, headers=header, params=query_id)
            # Take the first search hit and keep its fakeid — the article-list
            # endpoint identifies the account by this field.
            lists = search_response.json().get('list')[0]
            fakeid = lists.get('fakeid')
            logging.info("fakeid:" + fakeid)

            # Article-list endpoint for the account identified by fakeid.
            appmsg_url = 'https://mp.weixin.qq.com/cgi-bin/appmsg?'
            query_id_data = {
                'token': token,
                'lang': 'zh_CN',
                'f': 'json',
                'ajax': '1',
                'random': random.random(),
                'action': 'list_ex',
                'begin': '0',  # paging offset: +5 per page
                'count': '5',
                'query': '',
                'fakeid': fakeid,
                'type': '9'
            }
            query_fakeid_response = requests.get(appmsg_url, cookies=cookies, headers=header, params=query_id_data)
            fakeid_list = query_fakeid_response.json().get('app_msg_list')
            # FIX: renamed the loop variable (was `item`, shadowing the outer
            # account loop variable).
            for msg in fakeid_list:
                new_article = {
                    'title': msg.get('title'),
                    'article_url': msg.get('link'),
                    'account_id': account_id,
                    'account_name': account_name,
                    'publish_time': datetime.datetime.fromtimestamp(int(msg.get("update_time"))).strftime('%Y-%m-%d %H:%M:%S'),
                    'collection_time': datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
                }
                print("new_article:", new_article)
                # FIX: logging.info was called print-style with a second
                # positional arg — use lazy %-formatting instead.
                logging.info("new_article: %s", new_article)
                article_urls.append(msg.get('link'))
            time.sleep(1)

        # Make sure the output directory exists.
        logs_dir = "./Test/Logs"
        os.makedirs(logs_dir, exist_ok=True)

        for article_url in article_urls:
            print("正在爬取文章:" + article_url)
            try:
                # Headers mimicking the WeChat in-app (Android) browser for the
                # article pages themselves.
                wechat_headers = {
                    "User-Agent": "Mozilla/5.0 (Linux; Android 10; MI 8 Build/QKQ1.190828.002; wv) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/83.0.4103.101 Mobile Safari/537.36 XWEB/1768 MMWEBSDK/20210302 MMWEBID/6253 MicroMessenger/8.0.2.1860(0x28000234) Process/toolsmp WeChat/arm64 Weixin NetType/WIFI Language/zh_CN ABI/arm64",
                    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
                    "Accept-Encoding": "gzip, deflate",
                    "Accept-Language": "zh-CN,zh;q=0.9,en-US;q=0.8,en;q=0.7",
                    "X-Requested-With": "com.tencent.mm",
                    "Referer": "https://mp.weixin.qq.com/"
                }
                # Inject the headers into the Selenium session via CDP, then load the article.
                driver.execute_cdp_cmd('Network.setExtraHTTPHeaders', {'headers': wechat_headers})
                driver.get(article_url)
                # Give the page time to fully render.
                time.sleep(5)
                # Some articles refuse non-WeChat clients; fall back to requests.
                if "请在微信客户端中打开链接" in driver.page_source or "请在微信中打开此链接" in driver.page_source:
                    print(f"文章需要在微信中打开尝试使用requests直接获取{article_url}")
                    response = requests.get(article_url, headers=wechat_headers, cookies=cookies)
                    if "请在微信客户端中打开链接" in response.text or "请在微信中打开此链接" in response.text:
                        print(f"使用requests仍然无法获取跳过此文章{article_url}")
                        continue
                    else:
                        # Derive a file name from the article's sn= parameter.
                        filename = f"article_{article_url.split('sn=')[1][:10] if 'sn=' in article_url else 'unknown'}"
                        # FIX: the original wrote to a literal "(unknown).html",
                        # overwriting the same file for every article.
                        save_path = f"{logs_dir}/{filename}.html"
                        with open(save_path, "w", encoding="utf-8") as f:
                            f.write(response.text)
                        print(f"已保存文章HTML内容{save_path}")
                        continue
                try:
                    # Try several known title selectors — the markup varies
                    # between article templates.
                    title_selectors = [
                        '//h1[@class="rich_media_title"]',
                        '//h1[@id="activity-name"]',
                        '//h2[@class="rich_media_title"]',
                        '//div[@class="rich_media_content"]//h1',
                        '//div[@id="js_article"]//h1'
                    ]
                    title = None
                    for selector in title_selectors:
                        try:
                            title = driver.find_element('xpath', selector).text.strip()
                            if title:
                                break
                        except Exception:  # FIX: was a bare except
                            continue
                    if not title:
                        # Fall back to the window title.
                        title = driver.title.replace(" - 微信公众号", "").strip()
                    # Same approach for the article body.
                    content_selectors = [
                        '//div[@class="rich_media_content"]',
                        '//div[@id="js_content"]',
                        '//div[@class="rich_media_wrp"]'
                    ]
                    content = None
                    for selector in content_selectors:
                        try:
                            content = driver.find_element('xpath', selector).text.strip()
                            if content:
                                break
                        except Exception:  # FIX: was a bare except
                            continue
                    if not content:
                        # As a last resort keep the raw page source.
                        content = "无法提取正文内容,保存页面源码:\n" + driver.page_source
                    if not title:
                        title = "未知标题_" + article_url.split("sn=")[1][:10] if "sn=" in article_url else "未知标题"
                    # Strip characters that are illegal in Windows file names.
                    filename = re.sub(r'[\\/:*?"<>|]', '_', title)
                    # FIX: use the sanitized title, not the literal "(unknown)".
                    save_path = f"{logs_dir}/{filename}.txt"
                    with open(save_path, "w", encoding="utf-8") as f:
                        f.write(f"标题:{title}\n\n")
                        f.write(f"链接:{article_url}\n\n")
                        f.write(f"内容:\n{content}")
                    print(f"文章《{title}》保存成功:{save_path}")
                except Exception as e:
                    print(f"提取文章内容失败:{str(e)}")
                    # Keep the page source for post-mortem analysis.
                    error_filename = "error_" + article_url.split("sn=")[1][:10] if "sn=" in article_url else "error_page"
                    error_path = f"{logs_dir}/{error_filename}.html"
                    with open(error_path, "w", encoding="utf-8") as f:
                        f.write(driver.page_source)
                    print(f"已保存页面源码到:{error_path}")
                # Random pause to avoid being rate-limited / banned.
                time.sleep(random.uniform(3, 7))
            except Exception as e:
                print(f"爬取文章失败:{article_url},错误信息:{str(e)}")
                continue
    finally:
        # Always release the browser, even on failure.
        driver.quit()
    print("所有文章爬取完成!")