You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

152 lines
6.2 KiB

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

# 详解Python + Selenium 批量采集微信公众号搭建自己的微信公众号每日AI简报告别信息焦虑
# https://blog.csdn.net/k352733625/article/details/149222945
# 微信爬爬猫---公众号文章抓取代码分析
# https://blog.csdn.net/yajuanpi4899/article/details/121584268
import datetime
import logging
import random
import re
import requests
# 1、安装Firefox软件【最新】
# https://www.firefox.com.cn/download/#product-desktop-release
# 2、下载geckodriver驱动【最新】
# https://splinter-docs-zh-cn.readthedocs.io/zh/latest/drivers/firefox.html
# https://github.com/mozilla/geckodriver/releases
# 3、Python爬虫实战系列微信公众号文章爬取的5种技术方案总结及代码示例
# 方案5微信公众号后台引用链接方式爬取
# https://blog.csdn.net/Python_trys/article/details/146506009
"""
# 查看selenium版本
pip show selenium
4.34.2
# 查看Chrome浏览器版本
chrome://version/
138.0.7204.101 (正式版本) (64 位)
# 下载驱动包
https://googlechromelabs.github.io/chrome-for-testing/
https://storage.googleapis.com/chrome-for-testing-public/138.0.7204.94/win64/chromedriver-win64.zip
"""
import time
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service as ChromeService
import json
if __name__ == '__main__':
    # Script flow:
    #   1. Open the WeChat Official Accounts login page in Chrome and wait for
    #      the operator to scan the QR code with a phone.
    #   2. Copy the browser cookies into a plain dict usable by `requests`.
    #   3. Request the home page without following redirects and extract the
    #      numeric `token` from the redirect URL.
    #   4. For every configured account, look up its `fakeid` through the
    #      search endpoint and fetch the first page of its article list.
    #   5. Print the collected article URLs.
    #
    # NOTE: logging is not configured in this block, so messages below the
    # default WARNING level are dropped unless logging is configured elsewhere.

    # Upper bound (seconds) for each HTTP call so a stalled connection cannot
    # hang the script forever.
    REQUEST_TIMEOUT = 10

    # Headers sent with every API request.
    header = {
        "HOST": "mp.weixin.qq.com",
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:53.0) Gecko/20100101 Firefox/53.0"
    }

    # Start Chrome through the locally installed chromedriver. The browser is
    # started with default options (not headless): the QR code has to be
    # visible on screen for the operator to scan it.
    logging.info("启动浏览器,打开微信公众号登录界面")
    service = ChromeService(executable_path=r"C:\Windows\System32\chromedriver.exe")
    driver = webdriver.Chrome(service=service)
    try:
        # Open the login page and give it a moment to render.
        driver.get('https://mp.weixin.qq.com/')
        time.sleep(2)
        # Wait for the operator to scan the QR code.
        logging.info("请拿手机扫码二维码登录公众号")
        time.sleep(20)
        # Reload the page; after a successful login the session cookies are
        # present in the browser.
        driver.get('https://mp.weixin.qq.com/')
        # Selenium returns a list of cookie dicts; flatten it to name -> value.
        cookies = {
            cookie_item['name']: cookie_item['value']
            for cookie_item in driver.get_cookies()
        }
    finally:
        # The browser is only needed to obtain the cookies; always release it,
        # even when one of the steps above raised.
        driver.quit()

    # The script treats a missing `slave_sid` cookie as a failed login.
    if "slave_sid" not in cookies:
        logging.error("登录公众号失败获取cookie失败")
        raise SystemExit(1)

    # With valid cookies the home page answers with a redirect whose URL
    # carries the session token, so redirects must not be followed here.
    url = 'https://mp.weixin.qq.com'
    response = requests.get(
        url=url,
        allow_redirects=False,
        cookies=cookies,
        timeout=REQUEST_TIMEOUT,
    )
    redirect_url = response.headers.get("Location")
    if redirect_url is None:
        logging.error("No redirect received from %s; cannot obtain token", url)
        raise SystemExit(1)
    print("重定向URL:", redirect_url)

    token_match = re.search(r'token=(\d+)', redirect_url)
    if token_match is None:
        logging.error("No token found in redirect URL: %s", redirect_url)
        raise SystemExit(1)
    token = token_match.group(1)
    print("获取到的token:", token)
    logging.info("微信token:%s", token)

    article_urls = []
    # Accounts to collect from.
    gzlist = [{"account_name": "长春教育八卦阵", "account_id": "jybg100"}]
    for account in gzlist:
        account_name = account["account_name"]
        account_id = account["account_id"]

        # Account search endpoint; needs the token, a random number and the
        # account name to search for.
        search_url = 'https://mp.weixin.qq.com/cgi-bin/searchbiz?'
        query_id = {
            'action': 'search_biz',
            'token': token,
            'lang': 'zh_CN',
            'f': 'json',
            'ajax': '1',
            'random': random.random(),
            'query': account_name,
            'begin': '0',
            'count': '5'
        }
        search_response = requests.get(
            search_url,
            cookies=cookies,
            headers=header,
            params=query_id,
            timeout=REQUEST_TIMEOUT,
        )
        # Use the first search hit; skip the account when nothing came back
        # instead of crashing on an empty or missing list.
        candidates = search_response.json().get('list') or []
        if not candidates:
            logging.warning("No search result for account %s", account_name)
            continue
        # The fakeid identifies the account in the article-list request.
        fakeid = candidates[0].get('fakeid')
        logging.info("fakeid:%s", fakeid)

        # Article list endpoint; needs the token, the target account's fakeid
        # and a random number.
        appmsg_url = 'https://mp.weixin.qq.com/cgi-bin/appmsg?'
        query_id_data = {
            'token': token,
            'lang': 'zh_CN',
            'f': 'json',
            'ajax': '1',
            'random': random.random(),
            'action': 'list_ex',
            'begin': '0',  # paging offset; increases by 5 per page
            'count': '5',
            'query': '',
            'fakeid': fakeid,
            'type': '9'
        }
        query_fakeid_response = requests.get(
            appmsg_url,
            cookies=cookies,
            headers=header,
            params=query_id_data,
            timeout=REQUEST_TIMEOUT,
        )
        articles = query_fakeid_response.json().get('app_msg_list') or []
        if not articles:
            logging.warning("No articles returned for account %s", account_name)
            continue

        # Only the first article of the first page is collected.
        article = articles[0]
        publish_time = datetime.datetime.fromtimestamp(int(article.get("update_time")))
        new_article = {
            'title': article.get('title'),
            'article_url': article.get('link'),
            'account_id': account_id,
            'account_name': account_name,
            'publish_time': publish_time.strftime('%Y-%m-%d %H:%M:%S'),
            'collection_time': datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        }
        # Pass the dict as a %-argument; the previous call had no placeholder
        # for it, which makes the logging module report a formatting error.
        logging.info("new_article:%s", new_article)
        article_urls.append(article.get('link'))
        # Pause between accounts before the next round of requests.
        time.sleep(2)

    for article_url in article_urls:
        print("正在爬取文章:" + article_url)