# 详解Python + Selenium 批量采集微信公众号搭建自己的微信公众号每日AI简报告别信息焦虑
# https://blog.csdn.net/k352733625/article/details/149222945
# 微信爬爬猫---公众号文章抓取代码分析
# https://blog.csdn.net/yajuanpi4899/article/details/121584268
import datetime
import json
import logging
import random
import re
import sys
import time

import requests
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service as ChromeService

# 1、安装Firefox软件【最新】
# https://www.firefox.com.cn/download/#product-desktop-release
# 2、下载geckodriver驱动【最新】
# https://splinter-docs-zh-cn.readthedocs.io/zh/latest/drivers/firefox.html
# https://github.com/mozilla/geckodriver/releases
# 3、Python爬虫实战系列微信公众号文章爬取的5种技术方案总结及代码示例
# 方案5微信公众号后台引用链接方式爬取
# https://blog.csdn.net/Python_trys/article/details/146506009
"""
# 查看selenium版本
pip show selenium
4.34.2
# 查看Chrome浏览器版本
chrome://version/
138.0.7204.101 (正式版本) 64
# 下载驱动包
https://googlechromelabs.github.io/chrome-for-testing/
https://storage.googleapis.com/chrome-for-testing-public/138.0.7204.94/win64/chromedriver-win64.zip
"""
# Timestamp layout used for every human-readable time in an article record.
TIME_FORMAT = '%Y-%m-%d %H:%M:%S'
# The back-office redirect URL carries the session token as ``token=<digits>``.
TOKEN_PATTERN = re.compile(r'token=(\d+)')
# Seconds to wait on each HTTP call; without it ``requests`` can block forever.
REQUEST_TIMEOUT = 10
CHROMEDRIVER_PATH = r"C:\Windows\System32\chromedriver.exe"


def extract_token(redirect_url):
    """Return the numeric ``token`` value found in *redirect_url*.

    Returns ``None`` when the URL is empty/``None`` or carries no token.
    """
    if not redirect_url:
        return None
    match = TOKEN_PATTERN.search(redirect_url)
    return match.group(1) if match else None


def build_article_record(entry, account_id, account_name, now=None):
    """Build the record dict for one entry of the ``app_msg_list`` response.

    Args:
        entry: one article dict; ``title``, ``link`` and ``update_time`` are read.
        account_id: id of the account the article belongs to.
        account_name: display name of that account.
        now: collection time to stamp; defaults to the current local time.

    Returns:
        A dict with ``title``, ``article_url``, ``account_id``,
        ``account_name``, ``publish_time`` and ``collection_time``.
        ``publish_time`` is ``None`` when the entry has no ``update_time``.
    """
    raw_timestamp = entry.get("update_time")
    publish_time = None
    if raw_timestamp is not None:
        publish_time = datetime.datetime.fromtimestamp(int(raw_timestamp)).strftime(TIME_FORMAT)
    collected_at = now if now is not None else datetime.datetime.now()
    return {
        'title': entry.get('title'),
        'article_url': entry.get('link'),
        'account_id': account_id,
        'account_name': account_name,
        'publish_time': publish_time,
        'collection_time': collected_at.strftime(TIME_FORMAT),
    }


def login_and_collect_cookies(driver_path=CHROMEDRIVER_PATH, scan_wait_seconds=20):
    """Open the login page in Chrome, wait for a QR scan, return the cookies.

    The browser is started with a visible window on purpose: the operator has
    to scan the QR code shown on the page, so headless mode must not be used.
    The browser is always closed before returning, even on failure.

    Returns:
        A ``{cookie name: cookie value}`` dict taken from the browser session.
    """
    logging.info("启动浏览器,打开微信公众号登录界面")
    service = ChromeService(executable_path=driver_path)
    driver = webdriver.Chrome(service=service)
    try:
        driver.get('https://mp.weixin.qq.com/')
        # Give the page a couple of seconds to render the QR code.
        time.sleep(2)
        logging.info("请拿手机扫码二维码登录公众号")
        time.sleep(scan_wait_seconds)
        # Reload the page; the cookies are read from the session after this reload.
        driver.get('https://mp.weixin.qq.com/')
        return {cookie['name']: cookie['value'] for cookie in driver.get_cookies()}
    finally:
        driver.quit()


def fetch_token(cookies):
    """Request the home page without following redirects and parse the token.

    Returns the token string, or ``None`` when there is no ``Location``
    header or the redirect URL carries no token.
    """
    response = requests.get(url='https://mp.weixin.qq.com', allow_redirects=False,
                            cookies=cookies, timeout=REQUEST_TIMEOUT)
    redirect_url = response.headers.get("Location")
    if redirect_url is None:
        return None
    print("重定向URL:", redirect_url)
    return extract_token(redirect_url)


def find_fakeid(account_name, token, cookies, headers):
    """Search accounts by name and return the first hit's ``fakeid``.

    Returns ``None`` when the search response has no ``list`` entries.
    """
    params = {
        'action': 'search_biz',
        'token': token,
        'lang': 'zh_CN',
        'f': 'json',
        'ajax': '1',
        'random': random.random(),
        'query': account_name,
        'begin': '0',
        'count': '5',
    }
    response = requests.get('https://mp.weixin.qq.com/cgi-bin/searchbiz?', cookies=cookies,
                            headers=headers, params=params, timeout=REQUEST_TIMEOUT)
    candidates = response.json().get('list') or []
    if not candidates:
        return None
    return candidates[0].get('fakeid')


def fetch_article_entries(fakeid, token, cookies, headers, begin=0, count=5):
    """Return one page of article entries for the account *fakeid*.

    ``begin`` is the paging offset sent to the endpoint (the original notes
    say it grows by 5 per page). An absent ``app_msg_list`` yields ``[]``.
    """
    params = {
        'token': token,
        'lang': 'zh_CN',
        'f': 'json',
        'ajax': '1',
        'random': random.random(),
        'action': 'list_ex',
        'begin': str(begin),
        'count': str(count),
        'query': '',
        'fakeid': fakeid,
        'type': '9',
    }
    response = requests.get('https://mp.weixin.qq.com/cgi-bin/appmsg?', cookies=cookies,
                            headers=headers, params=params, timeout=REQUEST_TIMEOUT)
    return response.json().get('app_msg_list') or []


def main():
    """Log in via QR code, resolve each account's fakeid and list its articles."""
    # Without a configured handler the INFO prompts below are never displayed.
    logging.basicConfig(level=logging.INFO)

    header = {
        "HOST": "mp.weixin.qq.com",
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:53.0) Gecko/20100101 Firefox/53.0"
    }

    cookies = login_and_collect_cookies()
    # The script treats a missing ``slave_sid`` cookie as a failed login.
    if "slave_sid" not in cookies:
        logging.error("登录公众号失败获取cookie失败")
        sys.exit(1)

    token = fetch_token(cookies)
    if token is None:
        # Every later request needs the token, so stop instead of hitting a NameError.
        logging.error("Could not obtain a token from the redirect URL")
        sys.exit(1)
    print("获取到的token:", token)
    logging.info("微信token:" + token)

    article_urls = []
    gzlist = [{"account_name": "长春教育八卦阵", "account_id": "jybg100"}]
    for account in gzlist:
        account_name = account["account_name"]
        account_id = account["account_id"]

        fakeid = find_fakeid(account_name, token, cookies, header)
        if not fakeid:
            logging.warning("No search result for account %s; skipping", account_name)
            continue
        logging.info("fakeid:" + fakeid)

        for entry in fetch_article_entries(fakeid, token, cookies, header):
            new_article = build_article_record(entry, account_id, account_name)
            logging.info("new_article: %s", new_article)
            if new_article['article_url']:
                article_urls.append(new_article['article_url'])
        # Pause between accounts to avoid hammering the endpoints.
        time.sleep(1)

    for article_url in article_urls:
        print("正在爬取文章:" + article_url)


if __name__ == '__main__':
    main()