main
HuangHai 1 week ago
parent d9583daf28
commit d57b2b94c5

@ -4,15 +4,8 @@
# 微信爬爬猫---公众号文章抓取代码分析 # 微信爬爬猫---公众号文章抓取代码分析
# https://blog.csdn.net/yajuanpi4899/article/details/121584268 # https://blog.csdn.net/yajuanpi4899/article/details/121584268
"""
安装pdfkit库
复制
pip3 install pdfkit -i http://pypi.douban.com/simple --trusted-host pypi.douban.com
1.
import pdfkit
pdfkit.from_url('公众号文章地址', 'out.pdf')
"""
import datetime import datetime
import json
import logging import logging
import random import random
import re import re
@ -52,11 +45,6 @@ if __name__ == '__main__':
# 用webdriver启动谷歌浏览器 # 用webdriver启动谷歌浏览器
logging.info("启动浏览器,打开微信公众号登录界面") logging.info("启动浏览器,打开微信公众号登录界面")
options = Options() options = Options()
# options.add_argument('-headless') # 无头参数,调试时可以注释掉
# 设置微信内置浏览器的User-Agent
options.add_argument(
'--user-agent=Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.116 Safari/537.36 QBCore/4.0.1301.400 QQBrowser/9.0.2524.400 Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2875.116 Safari/537.36 NetType/WIFI MicroMessenger/7.0.20.1781(0x6700143B) WindowsWechat(0x63010200)')
service = ChromeService(executable_path=r"C:\Windows\System32\chromedriver.exe") service = ChromeService(executable_path=r"C:\Windows\System32\chromedriver.exe")
driver = webdriver.Chrome(service=service, options=options) driver = webdriver.Chrome(service=service, options=options)
@ -81,82 +69,10 @@ if __name__ == '__main__':
exit() exit()
# cookies = json.dumps(post) # 注释掉这一行 # cookies = json.dumps(post) # 注释掉这一行
# 方法3使用requests库发送请求获取重定向URL # 将cookies写入文件
url = 'https://mp.weixin.qq.com' with open('cookies.txt', mode='w', encoding="utf-8") as f:
response = requests.get(url=url, allow_redirects=False, cookies=cookies) f.write(json.dumps(cookies))
if 'Location' in response.headers: # 关闭浏览器
redirect_url = response.headers.get("Location")
print("重定向URL:", redirect_url)
token_match = re.findall(r'token=(\d+)', redirect_url)
if token_match:
token = token_match[0]
print("获取到的token:", token)
logging.info("微信token:" + token)
article_urls = []
gzlist = [{"account_name": "长春教育八卦阵", "account_id": "jybg100"}]
for item in gzlist:
account_name = item["account_name"]
account_id = item["account_id"]
# 搜索微信公众号的接口地址
search_url = 'https://mp.weixin.qq.com/cgi-bin/searchbiz?'
# 搜索微信公众号接口需要传入的参数有三个变量微信公众号token、随机数random、搜索的微信公众号名字
query_id = {
'action': 'search_biz',
'token': token,
'lang': 'zh_CN',
'f': 'json',
'ajax': '1',
'random': random.random(),
'query': account_name,
'begin': '0',
'count': '5'
}
# 打开搜索微信公众号接口地址需要传入相关参数信息如cookies、params、headers
search_response = requests.get(search_url, cookies=cookies, headers=header, params=query_id)
# 取搜索结果中的第一个公众号
lists = search_response.json().get('list')[0]
# 获取这个公众号的fakeid后面爬取公众号文章需要此字段
fakeid = lists.get('fakeid')
logging.info("fakeid:" + fakeid)
# 微信公众号文章接口地址
appmsg_url = 'https://mp.weixin.qq.com/cgi-bin/appmsg?'
# 搜索文章需要传入几个参数登录的公众号token、要爬取文章的公众号fakeid、随机数random
query_id_data = {
'token': token,
'lang': 'zh_CN',
'f': 'json',
'ajax': '1',
'random': random.random(),
'action': 'list_ex',
'begin': '0', # 不同页此参数变化变化规则为每页加5
'count': '5',
'query': '',
'fakeid': fakeid,
'type': '9'
}
# 打开搜索的微信公众号文章列表页
query_fakeid_response = requests.get(appmsg_url, cookies=cookies, headers=header, params=query_id_data)
fakeid_list = query_fakeid_response.json().get('app_msg_list')
for item in fakeid_list:
# 采集item示例
new_article = {
'title': item.get('title'),
'article_url': item.get('link'),
'account_id': account_id,
'account_name': account_name,
'publish_time': datetime.datetime.fromtimestamp(int(item.get("update_time"))).strftime(
'%Y-%m-%d %H:%M:%S'),
'collection_time': datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
}
logging.info("new_article:", new_article)
article_urls.append({"title":item.get('title'),"url":item.get('link'),"publish_time":datetime.datetime.fromtimestamp(int(item.get("update_time"))).strftime('%Y-%m-%d %H:%M:%S')})
time.sleep(1)
for x in article_urls:
print(x)
# 关闭浏览器
driver.quit() driver.quit()
print("所有文章爬取完成!") # 输出提示
print("成功获取了cookies内容")

@ -13,6 +13,7 @@ import pdfkit
pdfkit.from_url('公众号文章地址', 'out.pdf') pdfkit.from_url('公众号文章地址', 'out.pdf')
""" """
import datetime import datetime
import json
import logging import logging
import random import random
import re import re
@ -38,8 +39,12 @@ from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service as ChromeService from selenium.webdriver.chrome.service import Service as ChromeService
if __name__ == '__main__': if __name__ == '__main__':
# 定义一个空的字典存放cookies内容 # 从文件cookies.txt中获取
cookies = {} with open('cookies.txt', 'r', encoding='utf-8') as f:
content = f.read()
# 使用json还原为json对象
cookies = json.loads(content)
options = Options()
# 设置headers - 使用微信内置浏览器的User-Agent # 设置headers - 使用微信内置浏览器的User-Agent
header = { header = {
"HOST": "mp.weixin.qq.com", "HOST": "mp.weixin.qq.com",
@ -49,37 +54,9 @@ if __name__ == '__main__':
"Accept-Language": "zh-CN,zh;q=0.8,en-US;q=0.6,en;q=0.5;q=0.4", "Accept-Language": "zh-CN,zh;q=0.8,en-US;q=0.6,en;q=0.5;q=0.4",
"Connection": "keep-alive" "Connection": "keep-alive"
} }
# 用webdriver启动谷歌浏览器
logging.info("启动浏览器,打开微信公众号登录界面")
options = Options()
# options.add_argument('-headless') # 无头参数,调试时可以注释掉
# 设置微信内置浏览器的User-Agent
options.add_argument(
'--user-agent=Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.116 Safari/537.36 QBCore/4.0.1301.400 QQBrowser/9.0.2524.400 Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2875.116 Safari/537.36 NetType/WIFI MicroMessenger/7.0.20.1781(0x6700143B) WindowsWechat(0x63010200)')
service = ChromeService(executable_path=r"C:\Windows\System32\chromedriver.exe") service = ChromeService(executable_path=r"C:\Windows\System32\chromedriver.exe")
driver = webdriver.Chrome(service=service, options=options) driver = webdriver.Chrome(service=service, options=options)
# 打开微信公众号登录页面
driver.get('https://mp.weixin.qq.com/')
# 等待5秒钟
time.sleep(2)
# # 拿手机扫二维码!
logging.info("请拿手机扫码二维码登录公众号")
time.sleep(20)
# 重新载入公众号登录页登录之后会显示公众号后台首页从这个返回内容中获取cookies信息
driver.get('https://mp.weixin.qq.com/')
# 获取cookies
cookie_items = driver.get_cookies()
# 获取到的cookies是列表形式将cookies转成json形式并存入本地名为cookie的文本中
for cookie_item in cookie_items:
cookies[cookie_item['name']] = cookie_item['value']
if "slave_sid" not in cookies:
logging.info("登录公众号失败获取cookie失败")
exit()
# cookies = json.dumps(post) # 注释掉这一行
# 方法3使用requests库发送请求获取重定向URL # 方法3使用requests库发送请求获取重定向URL
url = 'https://mp.weixin.qq.com' url = 'https://mp.weixin.qq.com'
@ -151,11 +128,17 @@ if __name__ == '__main__':
'collection_time': datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S') 'collection_time': datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
} }
logging.info("new_article:", new_article) logging.info("new_article:", new_article)
article_urls.append({"title":item.get('title'),"url":item.get('link'),"publish_time":datetime.datetime.fromtimestamp(int(item.get("update_time"))).strftime('%Y-%m-%d %H:%M:%S')}) article_urls.append({"title": item.get('title'), "url": item.get('link'),
"publish_time": datetime.datetime.fromtimestamp(int(item.get("update_time"))).strftime(
'%Y-%m-%d %H:%M:%S')})
time.sleep(1) time.sleep(1)
for x in article_urls: for x in article_urls:
print(x) print(x)
# 将返回的地址写入到文件
with open('article_urls.txt', 'w', encoding='utf-8') as f:
for url in article_urls:
f.write(url + '\n')
# 关闭浏览器 # 关闭浏览器
driver.quit() driver.quit()

@ -0,0 +1 @@
{"_clsk": "1v8cz8t|1752541383487|1|1|mp.weixin.qq.com/weheat-agent/payload/record", "xid": "fff1911b542cde79c5c47a38cb3929c8", "data_bizuin": "3514353238", "slave_user": "gh_4f88a4e194da", "slave_sid": "cDlUaWlaek5RZHV6SUIyVWNNZlJGYTJQdHY5YzUyN29LMG94RlptUV9lbkVDUWxmaTBURFE5YWNKeVRkYlZSdU9VRnNjWXRKN2xfZ2pZd0JWal82aVpsRDhqUnJXQkdYMml4SlhrdGtGY2k2MG95YTlQVEFVanpIR01oZ3p4dldiME9hRE1zcGxZV0FlNTVV", "rand_info": "CAESIPFuk5/nui6QoQ6zEO2B5RfaUmjuQjTJOQVg9mBuI/XG", "data_ticket": "AIy4PwNlFMRBDHcZ7jcXDXf/8fFLl5NS25Nj3tYuDL8H4W8EiURU4G9Dakn7aSUC", "bizuin": "3514353238", "mm_lang": "zh_CN", "slave_bizuin": "3514353238", "uuid": "91eaae9bc5e4f725e03ee2b7e75c8a2c", "ua_id": "bbkG1LsuVI1DszGdAAAAADm2HzejXloc87mSyGEMpdY=", "wxuin": "52541365079710", "_clck": "1l32fbr|1|fxm|0"}
Loading…
Cancel
Save