# 微信爬爬猫---公众号文章抓取代码分析
# https://blog.csdn.net/yajuanpi4899/article/details/121584268

"""
Prerequisite: install the pdfkit library

    pip3 install pdfkit -i http://pypi.douban.com/simple --trusted-host pypi.douban.com

Basic usage example:

    import pdfkit
    pdfkit.from_url('公众号文章地址', 'out.pdf')
"""

import datetime
import json
import logging
import random
import re
# NOTE(review): in the full file this chunk sits inside `if __name__ == '__main__':`
# (per the surrounding diff context); indentation is normalized to top level here.
# `webdriver`, `Options`, `ChromeService`, `requests`, `time`, `header` and
# `cookies` are defined in parts of the file outside this chunk — verify.

# Launch Chrome via webdriver and open the WeChat Official Account login page.
logging.info("启动浏览器,打开微信公众号登录界面")
options = Options()
# options.add_argument('-headless')  # headless mode; keep commented out while debugging

# Spoof the WeChat built-in browser's User-Agent so mp.weixin.qq.com serves
# the expected pages.
options.add_argument(
    '--user-agent=Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.116 Safari/537.36 QBCore/4.0.1301.400 QQBrowser/9.0.2524.400 Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2875.116 Safari/537.36 NetType/WIFI MicroMessenger/7.0.20.1781(0x6700143B) WindowsWechat(0x63010200)')

service = ChromeService(executable_path=r"C:\Windows\System32\chromedriver.exe")
driver = webdriver.Chrome(service=service, options=options)

exit()  # NOTE(review): the branch/guard wrapping this call is outside this chunk — confirm placement
# cookies = json.dumps(post)  # deliberately left commented out

# Persist the cookies to disk so later runs can reuse the login session.
url = 'https://mp.weixin.qq.com'
with open('cookies.txt', mode='w', encoding="utf-8") as f:
    f.write(json.dumps(cookies))

# Method 3: request with redirects disabled so the Location header exposes
# the logged-in URL, which carries the session token.
response = requests.get(url=url, allow_redirects=False, cookies=cookies)
if 'Location' in response.headers:
    redirect_url = response.headers.get("Location")
    print("重定向URL:", redirect_url)
    token_match = re.findall(r'token=(\d+)', redirect_url)
    if token_match:
        token = token_match[0]
        print("获取到的token:", token)
        logging.info("微信token:" + token)

article_urls = []
gzlist = [{"account_name": "长春教育八卦阵", "account_id": "jybg100"}]
for item in gzlist:
    account_name = item["account_name"]
    account_id = item["account_id"]

    # Endpoint for searching official accounts by name.
    search_url = 'https://mp.weixin.qq.com/cgi-bin/searchbiz?'
    # Required params: session token, a random number, and the account name to search.
    query_id = {
        'action': 'search_biz',
        'token': token,
        'lang': 'zh_CN',
        'f': 'json',
        'ajax': '1',
        'random': random.random(),
        'query': account_name,
        'begin': '0',
        'count': '5'
    }
    # Call the search endpoint with cookies, headers and params.
    search_response = requests.get(search_url, cookies=cookies, headers=header, params=query_id)
    # Take the first matching account from the results.
    lists = search_response.json().get('list')[0]
    # fakeid identifies this account in the article-list API below.
    fakeid = lists.get('fakeid')
    logging.info("fakeid:" + fakeid)

    # Endpoint for listing an account's published articles.
    appmsg_url = 'https://mp.weixin.qq.com/cgi-bin/appmsg?'
    # Required params: token, the target account's fakeid, and a random number.
    query_id_data = {
        'token': token,
        'lang': 'zh_CN',
        'f': 'json',
        'ajax': '1',
        'random': random.random(),
        'action': 'list_ex',
        'begin': '0',  # pagination offset: increases by 5 per page
        'count': '5',
        'query': '',
        'fakeid': fakeid,
        'type': '9'
    }
    query_fakeid_response = requests.get(appmsg_url, cookies=cookies, headers=header, params=query_id_data)
    fakeid_list = query_fakeid_response.json().get('app_msg_list')

    # Renamed from `item` to avoid shadowing the outer loop variable.
    for msg in fakeid_list:
        # Example of a collected record.
        new_article = {
            'title': msg.get('title'),
            'article_url': msg.get('link'),
            'account_id': account_id,
            'account_name': account_name,
            'publish_time': datetime.datetime.fromtimestamp(int(msg.get("update_time"))).strftime(
                '%Y-%m-%d %H:%M:%S'),
            'collection_time': datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        }
        # BUG FIX: logging.info("new_article:", new_article) passed a %-style
        # argument with no placeholder, which makes the logging module raise a
        # formatting error at emit time. Use lazy %s formatting instead.
        logging.info("new_article: %s", new_article)
        article_urls.append({
            "title": msg.get('title'),
            "url": msg.get('link'),
            "publish_time": datetime.datetime.fromtimestamp(int(msg.get("update_time"))).strftime('%Y-%m-%d %H:%M:%S')
        })
        # Throttle requests to avoid triggering anti-scraping limits.
        time.sleep(1)

for x in article_urls:
    print(x)

# Shut down the browser.
driver.quit()
print("所有文章爬取完成!")
# Final status message.
print("成功获取了cookies内容!")