From be105894af0613741967ec2eb153cce1ef09e82f Mon Sep 17 00:00:00 2001 From: HuangHai <10402852@qq.com> Date: Mon, 14 Jul 2025 15:07:59 +0800 Subject: [PATCH] 'commit' --- dsLightRag/Test/TestCrawl.py | 94 +++++++++++++++++++++++++++++++----- 1 file changed, 82 insertions(+), 12 deletions(-) diff --git a/dsLightRag/Test/TestCrawl.py b/dsLightRag/Test/TestCrawl.py index ba3d7b66..95e70278 100644 --- a/dsLightRag/Test/TestCrawl.py +++ b/dsLightRag/Test/TestCrawl.py @@ -1,6 +1,11 @@ # 详解(一)Python + Selenium 批量采集微信公众号,搭建自己的微信公众号每日AI简报,告别信息焦虑 # https://blog.csdn.net/k352733625/article/details/149222945 + +# 微信爬爬猫---公众号文章抓取代码分析 +# https://blog.csdn.net/yajuanpi4899/article/details/121584268 +import datetime import logging +import random import re import requests @@ -12,6 +17,10 @@ import requests # https://splinter-docs-zh-cn.readthedocs.io/zh/latest/drivers/firefox.html # https://github.com/mozilla/geckodriver/releases +# 3、Python爬虫实战系列:微信公众号文章爬取的5种技术方案总结及代码示例! +# 方案5:微信公众号后台引用链接方式爬取 +# https://blog.csdn.net/Python_trys/article/details/146506009 + """ # 查看selenium版本 pip show selenium @@ -31,12 +40,14 @@ from selenium.webdriver.chrome.options import Options from selenium.webdriver.chrome.service import Service as ChromeService import json - - - if __name__ == '__main__': # 定义一个空的字典,存放cookies内容 - post = {} + cookies = {} + # 设置headers + header = { + "HOST": "mp.weixin.qq.com", + "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:53.0) Gecko/20100101 Firefox/53.0" + } # 用webdriver启动谷歌浏览器 logging.info("启动浏览器,打开微信公众号登录界面") options = Options() @@ -50,22 +61,23 @@ if __name__ == '__main__': # # 拿手机扫二维码! 
logging.info("请拿手机扫码二维码登录公众号") time.sleep(20) + # 重新载入公众号登录页,登录之后会显示公众号后台首页,从这个返回内容中获取cookies信息 driver.get('https://mp.weixin.qq.com/') # 获取cookies cookie_items = driver.get_cookies() # 获取到的cookies是列表形式,将cookies转成json形式并存入本地名为cookie的文本中 for cookie_item in cookie_items: - post[cookie_item['name']] = cookie_item['value'] + cookies[cookie_item['name']] = cookie_item['value'] - if "slave_sid" not in post: + if "slave_sid" not in cookies: logging.info("登录公众号失败,获取cookie失败") exit() - cookies = json.dumps(post) + # cookies = json.dumps(post) # 注释掉这一行 # 方法3:使用requests库发送请求获取重定向URL url = 'https://mp.weixin.qq.com' - response = requests.get(url=url, allow_redirects=False, cookies=post) + response = requests.get(url=url, allow_redirects=False, cookies=cookies) if 'Location' in response.headers: redirect_url = response.headers.get("Location") print("重定向URL:", redirect_url) @@ -75,7 +87,65 @@ if __name__ == '__main__': print("获取到的token:", token) logging.info("微信token:" + token) - #url = 'https://mp.weixin.qq.com' - #response = requests.get(url=url, allow_redirects=False, cookies=cookies) - #token = re.findall(r'token=(\d+)', str(response.headers.get("Location")))[0] - #logging.info("微信token:" + token) + article_urls = [] + gzlist = [{"account_name": "长春教育八卦阵", "account_id": "jybg100"}] + for item in gzlist: + account_name = item["account_name"] + account_id = item["account_id"] + # 搜索微信公众号的接口地址 + search_url = 'https://mp.weixin.qq.com/cgi-bin/searchbiz?' 
+ # 搜索微信公众号接口需要传入的参数,有三个变量:微信公众号token、随机数random、搜索的微信公众号名字 + query_id = { + 'action': 'search_biz', + 'token': token, + 'lang': 'zh_CN', + 'f': 'json', + 'ajax': '1', + 'random': random.random(), + 'query': account_name, + 'begin': '0', + 'count': '5' + } + # 打开搜索微信公众号接口地址,需要传入相关参数信息如:cookies、params、headers + search_response = requests.get(search_url, cookies=cookies, headers=header, params=query_id) + # 取搜索结果中的第一个公众号 + lists = search_response.json().get('list')[0] + # 获取这个公众号的fakeid,后面爬取公众号文章需要此字段 + fakeid = lists.get('fakeid') + logging.info("fakeid:" + fakeid) + # 微信公众号文章接口地址 + appmsg_url = 'https://mp.weixin.qq.com/cgi-bin/appmsg?' + # 搜索文章需要传入几个参数:登录的公众号token、要爬取文章的公众号fakeid、随机数random + query_id_data = { + 'token': token, + 'lang': 'zh_CN', + 'f': 'json', + 'ajax': '1', + 'random': random.random(), + 'action': 'list_ex', + 'begin': '0', # 不同页,此参数变化,变化规则为每页加5 + 'count': '5', + 'query': '', + 'fakeid': fakeid, + 'type': '9' + } + # 打开搜索的微信公众号文章列表页 + query_fakeid_response = requests.get(appmsg_url, cookies=cookies, headers=header, params=query_id_data) + fakeid_list = query_fakeid_response.json().get('app_msg_list') + item = fakeid_list[0] + # 采集item示例 + new_article = { + 'title': item.get('title'), + 'article_url': item.get('link'), + 'account_id': account_id, + 'account_name': account_name, + 'publish_time': datetime.datetime.fromtimestamp(int(item.get("update_time"))).strftime('%Y-%m-%d %H:%M:%S'), + 'collection_time': datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S') + } + logging.info("new_article: %s", new_article) + article_urls.append(item.get('link')) + time.sleep(2) + + for article_url in article_urls: + print("正在爬取文章:" + article_url) +