diff --git a/dsLightRag/Test/T1_Login.py b/dsLightRag/Test/T1_Login.py index 6db1e6f1..8c4c57e0 100644 --- a/dsLightRag/Test/T1_Login.py +++ b/dsLightRag/Test/T1_Login.py @@ -4,15 +4,8 @@ # 微信爬爬猫---公众号文章抓取代码分析 # https://blog.csdn.net/yajuanpi4899/article/details/121584268 -""" -安装pdfkit库 -复制 -pip3 install pdfkit -i http://pypi.douban.com/simple --trusted-host pypi.douban.com -1. -import pdfkit -pdfkit.from_url('公众号文章地址', 'out.pdf') -""" import datetime +import json import logging import random import re @@ -52,11 +45,6 @@ if __name__ == '__main__': # 用webdriver启动谷歌浏览器 logging.info("启动浏览器,打开微信公众号登录界面") options = Options() - # options.add_argument('-headless') # 无头参数,调试时可以注释掉 - - # 设置微信内置浏览器的User-Agent - options.add_argument( - '--user-agent=Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.116 Safari/537.36 QBCore/4.0.1301.400 QQBrowser/9.0.2524.400 Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2875.116 Safari/537.36 NetType/WIFI MicroMessenger/7.0.20.1781(0x6700143B) WindowsWechat(0x63010200)') service = ChromeService(executable_path=r"C:\Windows\System32\chromedriver.exe") driver = webdriver.Chrome(service=service, options=options) @@ -81,82 +69,10 @@ if __name__ == '__main__': exit() # cookies = json.dumps(post) # 注释掉这一行 - # 方法3:使用requests库发送请求获取重定向URL - url = 'https://mp.weixin.qq.com' - response = requests.get(url=url, allow_redirects=False, cookies=cookies) - if 'Location' in response.headers: - redirect_url = response.headers.get("Location") - print("重定向URL:", redirect_url) - token_match = re.findall(r'token=(\d+)', redirect_url) - if token_match: - token = token_match[0] - print("获取到的token:", token) - logging.info("微信token:" + token) - - article_urls = [] - gzlist = [{"account_name": "长春教育八卦阵", "account_id": "jybg100"}] - for item in gzlist: - account_name = item["account_name"] - account_id = item["account_id"] - # 搜索微信公众号的接口地址 - search_url = 'https://mp.weixin.qq.com/cgi-bin/searchbiz?' - # 搜索微信公众号接口需要传入的参数,有三个变量:微信公众号token、随机数random、搜索的微信公众号名字 - query_id = { - 'action': 'search_biz', - 'token': token, - 'lang': 'zh_CN', - 'f': 'json', - 'ajax': '1', - 'random': random.random(), - 'query': account_name, - 'begin': '0', - 'count': '5' - } - # 打开搜索微信公众号接口地址,需要传入相关参数信息如:cookies、params、headers - search_response = requests.get(search_url, cookies=cookies, headers=header, params=query_id) - # 取搜索结果中的第一个公众号 - lists = search_response.json().get('list')[0] - # 获取这个公众号的fakeid,后面爬取公众号文章需要此字段 - fakeid = lists.get('fakeid') - logging.info("fakeid:" + fakeid) - # 微信公众号文章接口地址 - appmsg_url = 'https://mp.weixin.qq.com/cgi-bin/appmsg?' - # 搜索文章需要传入几个参数:登录的公众号token、要爬取文章的公众号fakeid、随机数random - query_id_data = { - 'token': token, - 'lang': 'zh_CN', - 'f': 'json', - 'ajax': '1', - 'random': random.random(), - 'action': 'list_ex', - 'begin': '0', # 不同页,此参数变化,变化规则为每页加5 - 'count': '5', - 'query': '', - 'fakeid': fakeid, - 'type': '9' - } - # 打开搜索的微信公众号文章列表页 - query_fakeid_response = requests.get(appmsg_url, cookies=cookies, headers=header, params=query_id_data) - fakeid_list = query_fakeid_response.json().get('app_msg_list') - - for item in fakeid_list: - # 采集item示例 - new_article = { - 'title': item.get('title'), - 'article_url': item.get('link'), - 'account_id': account_id, - 'account_name': account_name, - 'publish_time': datetime.datetime.fromtimestamp(int(item.get("update_time"))).strftime( - '%Y-%m-%d %H:%M:%S'), - 'collection_time': datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S') - } - logging.info("new_article:", new_article) - article_urls.append({"title":item.get('title'),"url":item.get('link'),"publish_time":datetime.datetime.fromtimestamp(int(item.get("update_time"))).strftime('%Y-%m-%d %H:%M:%S')}) - time.sleep(1) - - for x in article_urls: - print(x) - - # 关闭浏览器 + # 将cookies写入文件 + with open('cookies.txt', mode='w', encoding="utf-8") as f: + f.write(json.dumps(cookies)) + # 关闭浏览器 driver.quit() - print("所有文章爬取完成!") + # 输出提示 + print("成功获取了cookies内容!") diff --git a/dsLightRag/Test/T2_GetList.py b/dsLightRag/Test/T2_GetList.py index 6db1e6f1..d49c40b9 100644 --- a/dsLightRag/Test/T2_GetList.py +++ b/dsLightRag/Test/T2_GetList.py @@ -13,6 +13,7 @@ import pdfkit pdfkit.from_url('公众号文章地址', 'out.pdf') """ import datetime +import json import logging import random import re @@ -38,8 +39,12 @@ from selenium.webdriver.chrome.options import Options from selenium.webdriver.chrome.service import Service as ChromeService if __name__ == '__main__': - # 定义一个空的字典,存放cookies内容 - cookies = {} + # 从文件cookies.txt中获取 + with open('cookies.txt', 'r', encoding='utf-8') as f: + content = f.read() + # 使用json还原为json对象 + cookies = json.loads(content) + options = Options() # 设置headers - 使用微信内置浏览器的User-Agent header = { "HOST": "mp.weixin.qq.com", @@ -49,37 +54,9 @@ if __name__ == '__main__': "Accept-Language": "zh-CN,zh;q=0.8,en-US;q=0.6,en;q=0.5;q=0.4", "Connection": "keep-alive" } - # 用webdriver启动谷歌浏览器 - logging.info("启动浏览器,打开微信公众号登录界面") - options = Options() - # options.add_argument('-headless') # 无头参数,调试时可以注释掉 - - # 设置微信内置浏览器的User-Agent - options.add_argument( - '--user-agent=Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.116 Safari/537.36 QBCore/4.0.1301.400 QQBrowser/9.0.2524.400 Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2875.116 Safari/537.36 NetType/WIFI MicroMessenger/7.0.20.1781(0x6700143B) WindowsWechat(0x63010200)') service = ChromeService(executable_path=r"C:\Windows\System32\chromedriver.exe") driver = webdriver.Chrome(service=service, options=options) - # 打开微信公众号登录页面 - driver.get('https://mp.weixin.qq.com/') - # 等待5秒钟 - time.sleep(2) - # # 拿手机扫二维码! - logging.info("请拿手机扫码二维码登录公众号") - time.sleep(20) - - # 重新载入公众号登录页,登录之后会显示公众号后台首页,从这个返回内容中获取cookies信息 - driver.get('https://mp.weixin.qq.com/') - # 获取cookies - cookie_items = driver.get_cookies() - # 获取到的cookies是列表形式,将cookies转成json形式并存入本地名为cookie的文本中 - for cookie_item in cookie_items: - cookies[cookie_item['name']] = cookie_item['value'] - - if "slave_sid" not in cookies: - logging.info("登录公众号失败,获取cookie失败") - exit() - # cookies = json.dumps(post) # 注释掉这一行 # 方法3:使用requests库发送请求获取重定向URL url = 'https://mp.weixin.qq.com' @@ -151,11 +128,17 @@ if __name__ == '__main__': 'collection_time': datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S') } logging.info("new_article:", new_article) - article_urls.append({"title":item.get('title'),"url":item.get('link'),"publish_time":datetime.datetime.fromtimestamp(int(item.get("update_time"))).strftime('%Y-%m-%d %H:%M:%S')}) + article_urls.append({"title": item.get('title'), "url": item.get('link'), + "publish_time": datetime.datetime.fromtimestamp(int(item.get("update_time"))).strftime( + '%Y-%m-%d %H:%M:%S')}) time.sleep(1) for x in article_urls: print(x) + # 将返回的地址写入到文件 + with open('article_urls.txt', 'w', encoding='utf-8') as f: + for url in article_urls: + f.write(url + '\n') # 关闭浏览器 driver.quit() diff --git a/dsLightRag/Test/article_urls.txt b/dsLightRag/Test/article_urls.txt new file mode 100644 index 00000000..e69de29b diff --git a/dsLightRag/Test/cookies.txt b/dsLightRag/Test/cookies.txt new file mode 100644 index 00000000..4999d643 --- /dev/null +++ b/dsLightRag/Test/cookies.txt @@ -0,0 +1 @@ +{"_clsk": "1v8cz8t|1752541383487|1|1|mp.weixin.qq.com/weheat-agent/payload/record", "xid": "fff1911b542cde79c5c47a38cb3929c8", "data_bizuin": "3514353238", "slave_user": "gh_4f88a4e194da", "slave_sid": "cDlUaWlaek5RZHV6SUIyVWNNZlJGYTJQdHY5YzUyN29LMG94RlptUV9lbkVDUWxmaTBURFE5YWNKeVRkYlZSdU9VRnNjWXRKN2xfZ2pZd0JWal82aVpsRDhqUnJXQkdYMml4SlhrdGtGY2k2MG95YTlQVEFVanpIR01oZ3p4dldiME9hRE1zcGxZV0FlNTVV", "rand_info": "CAESIPFuk5/nui6QoQ6zEO2B5RfaUmjuQjTJOQVg9mBuI/XG", "data_ticket": "AIy4PwNlFMRBDHcZ7jcXDXf/8fFLl5NS25Nj3tYuDL8H4W8EiURU4G9Dakn7aSUC", "bizuin": "3514353238", "mm_lang": "zh_CN", "slave_bizuin": "3514353238", "uuid": "91eaae9bc5e4f725e03ee2b7e75c8a2c", "ua_id": "bbkG1LsuVI1DszGdAAAAADm2HzejXloc87mSyGEMpdY=", "wxuin": "52541365079710", "_clck": "1l32fbr|1|fxm|0"} \ No newline at end of file