From 8bfd82e6de7ffe432b3674d2ab764e823f841b58 Mon Sep 17 00:00:00 2001 From: HuangHai <10402852@qq.com> Date: Mon, 14 Jul 2025 14:32:49 +0800 Subject: [PATCH] 'commit' --- dsLightRag/.idea/dsLightRag.iml | 2 +- dsLightRag/.idea/misc.xml | 2 +- dsLightRag/Test/TestCrawl.py | 67 +++++++++++++++++++++++++++++++++ 3 files changed, 69 insertions(+), 2 deletions(-) create mode 100644 dsLightRag/Test/TestCrawl.py diff --git a/dsLightRag/.idea/dsLightRag.iml b/dsLightRag/.idea/dsLightRag.iml index 4ceb6f94..880d61c1 100644 --- a/dsLightRag/.idea/dsLightRag.iml +++ b/dsLightRag/.idea/dsLightRag.iml @@ -2,7 +2,7 @@ - + diff --git a/dsLightRag/.idea/misc.xml b/dsLightRag/.idea/misc.xml index 0bad5868..0f9b3bc1 100644 --- a/dsLightRag/.idea/misc.xml +++ b/dsLightRag/.idea/misc.xml @@ -3,5 +3,5 @@ - + \ No newline at end of file diff --git a/dsLightRag/Test/TestCrawl.py b/dsLightRag/Test/TestCrawl.py new file mode 100644 index 00000000..b9d6ff78 --- /dev/null +++ b/dsLightRag/Test/TestCrawl.py @@ -0,0 +1,67 @@ +# 详解(一)Python + Selenium 批量采集微信公众号,搭建自己的微信公众号每日AI简报,告别信息焦虑 +# https://blog.csdn.net/k352733625/article/details/149222945 +import logging +# 1、安装Firefox软件【最新】 +# https://www.firefox.com.cn/download/#product-desktop-release + +# 2、下载geckodriver驱动【最新】 +# https://splinter-docs-zh-cn.readthedocs.io/zh/latest/drivers/firefox.html +# https://github.com/mozilla/geckodriver/releases + +""" +# 查看selenium版本 +pip show selenium +4.34.2 + +# 查看Chrome浏览器版本 +chrome://version/ +138.0.7204.101 (正式版本) (64 位) + +# 下载驱动包 +https://googlechromelabs.github.io/chrome-for-testing/ +https://storage.googleapis.com/chrome-for-testing-public/138.0.7204.94/win64/chromedriver-win64.zip +""" +import time, random, re, json, requests +from selenium import webdriver +from selenium.webdriver import Chrome +from selenium.webdriver.firefox.options import Options +from selenium.webdriver.common.by import By +from selenium.webdriver.chrome.service import Service as ChromeService +import requests +import json 
import datetime


def weChat_login(driver_path=r"C:\Windows\System32\chromedriver.exe", scan_wait=20):
    """Log in to the WeChat Official Accounts site by QR code and return its cookies.

    A Chrome window is opened on https://mp.weixin.qq.com/ so the operator can
    scan the login QR code with a phone. After the wait the page is loaded
    again and the browser's cookies are collected.

    Args:
        driver_path: Path to the chromedriver executable.
        scan_wait: Seconds to wait for the operator to scan the QR code.

    Returns:
        A JSON object string mapping cookie names to cookie values, or None
        when the ``slave_sid`` cookie is absent (treated as a failed login).
    """
    logging.info("启动浏览器,打开微信公众号登录界面")
    # No headless option is applied: the QR code has to be visible on screen
    # for the operator to scan it.
    service = ChromeService(executable_path=driver_path)
    driver = webdriver.Chrome(service=service)
    try:
        # Open the Official Accounts login page.
        driver.get('https://mp.weixin.qq.com/')
        # Wait 2 seconds so the page can render before prompting the user.
        time.sleep(2)
        logging.info("请拿手机扫码二维码登录公众号")
        time.sleep(scan_wait)
        # Load the page again; after a successful login it shows the account
        # back-end home page, and the cookies are read from that session.
        driver.get('https://mp.weixin.qq.com/')
        # get_cookies() returns a list of dicts; flatten it to name -> value.
        cookies = {item['name']: item['value'] for item in driver.get_cookies()}
    finally:
        # Always shut the browser down so a failed or interrupted login does
        # not leave a Chrome/chromedriver process behind.
        driver.quit()

    if "slave_sid" not in cookies:
        logging.info("登录公众号失败,获取cookie失败")
        return None
    return json.dumps(cookies)


if __name__ == '__main__':
    # The root logger defaults to WARNING, which would hide the INFO-level
    # prompt asking the user to scan the QR code.
    logging.basicConfig(level=logging.INFO)
    cookie_str = weChat_login()
    print(cookie_str)