# dsProject/dsLightRag/Test/TestCrawl.py

# 详解（一）Python + Selenium 批量采集微信公众号，搭建自己的微信公众号每日AI简报，告别信息焦虑
# https://blog.csdn.net/k352733625/article/details/149222945
import logging
import re

import requests

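# NOTE: the script below drives Chrome through chromedriver. The two Firefox
# steps listed here are only needed if the driver is switched to Firefox.
# 1. Install the latest Firefox:
# https://www.firefox.com.cn/download/#product-desktop-release

# 2. Download the latest geckodriver:
# https://splinter-docs-zh-cn.readthedocs.io/zh/latest/drivers/firefox.html
# https://github.com/mozilla/geckodriver/releases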

"""
# 查看selenium版本
pip show selenium
4.34.2

# 查看Chrome浏览器版本
chrome://version/
138.0.7204.101 (正式版本) （64 位）

# 下载驱动包
https://googlechromelabs.github.io/chrome-for-testing/
https://storage.googleapis.com/chrome-for-testing-public/138.0.7204.94/win64/chromedriver-win64.zip
"""
import time
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service as ChromeService
import json


# Location of the chromedriver binary started by Selenium.
CHROMEDRIVER_PATH = r"C:\Windows\System32\chromedriver.exe"
# Page opened in the browser for the QR-code login.
MP_LOGIN_URL = 'https://mp.weixin.qq.com/'
# Seconds to wait after opening the login page before prompting for the scan.
PAGE_LOAD_WAIT_SECONDS = 2
# Seconds given to the operator to scan the QR code with a phone.
QR_SCAN_WAIT_SECONDS = 20
# Seconds after which the token-redirect HTTP request is abandoned.
REQUEST_TIMEOUT_SECONDS = 10


def extract_token(redirect_url):
    """Return the numeric ``token`` value embedded in *redirect_url*.

    Args:
        redirect_url: Value of a ``Location`` response header, or ``None`` /
            an empty string when the header is absent.

    Returns:
        The first run of digits that follows ``token=`` as a string, or
        ``None`` when the URL is empty or contains no such parameter.
    """
    if not redirect_url:
        return None
    found = re.search(r'token=(\d+)', redirect_url)
    return found.group(1) if found else None


def collect_login_cookies():
    """Open the login page in Chrome, wait for the QR scan and return the cookies.

    Returns:
        A dict mapping cookie name to cookie value, as reported by the browser
        after the login page has been reloaded.
    """
    logging.info("启动浏览器，打开微信公众号登录界面")
    # Headless mode is deliberately not enabled: the QR code has to be visible
    # on screen so it can be scanned with a phone.
    options = Options()
    service = ChromeService(executable_path=CHROMEDRIVER_PATH)
    driver = webdriver.Chrome(service=service, options=options)
    try:
        driver.get(MP_LOGIN_URL)
        time.sleep(PAGE_LOAD_WAIT_SECONDS)
        logging.info("请拿手机扫码二维码登录公众号")
        time.sleep(QR_SCAN_WAIT_SECONDS)
        # Reload the page: per the original author's note, once logged in this
        # URL serves the account back-end home page, and the cookies are read
        # from that state.
        driver.get(MP_LOGIN_URL)
        return {item['name']: item['value'] for item in driver.get_cookies()}
    finally:
        # Always shut down the browser and the chromedriver process, even when
        # one of the steps above raises.
        driver.quit()


def main():
    """Log in via QR code, then resolve and print the account token.

    Raises:
        SystemExit: With status 1 when the ``slave_sid`` cookie is missing
            after the scan window.
    """
    # Without this the root logger stays at WARNING and every logging.info
    # call in this script (including the scan prompt) is dropped.
    logging.basicConfig(level=logging.INFO)

    post = collect_login_cookies()

    # The script treats the presence of this cookie as proof of a successful login.
    if "slave_sid" not in post:
        logging.error("登录公众号失败，获取cookie失败")
        raise SystemExit(1)

    # Request the home page without following redirects; the token is read
    # from the redirect target in the Location header.
    url = 'https://mp.weixin.qq.com'
    response = requests.get(
        url=url,
        allow_redirects=False,
        cookies=post,
        timeout=REQUEST_TIMEOUT_SECONDS,
    )
    redirect_url = response.headers.get("Location")
    if redirect_url is None:
        logging.warning("未获取到重定向URL，无法提取token")
        return
    print("重定向URL:", redirect_url)

    token = extract_token(redirect_url)
    if token is None:
        logging.warning("重定向URL中未找到token")
        return
    print("获取到的token:", token)
    logging.info("微信token:" + token)


if __name__ == '__main__':
    main()