# NOTE(review): removed repository-hosting web-page residue that was pasted
# into this file (topic-limit hint, "271 lines / 13 KiB", ambiguous-Unicode
# warning). It was not Python and made the file unparseable.
# 详解Python + Selenium 批量采集微信公众号搭建自己的微信公众号每日AI简报告别信息焦虑
# https://blog.csdn.net/k352733625/article/details/149222945
# 微信爬爬猫---公众号文章抓取代码分析
# https://blog.csdn.net/yajuanpi4899/article/details/121584268
import datetime
import logging
import random
import re
import os
import requests
"""
# 查看selenium版本
pip show selenium
4.34.2
# 查看Chrome浏览器版本
chrome://version/
138.0.7204.101 (正式版本) 64 位)
# 下载驱动包
https://googlechromelabs.github.io/chrome-for-testing/
https://storage.googleapis.com/chrome-for-testing-public/138.0.7204.94/win64/chromedriver-win64.zip
"""
import time
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service as ChromeService
import json
if __name__ == '__main__':
    # WeChat Official Account crawler:
    #   1. log in to mp.weixin.qq.com via Selenium (QR-code scan) and capture cookies
    #   2. recover the admin `token` from the post-login redirect URL
    #   3. use the searchbiz / appmsg JSON endpoints to list articles of target accounts
    #   4. fetch each article page and save title/body under ./Test/Logs
    #
    # FIX: configure logging — without this every logging.info() below is
    # silently dropped (root logger defaults to WARNING).
    logging.basicConfig(level=logging.INFO)

    # Login cookies collected from the Selenium session (name -> value).
    cookies = {}

    # Request headers for the mp.weixin.qq.com JSON endpoints — the
    # WeChat built-in browser User-Agent avoids being rejected as a plain bot.
    header = {
        "HOST": "mp.weixin.qq.com",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.116 Safari/537.36 QBCore/4.0.1301.400 QQBrowser/9.0.2524.400 Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2875.116 Safari/537.36 NetType/WIFI MicroMessenger/7.0.20.1781(0x6700143B) WindowsWechat(0x63010200)",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
        "Accept-Encoding": "gzip, deflate, br",
        "Accept-Language": "zh-CN,zh;q=0.8,en-US;q=0.6,en;q=0.5;q=0.4",
        "Connection": "keep-alive"
    }

    # Launch Chrome and open the Official Account login page.
    logging.info("启动浏览器,打开微信公众号登录界面")
    options = Options()
    # options.add_argument('-headless')  # headless mode; keep commented out while debugging
    # Spoof the WeChat built-in browser User-Agent for the Selenium session too.
    options.add_argument('--user-agent=Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.116 Safari/537.36 QBCore/4.0.1301.400 QQBrowser/9.0.2524.400 Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2875.116 Safari/537.36 NetType/WIFI MicroMessenger/7.0.20.1781(0x6700143B) WindowsWechat(0x63010200)')
    service = ChromeService(executable_path=r"C:\Windows\System32\chromedriver.exe")
    driver = webdriver.Chrome(service=service, options=options)
    # FIX: try/finally guarantees the browser is closed even when the
    # script aborts with an exception (the original leaked the Chrome process).
    try:
        # Open the Official Account login page.
        driver.get('https://mp.weixin.qq.com/')
        time.sleep(2)
        # Scan the QR code with the phone to log in; allow 20 s for the scan.
        logging.info("请拿手机扫码二维码登录公众号")
        time.sleep(20)
        # Reload after login — now we land on the admin dashboard and the
        # session cookies are available from the driver.
        driver.get('https://mp.weixin.qq.com/')
        # get_cookies() returns a list of dicts; flatten into name -> value.
        for cookie_item in driver.get_cookies():
            cookies[cookie_item['name']] = cookie_item['value']
        # "slave_sid" is only present after a successful login.
        if "slave_sid" not in cookies:
            logging.info("登录公众号失败获取cookie失败")
            raise SystemExit(1)

        # Recover the admin token: an authenticated request to the root URL
        # redirects to .../home?...&token=NNNN — read it from the Location header.
        url = 'https://mp.weixin.qq.com'
        response = requests.get(url=url, allow_redirects=False, cookies=cookies)
        token = None
        if 'Location' in response.headers:
            redirect_url = response.headers.get("Location")
            print("重定向URL:", redirect_url)
            token_match = re.findall(r'token=(\d+)', redirect_url)
            if token_match:
                token = token_match[0]
                print("获取到的token:", token)
                logging.info("微信token:" + token)
        # FIX: the original fell through with `token` undefined (NameError later)
        # when the redirect or the token= parameter was missing.
        if token is None:
            logging.info("获取token失败")
            raise SystemExit(1)

        article_urls = []
        # Target accounts: display name + account id.
        gzlist = [{"account_name": "长春教育八卦阵", "account_id": "jybg100"}]
        for item in gzlist:
            account_name = item["account_name"]
            account_id = item["account_id"]
            # Account-search endpoint; needs token, a random number and the query.
            search_url = 'https://mp.weixin.qq.com/cgi-bin/searchbiz?'
            query_id = {
                'action': 'search_biz',
                'token': token,
                'lang': 'zh_CN',
                'f': 'json',
                'ajax': '1',
                'random': random.random(),
                'query': account_name,
                'begin': '0',
                'count': '5'
            }
            search_response = requests.get(search_url, cookies=cookies, headers=header, params=query_id)
            # Take the first search hit and keep its fakeid — the article-list
            # endpoint identifies the account by this field.
            lists = search_response.json().get('list')[0]
            fakeid = lists.get('fakeid')
            logging.info("fakeid:" + fakeid)

            # Article-list endpoint for the account identified by fakeid.
            appmsg_url = 'https://mp.weixin.qq.com/cgi-bin/appmsg?'
            query_id_data = {
                'token': token,
                'lang': 'zh_CN',
                'f': 'json',
                'ajax': '1',
                'random': random.random(),
                'action': 'list_ex',
                'begin': '0',  # paging offset: +5 per page
                'count': '5',
                'query': '',
                'fakeid': fakeid,
                'type': '9'
            }
            query_fakeid_response = requests.get(appmsg_url, cookies=cookies, headers=header, params=query_id_data)
            fakeid_list = query_fakeid_response.json().get('app_msg_list')
            # FIX: renamed the loop variable (was `item`, shadowing the outer
            # account loop variable).
            for msg in fakeid_list:
                new_article = {
                    'title': msg.get('title'),
                    'article_url': msg.get('link'),
                    'account_id': account_id,
                    'account_name': account_name,
                    'publish_time': datetime.datetime.fromtimestamp(int(msg.get("update_time"))).strftime('%Y-%m-%d %H:%M:%S'),
                    'collection_time': datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
                }
                print("new_article:", new_article)
                # FIX: logging.info was called print-style with a second
                # positional arg — use lazy %-formatting instead.
                logging.info("new_article: %s", new_article)
                article_urls.append(msg.get('link'))
            time.sleep(1)

        # Make sure the output directory exists.
        logs_dir = "./Test/Logs"
        os.makedirs(logs_dir, exist_ok=True)

        for article_url in article_urls:
            print("正在爬取文章:" + article_url)
            try:
                # Headers mimicking the WeChat in-app (Android) browser for the
                # article pages themselves.
                wechat_headers = {
                    "User-Agent": "Mozilla/5.0 (Linux; Android 10; MI 8 Build/QKQ1.190828.002; wv) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/83.0.4103.101 Mobile Safari/537.36 XWEB/1768 MMWEBSDK/20210302 MMWEBID/6253 MicroMessenger/8.0.2.1860(0x28000234) Process/toolsmp WeChat/arm64 Weixin NetType/WIFI Language/zh_CN ABI/arm64",
                    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
                    "Accept-Encoding": "gzip, deflate",
                    "Accept-Language": "zh-CN,zh;q=0.9,en-US;q=0.8,en;q=0.7",
                    "X-Requested-With": "com.tencent.mm",
                    "Referer": "https://mp.weixin.qq.com/"
                }
                # Inject the headers into the Selenium session via CDP, then load the article.
                driver.execute_cdp_cmd('Network.setExtraHTTPHeaders', {'headers': wechat_headers})
                driver.get(article_url)
                # Give the page time to fully render.
                time.sleep(5)
                # Some articles refuse non-WeChat clients; fall back to requests.
                if "请在微信客户端中打开链接" in driver.page_source or "请在微信中打开此链接" in driver.page_source:
                    print(f"文章需要在微信中打开尝试使用requests直接获取{article_url}")
                    response = requests.get(article_url, headers=wechat_headers, cookies=cookies)
                    if "请在微信客户端中打开链接" in response.text or "请在微信中打开此链接" in response.text:
                        print(f"使用requests仍然无法获取跳过此文章{article_url}")
                        continue
                    else:
                        # Derive a file name from the article's sn= parameter.
                        filename = f"article_{article_url.split('sn=')[1][:10] if 'sn=' in article_url else 'unknown'}"
                        # FIX: the original wrote to a literal "(unknown).html",
                        # overwriting the same file for every article.
                        save_path = f"{logs_dir}/{filename}.html"
                        with open(save_path, "w", encoding="utf-8") as f:
                            f.write(response.text)
                        print(f"已保存文章HTML内容{save_path}")
                        continue
                try:
                    # Try several known title selectors — the markup varies
                    # between article templates.
                    title_selectors = [
                        '//h1[@class="rich_media_title"]',
                        '//h1[@id="activity-name"]',
                        '//h2[@class="rich_media_title"]',
                        '//div[@class="rich_media_content"]//h1',
                        '//div[@id="js_article"]//h1'
                    ]
                    title = None
                    for selector in title_selectors:
                        try:
                            title = driver.find_element('xpath', selector).text.strip()
                            if title:
                                break
                        except Exception:  # FIX: was a bare except
                            continue
                    if not title:
                        # Fall back to the window title.
                        title = driver.title.replace(" - 微信公众号", "").strip()
                    # Same approach for the article body.
                    content_selectors = [
                        '//div[@class="rich_media_content"]',
                        '//div[@id="js_content"]',
                        '//div[@class="rich_media_wrp"]'
                    ]
                    content = None
                    for selector in content_selectors:
                        try:
                            content = driver.find_element('xpath', selector).text.strip()
                            if content:
                                break
                        except Exception:  # FIX: was a bare except
                            continue
                    if not content:
                        # As a last resort keep the raw page source.
                        content = "无法提取正文内容,保存页面源码:\n" + driver.page_source
                    if not title:
                        title = "未知标题_" + article_url.split("sn=")[1][:10] if "sn=" in article_url else "未知标题"
                    # Strip characters that are illegal in Windows file names.
                    filename = re.sub(r'[\\/:*?"<>|]', '_', title)
                    # FIX: use the sanitized title, not the literal "(unknown)".
                    save_path = f"{logs_dir}/{filename}.txt"
                    with open(save_path, "w", encoding="utf-8") as f:
                        f.write(f"标题:{title}\n\n")
                        f.write(f"链接:{article_url}\n\n")
                        f.write(f"内容:\n{content}")
                    print(f"文章《{title}》保存成功:{save_path}")
                except Exception as e:
                    print(f"提取文章内容失败:{str(e)}")
                    # Keep the page source for post-mortem analysis.
                    error_filename = "error_" + article_url.split("sn=")[1][:10] if "sn=" in article_url else "error_page"
                    error_path = f"{logs_dir}/{error_filename}.html"
                    with open(error_path, "w", encoding="utf-8") as f:
                        f.write(driver.page_source)
                    print(f"已保存页面源码到:{error_path}")
                # Random pause to avoid being rate-limited / banned.
                time.sleep(random.uniform(3, 7))
            except Exception as e:
                print(f"爬取文章失败:{article_url},错误信息:{str(e)}")
                continue
    finally:
        # Always release the browser, even on failure.
        driver.quit()
    print("所有文章爬取完成!")