'commit'

1 week ago · 69e8e833e6
parent 9ed09f13d6
commit 69e8e833e6
2 changed files with 295 additions and 15 deletions
--- a/dsLightRag/Test/Test/Logs/article_bfc50bb7d7.html
+++ b/dsLightRag/Test/Test/Logs/article_bfc50bb7d7.html
--- a/dsLightRag/Test/TestCrawl.py
+++ b/dsLightRag/Test/TestCrawl.py
@ -7,20 +7,10 @@ import datetime
 import logging
 import random
 import re
 import os
 import requests
 # 1、安装Firefox软件【最新】
 # https://www.firefox.com.cn/download/#product-desktop-release
 # 2、下载geckodriver驱动【最新】
 # https://splinter-docs-zh-cn.readthedocs.io/zh/latest/drivers/firefox.html
 # https://github.com/mozilla/geckodriver/releases
 # 3、Python爬虫实战系列：微信公众号文章爬取的5种技术方案总结及代码示例！
 # 方案5：微信公众号后台引用链接方式爬取
 # https://blog.csdn.net/Python_trys/article/details/146506009
 """
 # 查看selenium版本
 pip show selenium
@ -43,17 +33,25 @@ import json
 if __name__ == '__main__':
    # 定义一个空的字典，存放cookies内容
    cookies = {}
-    # 设置headers
+    # 设置headers - 使用微信内置浏览器的User-Agent
    header = {
        "HOST": "mp.weixin.qq.com",
-        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:53.0) Gecko/20100101 Firefox/53.0"
+        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.116 Safari/537.36 QBCore/4.0.1301.400 QQBrowser/9.0.2524.400 Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2875.116 Safari/537.36 NetType/WIFI MicroMessenger/7.0.20.1781(0x6700143B) WindowsWechat(0x63010200)",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
        "Accept-Encoding": "gzip, deflate, br",
        "Accept-Language": "zh-CN,zh;q=0.8,en-US;q=0.6,en;q=0.5;q=0.4",
        "Connection": "keep-alive"
    }
    # 用webdriver启动谷歌浏览器
    logging.info("启动浏览器，打开微信公众号登录界面")
    options = Options()
-    options.add_argument('-headless')  # 无头参数
+    # options.add_argument('-headless')  # 无头参数，调试时可以注释掉
    # 设置微信内置浏览器的User-Agent
    options.add_argument('--user-agent=Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.116 Safari/537.36 QBCore/4.0.1301.400 QQBrowser/9.0.2524.400 Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2875.116 Safari/537.36 NetType/WIFI MicroMessenger/7.0.20.1781(0x6700143B) WindowsWechat(0x63010200)')
    service = ChromeService(executable_path=r"C:\Windows\System32\chromedriver.exe")
-    driver = webdriver.Chrome(service=service)
+    driver = webdriver.Chrome(service=service, options=options)
    # 打开微信公众号登录页面
    driver.get('https://mp.weixin.qq.com/')
    # 等待5秒钟
@ -143,10 +141,130 @@ if __name__ == '__main__':
                'publish_time': datetime.datetime.fromtimestamp(int(item.get("update_time"))).strftime('%Y-%m-%d %H:%M:%S'),
                'collection_time': datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
            }
            # Echo the freshly built article record to the console.
            print("new_article:", new_article)
            # logging formats lazily as `msg % args`, so the record needs its own
            # %s placeholder; without one the call triggers a formatting error
            # inside the logging module and the record is never written.
            logging.info("new_article: %s", new_article)
            # Remember the article link so its body can be fetched afterwards.
            article_urls.append(item.get('link'))
            # Short pause between records to keep the request rate low.
            time.sleep(1)
    # Make sure the output directory exists. The path is relative to the
    # current working directory; exist_ok avoids the check-then-create race.
    logs_dir = "./Test/Logs"
    os.makedirs(logs_dir, exist_ok=True)

    # Request headers imitating the WeChat in-app browser on Android. They are
    # loop-invariant, so they are built once and shared by the browser (via
    # CDP) and by the plain-HTTP fallback below.
    wechat_headers = {
        "User-Agent": "Mozilla/5.0 (Linux; Android 10; MI 8 Build/QKQ1.190828.002; wv) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/83.0.4103.101 Mobile Safari/537.36 XWEB/1768 MMWEBSDK/20210302 MMWEBID/6253 MicroMessenger/8.0.2.1860(0x28000234) Process/toolsmp WeChat/arm64 Weixin NetType/WIFI Language/zh_CN ABI/arm64",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
        "Accept-Encoding": "gzip, deflate",
        "Accept-Language": "zh-CN,zh;q=0.9,en-US;q=0.8,en;q=0.7",
        "X-Requested-With": "com.tencent.mm",
        "Referer": "https://mp.weixin.qq.com/"
    }
    # Page texts that indicate the article refuses to render outside WeChat.
    wechat_only_markers = ("请在微信客户端中打开链接", "请在微信中打开此链接")
    # Upper bound (seconds) for the fallback HTTP request so a stalled
    # connection cannot hang the whole crawl.
    request_timeout = 30

    # XPath candidates for the article title, tried in order.
    title_selectors = [
        '//h1[@class="rich_media_title"]',
        '//h1[@id="activity-name"]',
        '//h2[@class="rich_media_title"]',
        '//div[@class="rich_media_content"]//h1',
        '//div[@id="js_article"]//h1'
    ]
    # XPath candidates for the article body, tried in order.
    content_selectors = [
        '//div[@class="rich_media_content"]',
        '//div[@id="js_content"]',
        '//div[@class="rich_media_wrp"]'
    ]

    try:
        for article_url in article_urls:
            # The link list may contain empty entries (links are collected
            # with dict.get); skip them instead of crashing on concatenation.
            if not article_url:
                print("跳过无效的文章链接")
                continue
            print("正在爬取文章：" + article_url)
            # First 10 characters after "sn=" serve as a short id for
            # fallback file names; empty when the URL carries no such part.
            article_id = article_url.split("sn=")[1][:10] if "sn=" in article_url else ""
            try:
                # Open the article in the browser with the WeChat-like headers.
                driver.execute_cdp_cmd('Network.setExtraHTTPHeaders', {'headers': wechat_headers})
                driver.get(article_url)
                # Fixed wait so the page has time to finish loading.
                time.sleep(5)

                page_source = driver.page_source
                if any(marker in page_source for marker in wechat_only_markers):
                    # The browser was rejected; retry with a plain HTTP
                    # request that reuses the login cookies.
                    print(f"文章需要在微信中打开，尝试使用requests直接获取：{article_url}")
                    response = requests.get(
                        article_url,
                        headers=wechat_headers,
                        cookies=cookies,
                        timeout=request_timeout,
                    )
                    if any(marker in response.text for marker in wechat_only_markers):
                        print(f"使用requests仍然无法获取，跳过此文章：{article_url}")
                    else:
                        # Keep the raw HTML returned by the fallback request.
                        filename = f"article_{article_id or 'unknown'}"
                        save_path = f"{logs_dir}/{filename}.html"
                        with open(save_path, "w", encoding="utf-8") as f:
                            f.write(response.text)
                        print(f"已保存文章HTML内容：{save_path}")
                else:
                    try:
                        # Use the first selector that yields non-empty text.
                        title = None
                        for selector in title_selectors:
                            try:
                                title = driver.find_element('xpath', selector).text.strip()
                            except Exception:
                                # Selector did not match; try the next one.
                                # (Exception, not bare except, so Ctrl-C still works.)
                                continue
                            if title:
                                break
                        if not title:
                            # No selector produced a title; fall back to the
                            # browser tab title.
                            title = driver.title.replace(" - 微信公众号", "").strip()

                        content = None
                        for selector in content_selectors:
                            try:
                                content = driver.find_element('xpath', selector).text.strip()
                            except Exception:
                                continue
                            if content:
                                break
                        if not content:
                            # Body could not be extracted; keep the page
                            # source so the article is not lost.
                            content = "无法提取正文内容，保存页面源码：\n" + driver.page_source

                        if not title:
                            title = f"未知标题_{article_id}" if article_id else "未知标题"
                        # Replace characters that are illegal in file names,
                        # including line breaks and tabs inside the title.
                        filename = re.sub(r'[\\/:*?"<>|\r\n\t]', '_', title)

                        save_path = f"{logs_dir}/{filename}.txt"
                        with open(save_path, "w", encoding="utf-8") as f:
                            f.write(f"标题：{title}\n\n")
                            f.write(f"链接：{article_url}\n\n")
                            f.write(f"内容：\n{content}")
                        print(f"文章《{title}》保存成功：{save_path}")
                    except Exception as e:
                        print(f"提取文章内容失败：{str(e)}")
                        # Dump the page source so the failure can be analysed.
                        error_filename = f"error_{article_id}" if article_id else "error_page"
                        error_path = f"{logs_dir}/{error_filename}.html"
                        with open(error_path, "w", encoding="utf-8") as f:
                            f.write(driver.page_source)
                        print(f"已保存页面源码到：{error_path}")

                # Randomized delay after every processed article (browser and
                # fallback path alike) to avoid being rate-limited.
                time.sleep(random.uniform(3, 7))
            except Exception as e:
                # One failing article must not abort the remaining ones.
                print(f"爬取文章失败：{article_url}，错误信息：{str(e)}")
                continue
    finally:
        # Always shut the browser down, even when the crawl is interrupted.
        driver.quit()
    print("所有文章爬取完成！")