From 76d0c09bddc2ac2e4909b570e4c0c055ce2b20df Mon Sep 17 00:00:00 2001 From: HuangHai <10402852@qq.com> Date: Wed, 16 Jul 2025 09:09:21 +0800 Subject: [PATCH] 'commit' --- dsLightRag/.idea/dsLightRag.iml | 2 +- dsLightRag/.idea/misc.xml | 2 +- .../Test/Test/Logs/article_bfc50bb7d7.html | 162 ------------------ dsLightRag/WxGzh/T2_CollectArticle.py | 41 ++--- 4 files changed, 19 insertions(+), 188 deletions(-) delete mode 100644 dsLightRag/Test/Test/Logs/article_bfc50bb7d7.html diff --git a/dsLightRag/.idea/dsLightRag.iml b/dsLightRag/.idea/dsLightRag.iml index 4ceb6f94..880d61c1 100644 --- a/dsLightRag/.idea/dsLightRag.iml +++ b/dsLightRag/.idea/dsLightRag.iml @@ -2,7 +2,7 @@ - + diff --git a/dsLightRag/.idea/misc.xml b/dsLightRag/.idea/misc.xml index 0bad5868..0f9b3bc1 100644 --- a/dsLightRag/.idea/misc.xml +++ b/dsLightRag/.idea/misc.xml @@ -3,5 +3,5 @@ - + \ No newline at end of file diff --git a/dsLightRag/Test/Test/Logs/article_bfc50bb7d7.html b/dsLightRag/Test/Test/Logs/article_bfc50bb7d7.html deleted file mode 100644 index cd460649..00000000 --- a/dsLightRag/Test/Test/Logs/article_bfc50bb7d7.html +++ /dev/null @@ -1,162 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - - -
- -
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - diff --git a/dsLightRag/WxGzh/T2_CollectArticle.py b/dsLightRag/WxGzh/T2_CollectArticle.py index a6e073bb..de048fb6 100644 --- a/dsLightRag/WxGzh/T2_CollectArticle.py +++ b/dsLightRag/WxGzh/T2_CollectArticle.py @@ -4,21 +4,34 @@ # 微信爬爬猫---公众号文章抓取代码分析 # https://blog.csdn.net/yajuanpi4899/article/details/121584268 +""" +# 查看selenium版本 +pip show selenium +4.34.2 +# 查看Chrome浏览器版本 +chrome://version/ +138.0.7204.101 (正式版本) (64 位) + +# 下载驱动包 +https://googlechromelabs.github.io/chrome-for-testing/ +https://storage.googleapis.com/chrome-for-testing-public/138.0.7204.94/win64/chromedriver-win64.zip +""" import asyncio import datetime import json import logging import random import re - +import time +from selenium.webdriver.chrome.options import Options +from selenium.webdriver.chrome.service import Service as ChromeService import requests - from Util.PostgreSQLUtil import init_postgres_pool from Util.WxGzhUtil import init_wechat_browser, get_article_content # 删除重复的日志配置,只保留以下内容 -logger = logging.getLogger('WeiXinGongZhongHao') +logger = logging.getLogger('WxGzh') logger.setLevel(logging.INFO) # 确保只添加一个handler @@ -27,7 +40,6 @@ if not logger.handlers: handler.setFormatter(logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')) logger.addHandler(handler) - async def get_wechat_sources(): """从t_wechat_source表获取微信公众号列表""" try: @@ -39,32 +51,13 @@ async def get_wechat_sources(): await pool.close() -""" -# 查看selenium版本 -pip show selenium -4.34.2 - -# 查看Chrome浏览器版本 -chrome://version/ -138.0.7204.101 (正式版本) (64 位) - -# 下载驱动包 -https://googlechromelabs.github.io/chrome-for-testing/ -https://storage.googleapis.com/chrome-for-testing-public/138.0.7204.94/win64/chromedriver-win64.zip -""" -import time -from selenium.webdriver.chrome.options import Options -from selenium.webdriver.chrome.service import Service as ChromeService - async def is_article_exist(pool, article_url): """检查文章URL是否已存在数据库中""" try: async with pool.acquire() as conn: row = await conn.fetchrow(''' - SELECT 1 - FROM t_wechat_articles - WHERE url = $1 LIMIT 1 + SELECT 1 FROM t_wechat_articles WHERE url = $1 LIMIT 1 ''', article_url) return row is not None except Exception as e: