From b4d13486ac77d6810b022f1edcc88f01f119c706 Mon Sep 17 00:00:00 2001 From: "Kalman.CHENG" <123204464@qq.com> Date: Wed, 16 Jul 2025 08:32:10 +0800 Subject: [PATCH 01/17] =?UTF-8?q?=E6=95=99=E8=82=B2=E5=9E=82=E7=9B=B4?= =?UTF-8?q?=E9=A2=86=E5=9F=9F=E5=A4=A7=E6=A8=A1=E5=9E=8B=E5=B9=B3=E5=8F=B0?= =?UTF-8?q?=20modify=20by=20Kalman.CHENG=20=E2=98=86?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../{QuestionController.py => TeachingModelController.py} | 2 +- dsAiTeachingModel/main.py | 2 +- dsAiTeachingModel/routes/__init__.py | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) rename dsAiTeachingModel/api/controller/{QuestionController.py => TeachingModelController.py} (90%) diff --git a/dsAiTeachingModel/api/controller/QuestionController.py b/dsAiTeachingModel/api/controller/TeachingModelController.py similarity index 90% rename from dsAiTeachingModel/api/controller/QuestionController.py rename to dsAiTeachingModel/api/controller/TeachingModelController.py index 48b7ed39..881472e5 100644 --- a/dsAiTeachingModel/api/controller/QuestionController.py +++ b/dsAiTeachingModel/api/controller/TeachingModelController.py @@ -1,4 +1,4 @@ -# routes/QuestionController.py +# routes/TeachingModelController.py from fastapi import APIRouter, Request, Response, Depends from auth.dependencies import * diff --git a/dsAiTeachingModel/main.py b/dsAiTeachingModel/main.py index 8f99a901..d7b3cfdc 100644 --- a/dsAiTeachingModel/main.py +++ b/dsAiTeachingModel/main.py @@ -52,7 +52,7 @@ app.include_router(theme_router, prefix="/api/theme", tags=["theme"]) # 文档相关 app.include_router(document_router, prefix="/api/document", tags=["document"]) # 问题相关(大模型应用) -app.include_router(question_router, prefix="/api/question", tags=["question"]) +app.include_router(teaching_model_router, prefix="/api/teaching/model", tags=["question"]) # 字典相关(Dm) app.include_router(dm_router, prefix="/api/dm", tags=["dm"]) # 测试相关 diff --git a/dsAiTeachingModel/routes/__init__.py b/dsAiTeachingModel/routes/__init__.py index 4fa720b9..f985e11c 100644 --- a/dsAiTeachingModel/routes/__init__.py +++ b/dsAiTeachingModel/routes/__init__.py @@ -2,10 +2,10 @@ from api.controller.LoginController import router as login_router from api.controller.DocumentController import router as document_router from api.controller.ThemeController import router as theme_router -from api.controller.QuestionController import router as question_router +from api.controller.TeachingModelController import router as teaching_model_router from api.controller.TestController import router as test_router from api.controller.DmController import router as dm_router from api.controller.UserController import router as user_router # 导出所有路由 -__all__ = ["login_router", "document_router", "theme_router", "question_router", "dm_router", "test_router", "user_router"] +__all__ = ["login_router", "document_router", "theme_router", "teaching_model_router", "dm_router", "test_router", "user_router"] From ea48a15de7d3b4eb794b41cc2c713cb0c8ed815d Mon Sep 17 00:00:00 2001 From: "Kalman.CHENG" <123204464@qq.com> Date: Wed, 16 Jul 2025 08:37:02 +0800 Subject: [PATCH 02/17] =?UTF-8?q?=E6=95=99=E8=82=B2=E5=9E=82=E7=9B=B4?= =?UTF-8?q?=E9=A2=86=E5=9F=9F=E5=A4=A7=E6=A8=A1=E5=9E=8B=E5=B9=B3=E5=8F=B0?= =?UTF-8?q?=20modify=20by=20Kalman.CHENG=20=E2=98=86?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- dsAiTeachingModel/{config => config1}/Config.py | 0 dsAiTeachingModel/{config => config1}/__init__.py | 0 2 files changed, 0 insertions(+), 0 deletions(-) rename dsAiTeachingModel/{config => config1}/Config.py (100%) rename dsAiTeachingModel/{config => config1}/__init__.py (100%) diff --git a/dsAiTeachingModel/config/Config.py b/dsAiTeachingModel/config1/Config.py similarity index 100% rename from dsAiTeachingModel/config/Config.py rename to dsAiTeachingModel/config1/Config.py diff --git a/dsAiTeachingModel/config/__init__.py b/dsAiTeachingModel/config1/__init__.py similarity index 100% rename from dsAiTeachingModel/config/__init__.py rename to dsAiTeachingModel/config1/__init__.py From 9ecaf901641b502992be68c4c2a525d654e69537 Mon Sep 17 00:00:00 2001 From: "Kalman.CHENG" <123204464@qq.com> Date: Wed, 16 Jul 2025 08:37:30 +0800 Subject: [PATCH 03/17] =?UTF-8?q?=E6=95=99=E8=82=B2=E5=9E=82=E7=9B=B4?= =?UTF-8?q?=E9=A2=86=E5=9F=9F=E5=A4=A7=E6=A8=A1=E5=9E=8B=E5=B9=B3=E5=8F=B0?= =?UTF-8?q?=20modify=20by=20Kalman.CHENG=20=E2=98=86?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- dsAiTeachingModel/{config1 => Config}/Config.py | 0 dsAiTeachingModel/{config1 => Config}/__init__.py | 0 2 files changed, 0 insertions(+), 0 deletions(-) rename dsAiTeachingModel/{config1 => Config}/Config.py (100%) rename dsAiTeachingModel/{config1 => Config}/__init__.py (100%) diff --git a/dsAiTeachingModel/config1/Config.py b/dsAiTeachingModel/Config/Config.py similarity index 100% rename from dsAiTeachingModel/config1/Config.py rename to dsAiTeachingModel/Config/Config.py diff --git a/dsAiTeachingModel/config1/__init__.py b/dsAiTeachingModel/Config/__init__.py similarity index 100% rename from dsAiTeachingModel/config1/__init__.py rename to dsAiTeachingModel/Config/__init__.py From 75c06f31eb7974bef57066b6a11d15e785670b54 Mon Sep 17 00:00:00 2001 From: "Kalman.CHENG" <123204464@qq.com> Date: Wed, 16 Jul 2025 09:06:08 +0800 Subject: [PATCH 04/17] =?UTF-8?q?=E6=95=99=E8=82=B2=E5=9E=82=E7=9B=B4?= =?UTF-8?q?=E9=A2=86=E5=9F=9F=E5=A4=A7=E6=A8=A1=E5=9E=8B=E5=B9=B3=E5=8F=B0?= =?UTF-8?q?=20modify=20by=20Kalman.CHENG=20=E2=98=86?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- dsAiTeachingModel/main.py | 2 +- dsAiTeachingModel/tasks/BackgroundTasks.py | 6 ++++-- dsAiTeachingModel/utils/Database.py | 2 +- 3 files changed, 6 insertions(+), 4 deletions(-) diff --git a/dsAiTeachingModel/main.py b/dsAiTeachingModel/main.py index d7b3cfdc..72613efa 100644 --- a/dsAiTeachingModel/main.py +++ b/dsAiTeachingModel/main.py @@ -23,7 +23,7 @@ async def lifespan(app: FastAPI): await init_database() # 启动异步任务 - asyncio.create_task(train_document_task()) + # asyncio.create_task(train_document_task()) yield await shutdown_database() diff --git a/dsAiTeachingModel/tasks/BackgroundTasks.py b/dsAiTeachingModel/tasks/BackgroundTasks.py index d43dc190..b5eb4a24 100644 --- a/dsAiTeachingModel/tasks/BackgroundTasks.py +++ b/dsAiTeachingModel/tasks/BackgroundTasks.py @@ -12,11 +12,13 @@ WORKING_DIR = f"./output" # 后台任务,监控是否有新的未训练的文档进行训练 async def train_document_task(): print("线程5秒后开始运行【监控是否有新的未训练的文档进行训练】") + num = 1 await asyncio.sleep(5) # 使用 asyncio.sleep 而不是 time.sleep # 这里放置你的线程逻辑 while True: # 这里可以放置你的线程要执行的代码 - logging.info("开始查询是否有未训练的文档") + logging.info("开始查询是否有未训练的文档:" + str(num)) + num = num + 1 no_train_document_sql: str = " SELECT * FROM t_ai_teaching_model_document WHERE is_deleted = 0 and train_flag = 0 ORDER BY create_time DESC" no_train_document_result = await find_by_sql(no_train_document_sql, ()) if not no_train_document_result: @@ -49,4 +51,4 @@ async def train_document_task(): # execute_sql(update_sql) # 添加适当的等待时间,避免频繁查询 - await asyncio.sleep(60) # 每分钟查询一次 + await asyncio.sleep(60) # 每分钟查询一次 diff --git a/dsAiTeachingModel/utils/Database.py b/dsAiTeachingModel/utils/Database.py index 4ac15243..4010390d 100644 --- a/dsAiTeachingModel/utils/Database.py +++ b/dsAiTeachingModel/utils/Database.py @@ -17,7 +17,7 @@ async def create_pool(): password=POSTGRES_PASSWORD, database=POSTGRES_DATABASE, min_size=1, # 设置连接池最小连接数 - max_size=100 # 设置连接池最大连接数 + max_size=10 # 设置连接池最大连接数 ) async def get_connection(): From 76d0c09bddc2ac2e4909b570e4c0c055ce2b20df Mon Sep 17 00:00:00 2001 From: HuangHai <10402852@qq.com> Date: Wed, 16 Jul 2025 09:09:21 +0800 Subject: [PATCH 05/17] 'commit' --- dsLightRag/.idea/dsLightRag.iml | 2 +- dsLightRag/.idea/misc.xml | 2 +- .../Test/Test/Logs/article_bfc50bb7d7.html | 162 ------------------ dsLightRag/WxGzh/T2_CollectArticle.py | 41 ++--- 4 files changed, 19 insertions(+), 188 deletions(-) delete mode 100644 dsLightRag/Test/Test/Logs/article_bfc50bb7d7.html diff --git a/dsLightRag/.idea/dsLightRag.iml b/dsLightRag/.idea/dsLightRag.iml index 4ceb6f94..880d61c1 100644 --- a/dsLightRag/.idea/dsLightRag.iml +++ b/dsLightRag/.idea/dsLightRag.iml @@ -2,7 +2,7 @@ - + diff --git a/dsLightRag/.idea/misc.xml b/dsLightRag/.idea/misc.xml index 0bad5868..0f9b3bc1 100644 --- a/dsLightRag/.idea/misc.xml +++ b/dsLightRag/.idea/misc.xml @@ -3,5 +3,5 @@ - + \ No newline at end of file diff --git a/dsLightRag/Test/Test/Logs/article_bfc50bb7d7.html b/dsLightRag/Test/Test/Logs/article_bfc50bb7d7.html deleted file mode 100644 index cd460649..00000000 --- a/dsLightRag/Test/Test/Logs/article_bfc50bb7d7.html +++ /dev/null @@ -1,162 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - - -
- -
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - diff --git a/dsLightRag/WxGzh/T2_CollectArticle.py b/dsLightRag/WxGzh/T2_CollectArticle.py index a6e073bb..de048fb6 100644 --- a/dsLightRag/WxGzh/T2_CollectArticle.py +++ b/dsLightRag/WxGzh/T2_CollectArticle.py @@ -4,21 +4,34 @@ # 微信爬爬猫---公众号文章抓取代码分析 # https://blog.csdn.net/yajuanpi4899/article/details/121584268 +""" +# 查看selenium版本 +pip show selenium +4.34.2 +# 查看Chrome浏览器版本 +chrome://version/ +138.0.7204.101 (正式版本) (64 位) + +# 下载驱动包 +https://googlechromelabs.github.io/chrome-for-testing/ +https://storage.googleapis.com/chrome-for-testing-public/138.0.7204.94/win64/chromedriver-win64.zip +""" import asyncio import datetime import json import logging import random import re - +import time +from selenium.webdriver.chrome.options import Options +from selenium.webdriver.chrome.service import Service as ChromeService import requests - from Util.PostgreSQLUtil import init_postgres_pool from Util.WxGzhUtil import init_wechat_browser, get_article_content # 删除重复的日志配置,只保留以下内容 -logger = logging.getLogger('WeiXinGongZhongHao') +logger = logging.getLogger('WxGzh') logger.setLevel(logging.INFO) # 确保只添加一个handler @@ -27,7 +40,6 @@ if not logger.handlers: handler.setFormatter(logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')) logger.addHandler(handler) - async def get_wechat_sources(): """从t_wechat_source表获取微信公众号列表""" try: @@ -39,32 +51,13 @@ async def get_wechat_sources(): await pool.close() -""" -# 查看selenium版本 -pip show selenium -4.34.2 - -# 查看Chrome浏览器版本 -chrome://version/ -138.0.7204.101 (正式版本) (64 位) - -# 下载驱动包 -https://googlechromelabs.github.io/chrome-for-testing/ -https://storage.googleapis.com/chrome-for-testing-public/138.0.7204.94/win64/chromedriver-win64.zip -""" -import time -from selenium.webdriver.chrome.options import Options -from selenium.webdriver.chrome.service import Service as ChromeService - async def is_article_exist(pool, article_url): """检查文章URL是否已存在数据库中""" try: async with pool.acquire() as conn: row = await conn.fetchrow(''' - SELECT 1 - FROM t_wechat_articles - WHERE url = $1 LIMIT 1 + SELECT 1 FROM t_wechat_articles WHERE url = $1 LIMIT 1 ''', article_url) return row is not None except Exception as e: From 642d3af0eaf0948449dd86778ead371918c553fe Mon Sep 17 00:00:00 2001 From: HuangHai <10402852@qq.com> Date: Wed, 16 Jul 2025 09:10:35 +0800 Subject: [PATCH 06/17] 'commit' --- dsLightRag/WxGzh/T2_CollectArticle.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/dsLightRag/WxGzh/T2_CollectArticle.py b/dsLightRag/WxGzh/T2_CollectArticle.py index de048fb6..d39b3539 100644 --- a/dsLightRag/WxGzh/T2_CollectArticle.py +++ b/dsLightRag/WxGzh/T2_CollectArticle.py @@ -89,7 +89,6 @@ if __name__ == '__main__': content = f.read() # 使用json还原为json对象 cookies = json.loads(content) - # "expiry": 1787106233 # 检查是否有过期时间 expiry = cookies["expiry"] if expiry: @@ -108,7 +107,7 @@ if __name__ == '__main__': logger.info(f"cookies的过期时间一般是4天,cookies过期时间:%s" % expiry_date) options = Options() options.add_argument('-headless') # 无头参数,调试时可以注释掉 - # 设置headers - 使用微信内置浏览器的User-Agent + # 设置headers header = { "HOST": "mp.weixin.qq.com", "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.116 Safari/537.36 QBCore/4.0.1301.400 QQBrowser/9.0.2524.400 Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2875.116 Safari/537.36 NetType/WIFI MicroMessenger/7.0.20.1781(0x6700143B) WindowsWechat(0x63010200)", @@ -121,8 +120,7 @@ if __name__ == '__main__': service = ChromeService(executable_path=r"C:\Windows\System32\chromedriver.exe") # 使用统一的初始化方式 driver = init_wechat_browser() - - # 方法3:使用requests库发送请求获取重定向URL + # 方法:使用requests库发送请求获取重定向URL url = 'https://mp.weixin.qq.com' response = requests.get(url=url, allow_redirects=False, cookies=cookies) if 'Location' in response.headers: @@ -194,7 +192,7 @@ if __name__ == '__main__': article_title = item.get('title') publish_time = datetime.datetime.fromtimestamp(int(item.get("update_time"))) - if '试卷' in article_title: # 过滤掉试卷 + if '试卷' in article_title: # 过滤掉试卷,致知物理中有大量试卷,我做教育资讯的不关心试卷 continue logger.info(f"正在处理文章: {article_title} ({publish_time})") From 2e40111220da01e961c931bd2e802a167ec6e39d Mon Sep 17 00:00:00 2001 From: HuangHai <10402852@qq.com> Date: Wed, 16 Jul 2025 09:11:14 +0800 Subject: [PATCH 07/17] 'commit' --- dsLightRag/WxGzh/T2_CollectArticle.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/dsLightRag/WxGzh/T2_CollectArticle.py b/dsLightRag/WxGzh/T2_CollectArticle.py index d39b3539..23b89728 100644 --- a/dsLightRag/WxGzh/T2_CollectArticle.py +++ b/dsLightRag/WxGzh/T2_CollectArticle.py @@ -40,6 +40,7 @@ if not logger.handlers: handler.setFormatter(logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')) logger.addHandler(handler) + async def get_wechat_sources(): """从t_wechat_source表获取微信公众号列表""" try: @@ -51,13 +52,14 @@ async def get_wechat_sources(): await pool.close() - async def is_article_exist(pool, article_url): """检查文章URL是否已存在数据库中""" try: async with pool.acquire() as conn: row = await conn.fetchrow(''' - SELECT 1 FROM t_wechat_articles WHERE url = $1 LIMIT 1 + SELECT 1 + FROM t_wechat_articles + WHERE url = $1 LIMIT 1 ''', article_url) return row is not None except Exception as e: @@ -125,11 +127,11 @@ if __name__ == '__main__': response = requests.get(url=url, allow_redirects=False, cookies=cookies) if 'Location' in response.headers: redirect_url = response.headers.get("Location") - logger.info(f"重定向URL:%s"%redirect_url) + logger.info(f"重定向URL:%s" % redirect_url) token_match = re.findall(r'token=(\d+)', redirect_url) if token_match: token = token_match[0] - logger.info(f"获取到的token:%s"%token) + logger.info(f"获取到的token:%s" % token) article_urls = [] @@ -203,8 +205,7 @@ if __name__ == '__main__': try: pool = loop.run_until_complete(init_postgres_pool()) loop.run_until_complete( - save_article_to_db(pool, article_title, account_name, article_url, publish_time, content, - id)) + save_article_to_db(pool, article_title, account_name, article_url, publish_time, content, id)) finally: loop.run_until_complete(pool.close()) loop.close() From 1180c615d65fe52ac4244dbad15abf080865250b Mon Sep 17 00:00:00 2001 From: HuangHai <10402852@qq.com> Date: Wed, 16 Jul 2025 09:12:47 +0800 Subject: [PATCH 08/17] 'commit' --- dsLightRag/WxGzh/T2_CollectArticle.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/dsLightRag/WxGzh/T2_CollectArticle.py b/dsLightRag/WxGzh/T2_CollectArticle.py index 23b89728..8aa7476a 100644 --- a/dsLightRag/WxGzh/T2_CollectArticle.py +++ b/dsLightRag/WxGzh/T2_CollectArticle.py @@ -67,7 +67,7 @@ async def is_article_exist(pool, article_url): return False # 出错时默认返回False,避免影响正常流程 -async def save_article_to_db(pool, article_title, account_name, article_url, publish_time, content, id): +async def save_article_to_db(pool, article_title, account_name, article_url, publish_time, content, source_id): # 先检查文章是否已存在 if await is_article_exist(pool, article_url): logger.info(f"文章已存在,跳过保存: {article_url}") @@ -80,7 +80,7 @@ async def save_article_to_db(pool, article_title, account_name, article_url, pub (title, source, url, publish_time, content, source_id) VALUES ($1, $2, $3, $4, $5, $6) ''', article_title, account_name, article_url, - publish_time, content, id) + publish_time, content, source_id) except Exception as e: logging.error(f"保存文章失败: {e}") @@ -198,7 +198,10 @@ if __name__ == '__main__': continue logger.info(f"正在处理文章: {article_title} ({publish_time})") + + logger.info(f"正在获取文章: {article_title}内容...") content = get_article_content(article_url) + logger.info(f"成功获取文章: {article_title}内容。") loop = asyncio.new_event_loop() asyncio.set_event_loop(loop) From 1c00fc476180b8407dbd8da026d0a0504ba066de Mon Sep 17 00:00:00 2001 From: HuangHai <10402852@qq.com> Date: Wed, 16 Jul 2025 09:13:35 +0800 Subject: [PATCH 09/17] 'commit' --- dsLightRag/WxGzh/T2_CollectArticle.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/dsLightRag/WxGzh/T2_CollectArticle.py b/dsLightRag/WxGzh/T2_CollectArticle.py index 8aa7476a..26ae2377 100644 --- a/dsLightRag/WxGzh/T2_CollectArticle.py +++ b/dsLightRag/WxGzh/T2_CollectArticle.py @@ -72,6 +72,8 @@ async def save_article_to_db(pool, article_title, account_name, article_url, pub if await is_article_exist(pool, article_url): logger.info(f"文章已存在,跳过保存: {article_url}") return + # 准备在这里调用 lightrag进行知识库构建 + # TODO try: async with pool.acquire() as conn: From d2a79f0c939317d335aa977e53c2f3010e290b08 Mon Sep 17 00:00:00 2001 From: HuangHai <10402852@qq.com> Date: Wed, 16 Jul 2025 09:18:27 +0800 Subject: [PATCH 10/17] 'commit' --- dsLightRag/WxGzh/T2_CollectArticle.py | 20 +++++++-- dsLightRag/WxGzh/T3_TrainIntoKG.py | 63 --------------------------- 2 files changed, 17 insertions(+), 66 deletions(-) delete mode 100644 dsLightRag/WxGzh/T3_TrainIntoKG.py diff --git a/dsLightRag/WxGzh/T2_CollectArticle.py b/dsLightRag/WxGzh/T2_CollectArticle.py index 26ae2377..0338bae8 100644 --- a/dsLightRag/WxGzh/T2_CollectArticle.py +++ b/dsLightRag/WxGzh/T2_CollectArticle.py @@ -27,6 +27,8 @@ import time from selenium.webdriver.chrome.options import Options from selenium.webdriver.chrome.service import Service as ChromeService import requests + +from Util.LightRagUtil import initialize_pg_rag from Util.PostgreSQLUtil import init_postgres_pool from Util.WxGzhUtil import init_wechat_browser, get_article_content @@ -72,9 +74,21 @@ async def save_article_to_db(pool, article_title, account_name, article_url, pub if await is_article_exist(pool, article_url): logger.info(f"文章已存在,跳过保存: {article_url}") return - # 准备在这里调用 lightrag进行知识库构建 - # TODO - + # 在这里调用 lightrag进行知识库构建 + workspace = 'ChangChun' + # 使用PG库后,这个是没有用的,但目前的项目代码要求必传,就写一个吧。 + WORKING_DIR = f"./output" + docx_name = f"{account_name}_{article_title}" # 组合来源和标题作为文档名 + logger.info(f"开始处理文档: {docx_name}") + try: + rag = await initialize_pg_rag(WORKING_DIR=WORKING_DIR, workspace=workspace) + await rag.ainsert(input=content, file_paths=[docx_name]) + finally: + if rag: + await rag.finalize_storages() + if pool: + await pool.close() + logger.info(f"保存文档到知识库成功: {docx_name}") try: async with pool.acquire() as conn: await conn.execute(''' diff --git a/dsLightRag/WxGzh/T3_TrainIntoKG.py b/dsLightRag/WxGzh/T3_TrainIntoKG.py deleted file mode 100644 index 86473413..00000000 --- a/dsLightRag/WxGzh/T3_TrainIntoKG.py +++ /dev/null @@ -1,63 +0,0 @@ -import asyncio -import logging - -from Util.DocxUtil import get_docx_content_by_pandoc -from Util.LightRagUtil import initialize_pg_rag -from Util.PostgreSQLUtil import init_postgres_pool - -logger = logging.getLogger('lightrag') -logger.setLevel(logging.INFO) -handler = logging.StreamHandler() -handler.setFormatter(logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')) -logger.addHandler(handler) -logging.basicConfig(format="%(levelname)s:%(message)s", level=logging.INFO) - -# 使用PG库后,这个是没有用的,但目前的项目代码要求必传,就写一个吧。 -WORKING_DIR = f"./output" - - -async def get_unprocessed_articles(): - """从t_wechat_articles表获取未处理的文章""" - try: - pool = await init_postgres_pool() - async with pool.acquire() as conn: - rows = await conn.fetch(''' - SELECT id, source, title, content - FROM t_wechat_articles - WHERE is_finish = 0 - ''') - return [dict(row) for row in rows] - finally: - await pool.close() - -async def main(): - # 获取未处理的文章 - articles = await get_unprocessed_articles() - logger.info(f"共获取到{len(articles)}篇未处理的文章") - - for article in articles: - workspace = 'ChangChun' - docx_name = f"{article['source']}_{article['title']}" # 组合来源和标题作为文档名 - content = article["content"] # 使用文章内容 - - logger.info(f"开始处理文档: {docx_name}") - try: - rag = await initialize_pg_rag(WORKING_DIR=WORKING_DIR, workspace=workspace) - await rag.ainsert(input=content, file_paths=[docx_name]) - - # 标记为已处理 - pool = await init_postgres_pool() - async with pool.acquire() as conn: - await conn.execute(''' - UPDATE t_wechat_articles - SET is_finish = 1 - WHERE id = $1 - ''', article["id"]) - finally: - if rag: - await rag.finalize_storages() - if pool: - await pool.close() - -if __name__ == "__main__": - asyncio.run(main()) From db0365b16d3844aa5c0e2903e310650ff46250be Mon Sep 17 00:00:00 2001 From: HuangHai <10402852@qq.com> Date: Wed, 16 Jul 2025 09:23:30 +0800 Subject: [PATCH 11/17] 'commit' --- .../Config/__pycache__/Config.cpython-310.pyc | Bin 888 -> 888 bytes .../__pycache__/LightRagUtil.cpython-310.pyc | Bin 4511 -> 4511 bytes .../PostgreSQLUtil.cpython-310.pyc | Bin 1565 -> 1565 bytes 3 files changed, 0 insertions(+), 0 deletions(-) diff --git a/dsLightRag/Config/__pycache__/Config.cpython-310.pyc b/dsLightRag/Config/__pycache__/Config.cpython-310.pyc index c551f7f6a39e4d9881c562acb9a358f2569fd42f..73bb1c973c7031619f11244f5f7461787f147b36 100644 GIT binary patch delta 19 Zcmeyt_JfTppO=@50SGcbZRE;h1^_n;1t9)x4E9o>n_(D7h$WO7hITq_bhOySei$XejArjybhntdEvi5s3NLKgefLk!45zRc@VyMSJuHI(n!F_0xzM#4KVC5&DN7JSEvD zaT7`(lOpV=D6^S>%x$o2z!FnvUPLT8eH;gF3(FwJIm)xyo>MF#jc^5KNfns#SLx|h zG!ikFO5ZtYkM!nJ39FR1Rl8B|=;T}~g`IU|_EH|W;&g#+6S-P76vHZO)%NiE{>wNS zwZ3nb2>T|7Vfeq~SOMRK_;;I~ha$aYs{{h3?C5CLiQLpm@R%JXiKq?RQVi9EL3EUO zMRm6EQ7|@%1-0Lh$5Rfmiom3F4dDjF-7T?@T}v-!Qc7ZRjVfr!J8$+QxRg-Qr7M$( zbMU2Ml%g~aKhP_zG)(?-$uhU~3dOZ*dCi7V+i!K5HR}8uaN>m7p=oINLui6H3+b-g zlsaqTsp@sRcq@NjFhN5O)#x5cd%(4E4cZz`zz% delta 1003 zcmYL|$xjne7{EJBI~_WmX#q(jgvB%kiixsB5)UpwwXAIc%Nm_b+hIB|OL?6Z8!vJ+ z(S!N!qVdGh#GCQtf&W00C-G?f2RwN5eN(DU^XvPT_pPs|iPMDS%5o&YKG(aCK1iQ< zj>0)V$@qt#@1^oeE~Bo_ZmV-Uj~;B8fxz1Ug|*;IZUBA+A990`3{4LXvD;*rfr|U4 zr8}CfQ_Qi|7-w++&@j1S++hRX<1UQZSvGr#U6v52cn(LS<%Au@<$ed6kyP3Q~)$in_wD zqYD^UAi7s44&_!2%5Zehk5SQ2jw%XnOY7vc(6fUn}Ju!@Q-q&n|%n=mQ&z_ZR# zZV%pf9wcZit9!>TIo#gxT%sBJ8Bb;1F6!m7=@_gco5DmSd98psWrJwdidt)Vl0BwN ziKGLg7oZUtXEEPJh6N23ML3VFaR&5B-oz3*21yY0IYb_@hFC|$|Dg*{1^6uOu+k^x ze)uIRa6hVW705+hX$#Feh+V&ri|B&*n29Nl2r%1q71HuJcL1(zuq}U+pU>b15`$!9XySzCrz%Fs=}3%_l}vTW6M%evKCUeA9SH>1^>PKm;I zrH?KDx02k#xzNaAqdh3vU3N=Mz?5wrDWa3o&`WHXZ6)p3>Yl6Gn!^^+R^k=Se~m}6 z>UOnBJDe)<{|&ft3SRskYxrY)de?uDe9a@opI6W_ sYaR_dWsMN6*{aN7S4ahMfH=emVclt%#C3>;DjVS-TtpRd!j}5vFY3lC1ONa4 diff --git a/dsLightRag/Util/__pycache__/PostgreSQLUtil.cpython-310.pyc b/dsLightRag/Util/__pycache__/PostgreSQLUtil.cpython-310.pyc index d2b6d3e8586bee33674d56d0c2b637599005cb4a..4e49862db4da18e1a6bf5acd1c4c8757d61962b8 100644 GIT binary patch delta 49 zcmbQsGna=upO=@50SFoy3pa8LvoJt3vt|JR DI1dbu From 3af27fe9ff64cfc6bb28f6ced5fdd31e32b98801 Mon Sep 17 00:00:00 2001 From: HuangHai <10402852@qq.com> Date: Wed, 16 Jul 2025 09:25:51 +0800 Subject: [PATCH 12/17] 'commit' --- dsLightRag/WxGzh/T2_CollectArticle.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/dsLightRag/WxGzh/T2_CollectArticle.py b/dsLightRag/WxGzh/T2_CollectArticle.py index 0338bae8..6892c371 100644 --- a/dsLightRag/WxGzh/T2_CollectArticle.py +++ b/dsLightRag/WxGzh/T2_CollectArticle.py @@ -161,6 +161,7 @@ if __name__ == '__main__': # 爬取文章 for item in gzlist: + cnt = 0 account_name = item["account_name"] account_id = item["account_id"] id = item["id"] @@ -205,6 +206,7 @@ if __name__ == '__main__': query_fakeid_response = requests.get(appmsg_url, cookies=cookies, headers=header, params=query_id_data) fakeid_list = query_fakeid_response.json().get('app_msg_list') + for item in fakeid_list: article_url = item.get('link') article_title = item.get('title') @@ -225,10 +227,13 @@ if __name__ == '__main__': pool = loop.run_until_complete(init_postgres_pool()) loop.run_until_complete( save_article_to_db(pool, article_title, account_name, article_url, publish_time, content, id)) + cnt = cnt + 1 finally: loop.run_until_complete(pool.close()) loop.close() - + # 休息1秒,防止频繁访问被封 time.sleep(1) + logger.info(f"成功获取公众号: {account_name} {cnt}篇文章。") # 关闭浏览器 driver.quit() + From 6d44aac079a046fc81e21701364ac0509240c965 Mon Sep 17 00:00:00 2001 From: HuangHai <10402852@qq.com> Date: Wed, 16 Jul 2025 09:44:38 +0800 Subject: [PATCH 13/17] 'commit' --- dsLightRag/WxGzh/T2_CollectArticle.py | 222 +++++++++++++------------- 1 file changed, 112 insertions(+), 110 deletions(-) diff --git a/dsLightRag/WxGzh/T2_CollectArticle.py b/dsLightRag/WxGzh/T2_CollectArticle.py index 6892c371..f1bd6197 100644 --- a/dsLightRag/WxGzh/T2_CollectArticle.py +++ b/dsLightRag/WxGzh/T2_CollectArticle.py @@ -42,6 +42,11 @@ if not logger.handlers: handler.setFormatter(logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')) logger.addHandler(handler) +# 添加微信请求头 +header = { + 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/138.0.0.0 Safari/537.36', + 'Referer': 'https://mp.weixin.qq.com/' +} async def get_wechat_sources(): """从t_wechat_source表获取微信公众号列表""" @@ -101,139 +106,136 @@ async def save_article_to_db(pool, article_title, account_name, article_url, pub logging.error(f"保存文章失败: {e}") -if __name__ == '__main__': - # 从文件cookies.txt中获取 +async def initialize_wechat_session(): + """初始化微信会话,获取cookies和token""" with open('cookies.txt', 'r', encoding='utf-8') as f: content = f.read() - # 使用json还原为json对象 cookies = json.loads(content) - # 检查是否有过期时间 + global driver # 添加这行 expiry = cookies["expiry"] if expiry: - # 换算出过期时间 - expiry_time = time.localtime(expiry) - expiry_date = time.strftime("%Y-%m-%d %H:%M:%S", expiry_time) - - # 获取当前时间戳 current_timestamp = time.time() - # 检查是否已过期 if current_timestamp > expiry: logger.error("Cookie已过期") exit() - # 移除expiry属性 + del cookies["expiry"] - logger.info(f"cookies的过期时间一般是4天,cookies过期时间:%s" % expiry_date) - options = Options() - options.add_argument('-headless') # 无头参数,调试时可以注释掉 - # 设置headers - header = { - "HOST": "mp.weixin.qq.com", - "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.116 Safari/537.36 QBCore/4.0.1301.400 QQBrowser/9.0.2524.400 Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2875.116 Safari/537.36 NetType/WIFI MicroMessenger/7.0.20.1781(0x6700143B) WindowsWechat(0x63010200)", - "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8", - "Accept-Encoding": "gzip, deflate, br", - "Accept-Language": "zh-CN,zh;q=0.8,en-US;q=0.6,en;q=0.5;q=0.4", - "Connection": "keep-alive" - } - service = ChromeService(executable_path=r"C:\Windows\System32\chromedriver.exe") - # 使用统一的初始化方式 + options = Options() + options.add_argument('-headless') driver = init_wechat_browser() - # 方法:使用requests库发送请求获取重定向URL + url = 'https://mp.weixin.qq.com' response = requests.get(url=url, allow_redirects=False, cookies=cookies) + if 'Location' in response.headers: redirect_url = response.headers.get("Location") - logger.info(f"重定向URL:%s" % redirect_url) token_match = re.findall(r'token=(\d+)', redirect_url) if token_match: token = token_match[0] - logger.info(f"获取到的token:%s" % token) + return cookies, token + + return None, None + + +async def get_wechat_account_list(cookies, token, account_name): + """获取指定公众号的fakeid""" + search_url = 'https://mp.weixin.qq.com/cgi-bin/searchbiz?' + query_id = { + 'action': 'search_biz', + 'token': token, + 'lang': 'zh_CN', + 'f': 'json', + 'ajax': '1', + 'random': random.random(), + 'query': account_name, + 'begin': '0', + 'count': '5' + } - article_urls = [] + search_response = requests.get(search_url, cookies=cookies, headers=header, params=query_id) + lists = search_response.json().get('list')[0] + return lists.get('fakeid') + + +async def get_article_list(cookies, token, fakeid): + """获取公众号文章列表""" + appmsg_url = 'https://mp.weixin.qq.com/cgi-bin/appmsg?' + query_id_data = { + 'token': token, + 'lang': 'zh_CN', + 'f': 'json', + 'ajax': '1', + 'random': random.random(), + 'action': 'list_ex', + 'begin': '0', + 'count': '5', + 'query': '', + 'fakeid': fakeid, + 'type': '9' + } + + query_fakeid_response = requests.get(appmsg_url, cookies=cookies, headers=header, params=query_id_data) + return query_fakeid_response.json().get('app_msg_list') + + +async def process_single_article(article_info, account_info, cookies, token): + """处理单篇文章""" + article_url = article_info.get('link') + article_title = article_info.get('title') + publish_time = datetime.datetime.fromtimestamp(int(article_info.get("update_time"))) + + if '试卷' in article_title: + return False - # 获取公众号列表 - loop = asyncio.new_event_loop() - asyncio.set_event_loop(loop) try: - gzlist = loop.run_until_complete(get_wechat_sources()) + pool = await init_postgres_pool() + content = get_article_content(article_url) + await save_article_to_db(pool, article_title, account_info["account_name"], + article_url, publish_time, content, account_info["id"]) + return True + except Exception as e: + logger.error(f"处理文章时出错: {e}") + return False finally: - loop.close() + if 'pool' in locals(): + await pool.close() + + +async def process_wechat_account(account_info, cookies, token): + """处理单个公众号的所有文章""" + cnt = 0 + fakeid = await get_wechat_account_list(cookies, token, account_info["account_name"]) + articles = await get_article_list(cookies, token, fakeid) + + for article in articles: + success = await process_single_article(article, account_info, cookies, token) + if success: + cnt += 1 + time.sleep(1) + + logger.info(f"成功获取公众号: {account_info['account_name']} {cnt}篇文章。") + return cnt + + +async def main(): + """主函数""" + cookies, token = await initialize_wechat_session() + if not cookies or not token: + logger.error("初始化微信会话失败") + return + + account_list = await get_wechat_sources() + for account in account_list: + await process_wechat_account(account, cookies, token) - # 爬取文章 - for item in gzlist: - cnt = 0 - account_name = item["account_name"] - account_id = item["account_id"] - id = item["id"] - # 搜索微信公众号的接口地址 - search_url = 'https://mp.weixin.qq.com/cgi-bin/searchbiz?' - # 搜索微信公众号接口需要传入的参数,有三个变量:微信公众号token、随机数random、搜索的微信公众号名字 - query_id = { - 'action': 'search_biz', - 'token': token, - 'lang': 'zh_CN', - 'f': 'json', - 'ajax': '1', - 'random': random.random(), - 'query': account_name, - 'begin': '0', - 'count': '5' - } - # 打开搜索微信公众号接口地址,需要传入相关参数信息如:cookies、params、headers - search_response = requests.get(search_url, cookies=cookies, headers=header, params=query_id) - # 取搜索结果中的第一个公众号 - lists = search_response.json().get('list')[0] - # 获取这个公众号的fakeid,后面爬取公众号文章需要此字段 - fakeid = lists.get('fakeid') - logging.info("fakeid:" + fakeid) - # 微信公众号文章接口地址 - appmsg_url = 'https://mp.weixin.qq.com/cgi-bin/appmsg?' - # 搜索文章需要传入几个参数:登录的公众号token、要爬取文章的公众号fakeid、随机数random - query_id_data = { - 'token': token, - 'lang': 'zh_CN', - 'f': 'json', - 'ajax': '1', - 'random': random.random(), - 'action': 'list_ex', - 'begin': '0', # 不同页,此参数变化,变化规则为每页加5 - 'count': '5', - 'query': '', - 'fakeid': fakeid, - 'type': '9' - } - # 打开搜索的微信公众号文章列表页 - query_fakeid_response = requests.get(appmsg_url, cookies=cookies, headers=header, params=query_id_data) - fakeid_list = query_fakeid_response.json().get('app_msg_list') - - - for item in fakeid_list: - article_url = item.get('link') - article_title = item.get('title') - publish_time = datetime.datetime.fromtimestamp(int(item.get("update_time"))) - - if '试卷' in article_title: # 过滤掉试卷,致知物理中有大量试卷,我做教育资讯的不关心试卷 - continue - - logger.info(f"正在处理文章: {article_title} ({publish_time})") - - logger.info(f"正在获取文章: {article_title}内容...") - content = get_article_content(article_url) - logger.info(f"成功获取文章: {article_title}内容。") - - loop = asyncio.new_event_loop() - asyncio.set_event_loop(loop) - try: - pool = loop.run_until_complete(init_postgres_pool()) - loop.run_until_complete( - save_article_to_db(pool, article_title, account_name, article_url, publish_time, content, id)) - cnt = cnt + 1 - finally: - loop.run_until_complete(pool.close()) - loop.close() - # 休息1秒,防止频繁访问被封 - time.sleep(1) - logger.info(f"成功获取公众号: {account_name} {cnt}篇文章。") - # 关闭浏览器 driver.quit() + +if __name__ == '__main__': + loop = asyncio.new_event_loop() + asyncio.set_event_loop(loop) + try: + loop.run_until_complete(main()) + finally: + loop.close() From 54a1ba265f9119375c84c8ce3574f529cb01cfcf Mon Sep 17 00:00:00 2001 From: HuangHai <10402852@qq.com> Date: Wed, 16 Jul 2025 09:45:30 +0800 Subject: [PATCH 14/17] 'commit' --- dsLightRag/WxGzh/T2_CollectArticle.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dsLightRag/WxGzh/T2_CollectArticle.py b/dsLightRag/WxGzh/T2_CollectArticle.py index f1bd6197..a72455c0 100644 --- a/dsLightRag/WxGzh/T2_CollectArticle.py +++ b/dsLightRag/WxGzh/T2_CollectArticle.py @@ -77,7 +77,7 @@ async def is_article_exist(pool, article_url): async def save_article_to_db(pool, article_title, account_name, article_url, publish_time, content, source_id): # 先检查文章是否已存在 if await is_article_exist(pool, article_url): - logger.info(f"文章已存在,跳过保存: {article_url}") + logger.info(f"文章已存在,跳过保存: {account_name}-{article_title}") return # 在这里调用 lightrag进行知识库构建 workspace = 'ChangChun' From 3fb6bd95b2197ba93c57c2c93d39282535092d05 Mon Sep 17 00:00:00 2001 From: HuangHai <10402852@qq.com> Date: Wed, 16 Jul 2025 09:59:47 +0800 Subject: [PATCH 15/17] 'commit' --- dsLightRag/WxGzh/T2_CollectArticle.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/dsLightRag/WxGzh/T2_CollectArticle.py b/dsLightRag/WxGzh/T2_CollectArticle.py index a72455c0..7069a9c1 100644 --- a/dsLightRag/WxGzh/T2_CollectArticle.py +++ b/dsLightRag/WxGzh/T2_CollectArticle.py @@ -75,10 +75,7 @@ async def is_article_exist(pool, article_url): async def save_article_to_db(pool, article_title, account_name, article_url, publish_time, content, source_id): - # 先检查文章是否已存在 - if await is_article_exist(pool, article_url): - logger.info(f"文章已存在,跳过保存: {account_name}-{article_title}") - return + # 在这里调用 lightrag进行知识库构建 workspace = 'ChangChun' # 使用PG库后,这个是没有用的,但目前的项目代码要求必传,就写一个吧。 @@ -190,6 +187,10 @@ async def process_single_article(article_info, account_info, cookies, token): try: pool = await init_postgres_pool() + # 先检查文章是否已存在 + if await is_article_exist(pool, article_url): + logger.info(f'文章已存在,跳过保存: {account_info["account_name"]}-{article_title}') + return False content = get_article_content(article_url) await save_article_to_db(pool, article_title, account_info["account_name"], article_url, publish_time, content, account_info["id"]) From 3fec770149ea340009d156700099f12b16789c33 Mon Sep 17 00:00:00 2001 From: HuangHai <10402852@qq.com> Date: Wed, 16 Jul 2025 10:01:21 +0800 Subject: [PATCH 16/17] 'commit' --- dsLightRag/WxGzh/T2_CollectArticle.py | 30 ++++++++++++++++++--------- 1 file changed, 20 insertions(+), 10 deletions(-) diff --git a/dsLightRag/WxGzh/T2_CollectArticle.py b/dsLightRag/WxGzh/T2_CollectArticle.py index 7069a9c1..051e82dc 100644 --- a/dsLightRag/WxGzh/T2_CollectArticle.py +++ b/dsLightRag/WxGzh/T2_CollectArticle.py @@ -221,16 +221,26 @@ async def process_wechat_account(account_info, cookies, token): async def main(): """主函数""" - cookies, token = await initialize_wechat_session() - if not cookies or not token: - logger.error("初始化微信会话失败") - return - - account_list = await get_wechat_sources() - for account in account_list: - await process_wechat_account(account, cookies, token) - - driver.quit() + while True: + try: + logger.info("开始执行微信公众号文章采集任务") + cookies, token = await initialize_wechat_session() + if not cookies or not token: + logger.error("初始化微信会话失败") + continue + + account_list = await get_wechat_sources() + for account in account_list: + await process_wechat_account(account, cookies, token) + + logger.info("本次采集任务完成,等待30分钟后再次执行") + await asyncio.sleep(30 * 60) # 30分钟 + except Exception as e: + logger.error(f"主循环发生错误: {e}") + await asyncio.sleep(30 * 60) # 出错后也等待30分钟 + finally: + if 'driver' in globals(): + driver.quit() if __name__ == '__main__': From 1cb0580377e420ce33fc788e0989e22ae4633ea0 Mon Sep 17 00:00:00 2001 From: HuangHai <10402852@qq.com> Date: Wed, 16 Jul 2025 10:10:20 +0800 Subject: [PATCH 17/17] 'commit' --- dsLightRag/static/ChangChun.html | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/dsLightRag/static/ChangChun.html b/dsLightRag/static/ChangChun.html index b71919b8..7b628e3a 100644 --- a/dsLightRag/static/ChangChun.html +++ b/dsLightRag/static/ChangChun.html @@ -217,6 +217,10 @@
力旺实验中学今年的中考成绩怎么样?
+
+ 在730分占比中,哪些学校表现优秀? +
+