From f639b4c9fc23720f155bc9f8e42612110f4a1b4c Mon Sep 17 00:00:00 2001 From: HuangHai <10402852@qq.com> Date: Tue, 15 Jul 2025 13:02:12 +0800 Subject: [PATCH] 'commit' --- .../__pycache__/LightRagUtil.cpython-310.pyc | Bin 4511 -> 4511 bytes dsLightRag/WxGzh/T3_TrainIntoKG.py | 77 ++++++++---------- 2 files changed, 36 insertions(+), 41 deletions(-) diff --git a/dsLightRag/Util/__pycache__/LightRagUtil.cpython-310.pyc b/dsLightRag/Util/__pycache__/LightRagUtil.cpython-310.pyc index 9bc1e54dac9dbe973c4131d5b80ceb60cf4263d8..d89e8018a6fc1ad3abe25bb0951aa47b9d96253a 100644 GIT binary patch delta 1003 zcmYL|OK%cU6o8om!<1nL+NOz!u@8bu)x4E9o>n_(D7h$WO7hITq_bhOySei$XejArjybhntdEvi5s3NLKgefLk!45zRc@VyMSJuHI(n!F_0xzM#4KVC5&DN7JSEvD zaT7`(lOpV=D6^S>%x$o2z!FnvUPLT8eH;gF3(FwJIm)xyo>MF#jc^5KNfns#SLx|h zG!ikFO5ZtYkM!nJ39FR1Rl8B|=;T}~g`IU|_EH|W;&g#+6S-P76vHZO)%NiE{>wNS zwZ3nb2>T|7Vfeq~SOMRK_;;I~ha$aYs{{h3?C5CLiQLpm@R%JXiKq?RQVi9EL3EUO zMRm6EQ7|@%1-0Lh$5Rfmiom3F4dDjF-7T?@T}v-!Qc7ZRjVfr!J8$+QxRg-Qr7M$( zbMU2Ml%g~aKhP_zG)(?-$uhU~3dOZ*dCi7V+i!K5HR}8uaN>m7p=oINLui6H3+b-g zlsaqTsp@sRcq@NjFhN5O)#x5cd%(4E4cZz`zz% delta 1003 zcmYL|$xjne7{EJBI~_WmX#q(jgvB%kiixsB5)UpwwXAIc%Nm_b+hIB|OL?6Z8!vJ+ z(S!N!qVdGh#GCQtf&W00C-G?f2RwN5eN(DU^XvPT_pPs|iPMDS%5o&YKG(aCK1iQ< zj>0)V$@qt#@1^oeE~Bo_ZmV-Uj~;B8fxz1Ug|*;IZUBA+A990`3{4LXvD;*rfr|U4 zr8}CfQ_Qi|7-w++&@j1S++hRX<1UQZSvGr#U6v52cn(LS<%Au@<$ed6kyP3Q~)$in_wD zqYD^UAi7s44&_!2%5Zehk5SQ2jw%XnOY7vc(6fUn}Ju!@Q-q&n|%n=mQ&z_ZR# zZV%pf9wcZit9!>TIo#gxT%sBJ8Bb;1F6!m7=@_gco5DmSd98psWrJwdidt)Vl0BwN ziKGLg7oZUtXEEPJh6N23ML3VFaR&5B-oz3*21yY0IYb_@hFC|$|Dg*{1^6uOu+k^x ze)uIRa6hVW705+hX$#Feh+V&ri|B&*n29Nl2r%1q71HuJcL1(zuq}U+pU>b15`$!9XySzCrz%Fs=}3%_l}vTW6M%evKCUeA9SH>1^>PKm;I zrH?KDx02k#xzNaAqdh3vU3N=Mz?5wrDWa3o&`WHXZ6)p3>Yl6Gn!^^+R^k=Se~m}6 z>UOnBJDe)<{|&ft3SRskYxrY)de?uDe9a@opI6W_ sYaR_dWsMN6*{aN7S4ahMfH=emVclt%#C3>;DjVS-TtpRd!j}5vFY3lC1ONa4 diff --git a/dsLightRag/WxGzh/T3_TrainIntoKG.py b/dsLightRag/WxGzh/T3_TrainIntoKG.py index c7de823c..86473413 100644 --- a/dsLightRag/WxGzh/T3_TrainIntoKG.py +++ b/dsLightRag/WxGzh/T3_TrainIntoKG.py @@ -3,7 +3,7 @@ import logging from Util.DocxUtil import get_docx_content_by_pandoc from Util.LightRagUtil import initialize_pg_rag - +from Util.PostgreSQLUtil import init_postgres_pool logger = logging.getLogger('lightrag') logger.setLevel(logging.INFO) @@ -15,54 +15,49 @@ logging.basicConfig(format="%(levelname)s:%(message)s", level=logging.INFO) # 使用PG库后,这个是没有用的,但目前的项目代码要求必传,就写一个吧。 WORKING_DIR = f"./output" -#### 下面两个要注意写清楚内容 #### -# 1、工作空间【知识库名称】 -# 2、文档名称【不允许出现重复,因为后面需要以此为条件查询】 -tasks = [ - # { # 苏轼 - # "workspace": "SuShi", "docx_name": "苏轼.docx", - # }, - # { # 化学 - # "workspace": "Chemistry", "docx_name": "Chemistry.docx", - # }, - # { # 几何 - # "workspace": "JiHe", "docx_name": "JiHe.docx", - # }, - # { # 数学 - # "workspace": "Math", "docx_name": "Math.docx", - # }, - # { # 史记 - # "workspace": "ShiJi", "docx_name": "少年读史记张嘉骅.docx", - # }, - # { # 长春市一批次高中学校介绍 - # "workspace": "ChangChun", "docx_name": "长春市一批次高中学校介绍.docx", - # }, - # { # 2024长春43所高中录取分数线 - # "workspace": "ChangChun", "docx_name": "2024长春43所高中录取分数线.docx", - # }, - { # 长春市2025年中考各批次录取最低控制线 - "workspace": "ChangChun", "docx_name": "长春市2025年中考各批次录取最低控制线.docx", - } -] -for task in tasks: - task["docx_path"] = "./static/Txt/" + task["docx_name"] # 3、文档路径 python是按引用传递的& +async def get_unprocessed_articles(): + """从t_wechat_articles表获取未处理的文章""" + try: + pool = await init_postgres_pool() + async with pool.acquire() as conn: + rows = await conn.fetch(''' + SELECT id, source, title, content + FROM t_wechat_articles + WHERE is_finish = 0 + ''') + return [dict(row) for row in rows] + finally: + await pool.close() async def main(): - for task in tasks: - workspace = task["workspace"] - docx_name = task["docx_name"] - docx_path = task["docx_path"] - logger.info(f"开始处理文档: {docx_name}" + ",共%s个文档,当前是第%s个。", len(tasks), tasks.index(task) + 1) + # 获取未处理的文章 + articles = await get_unprocessed_articles() + logger.info(f"共获取到{len(articles)}篇未处理的文章") + + for article in articles: + workspace = 'ChangChun' + docx_name = f"{article['source']}_{article['title']}" # 组合来源和标题作为文档名 + content = article["content"] # 使用文章内容 + + logger.info(f"开始处理文档: {docx_name}") try: rag = await initialize_pg_rag(WORKING_DIR=WORKING_DIR, workspace=workspace) - # 获取docx文件的内容 - content = get_docx_content_by_pandoc(docx_path) - await rag.ainsert(input=content, file_paths=[docx_name]) # 添加来源参数 + await rag.ainsert(input=content, file_paths=[docx_name]) + + # 标记为已处理 + pool = await init_postgres_pool() + async with pool.acquire() as conn: + await conn.execute(''' + UPDATE t_wechat_articles + SET is_finish = 1 + WHERE id = $1 + ''', article["id"]) finally: if rag: await rag.finalize_storages() - + if pool: + await pool.close() if __name__ == "__main__": asyncio.run(main())