Files
dsProject/dsLightRag/Backup/PG_T1_Train.py
2025-08-14 15:45:08 +08:00

69 lines
2.4 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import asyncio
import logging
from Util.DocxUtil import get_docx_content_by_pandoc
from Util.LightRagUtil import initialize_pg_rag
# Format used by the dedicated 'lightrag' handler (timestamp, logger name,
# level, message).
_LIGHTRAG_LOG_FORMAT = '%(asctime)s - %(name)s - %(levelname)s - %(message)s'

# Fine-grained control over log output: the 'lightrag' logger gets its own
# stream handler and is capped at INFO.
handler = logging.StreamHandler()
handler.setFormatter(logging.Formatter(_LIGHTRAG_LOG_FORMAT))
logger = logging.getLogger('lightrag')
logger.setLevel(logging.INFO)
logger.addHandler(handler)

# Root logger configuration with a shorter "LEVEL:message" format.
# NOTE(review): 'lightrag' records presumably also propagate to the root
# handler, so they may be printed twice - confirm this is intended.
logging.basicConfig(format="%(levelname)s:%(message)s", level=logging.INFO)

# Per the original author's note: this directory is not used once the PG
# backend is in place, but the current project code still requires the
# argument, so a placeholder value is supplied.
WORKING_DIR = "../output"
#### Fill in both fields below carefully ####
# 1. workspace: the knowledge-base name.
# 2. docx_name: the document name; duplicates are not allowed because later
#    queries use it as a filter condition.
tasks = [
    {"workspace": workspace_name, "docx_name": document_name}
    for workspace_name, document_name in (
        # Disabled entries, kept for reference:
        # ("SuShi", "SuShi.docx"),          # Su Shi
        # ("Chemistry", "Chemistry.docx"),  # Chemistry
        # ("Math", "Math.docx"),            # Math
        # ("JiHe", "JiHe.docx"),            # Geometry
        # ("ShiJi", "ShiJi.docx"),          # Shiji (Records of the Grand Historian)
        # Introduction to Changchun's first-batch high schools
        ("ChangChun", "ChangChun_3.docx"),
        # 2024 admission score lines of 43 Changchun high schools
        ("ChangChun", "ChangChun_1.docx"),
        # Changchun 2025 high-school entrance exam: minimum admission
        # control line for each batch
        ("ChangChun", "ChangChun_2.docx"),
    )
]
# 3. docx_path: derived from docx_name. Each dict is mutated in place, so the
#    entries held by `tasks` gain the new key.
for task in tasks:
    task["docx_path"] = f"./static/Txt/{task['docx_name']}"
async def main():
    """Insert every document listed in ``tasks`` into its workspace.

    Tasks are processed one after another. For each task a RAG instance is
    created with ``initialize_pg_rag`` for the task's workspace, the docx
    content is read with ``get_docx_content_by_pandoc``, and the content is
    inserted with ``docx_name`` recorded as its file path. The instance's
    storages are finalized afterwards, whether or not the insert succeeded.

    Raises:
        Any exception raised by initialization, docx reading or insertion is
        propagated unchanged after cleanup; remaining tasks are not processed.
    """
    total = len(tasks)
    # enumerate() gives the position directly; tasks.index() would rescan the
    # list each time and report the first match for equal task dicts.
    for position, task in enumerate(tasks, start=1):
        workspace = task["workspace"]
        docx_name = task["docx_name"]
        docx_path = task["docx_path"]
        # Pass docx_name as a logging argument so a '%' in the file name
        # cannot corrupt the format string.
        logger.info("开始处理文档: %s,共%s个文档,当前是第%s个。", docx_name, total, position)

        # Bind before the try block: if initialization fails, the finally
        # clause must neither raise UnboundLocalError nor finalize the
        # instance left over from the previous iteration.
        rag = None
        try:
            rag = await initialize_pg_rag(WORKING_DIR=WORKING_DIR, workspace=workspace)
            # Read the content of the docx file.
            content = get_docx_content_by_pandoc(docx_path)
            # file_paths records the source document of the inserted content.
            await rag.ainsert(input=content, file_paths=[docx_name])
        finally:
            if rag is not None:
                await rag.finalize_storages()
# Script entry point: run the async ingestion only when this file is executed
# directly, not when it is imported.
if __name__ == "__main__":
    asyncio.run(main())