diff --git a/dsLightRag/Start.py b/dsLightRag/Start.py
index 584315f5..a6b09126 100644
--- a/dsLightRag/Start.py
+++ b/dsLightRag/Start.py
@@ -17,13 +17,8 @@ from starlette.staticfiles import StaticFiles
 from Util.LightRagUtil import *
 from Util.PostgreSQLUtil import init_postgres_pool
 
-# 在程序开始时添加以下配置
-logging.basicConfig(
-    level=logging.INFO,  # 设置日志级别为INFO
-    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
-)
 
-# 或者如果你想更详细地控制日志输出
+# 想更详细地控制日志输出
 logger = logging.getLogger('lightrag')
 logger.setLevel(logging.INFO)
 handler = logging.StreamHandler()
diff --git a/dsLightRag/T1_Train.py b/dsLightRag/T1_Train.py
index 1db08183..88080efa 100644
--- a/dsLightRag/T1_Train.py
+++ b/dsLightRag/T1_Train.py
@@ -4,12 +4,6 @@ import logging
 from Util.DocxUtil import get_docx_content_by_pandoc
 from Util.LightRagUtil import initialize_pg_rag
 
-# 在程序开始时添加以下配置
-logging.basicConfig(
-    level=logging.INFO,  # 设置日志级别为INFO
-    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
-)
-
 # 或者如果你想更详细地控制日志输出
 logger = logging.getLogger('lightrag')
 logger.setLevel(logging.INFO)
diff --git a/dsLightRag/WxGzh/T3_TrainIntoKG.py b/dsLightRag/WxGzh/T3_TrainIntoKG.py
new file mode 100644
index 00000000..c7de823c
--- /dev/null
+++ b/dsLightRag/WxGzh/T3_TrainIntoKG.py
@@ -0,0 +1,88 @@
+import asyncio
+import logging
+
+from Util.DocxUtil import get_docx_content_by_pandoc
+from Util.LightRagUtil import initialize_pg_rag
+
+
+# Console handler with timestamps for the 'lightrag' logger.
+logger = logging.getLogger('lightrag')
+logger.setLevel(logging.INFO)
+handler = logging.StreamHandler()
+handler.setFormatter(logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s'))
+logger.addHandler(handler)
+# NOTE(review): basicConfig also attaches a handler to the root logger; if the
+# 'lightrag' logger propagates, each record is printed twice -- confirm.
+logging.basicConfig(format="%(levelname)s:%(message)s", level=logging.INFO)
+
+# Not used once storage lives in PostgreSQL, but the current project code still
+# requires the argument, so pass a placeholder directory.
+WORKING_DIR = "./output"
+
+#### Fill in both fields carefully for every task ####
+# 1. workspace: the knowledge-base name
+# 2. docx_name: the document name; must be unique, later queries filter on it
+tasks = [
+    # {  # Su Shi
+    #     "workspace": "SuShi", "docx_name": "苏轼.docx",
+    # },
+    # {  # Chemistry
+    #     "workspace": "Chemistry", "docx_name": "Chemistry.docx",
+    # },
+    # {  # Geometry
+    #     "workspace": "JiHe", "docx_name": "JiHe.docx",
+    # },
+    # {  # Mathematics
+    #     "workspace": "Math", "docx_name": "Math.docx",
+    # },
+    # {  # Records of the Grand Historian
+    #     "workspace": "ShiJi", "docx_name": "少年读史记张嘉骅.docx",
+    # },
+    # {  # Introduction to Changchun first-batch senior high schools
+    #     "workspace": "ChangChun", "docx_name": "长春市一批次高中学校介绍.docx",
+    # },
+    # {  # 2024 admission score lines of 43 Changchun senior high schools
+    #     "workspace": "ChangChun", "docx_name": "2024长春43所高中录取分数线.docx",
+    # },
+    {  # Changchun 2025 senior-high entrance exam minimum admission lines per batch
+        "workspace": "ChangChun", "docx_name": "长春市2025年中考各批次录取最低控制线.docx",
+    }
+]
+for task in tasks:
+    # 3. docx_path: derived from docx_name; the dicts are mutated in place, so
+    #    main() sees the new key.
+    task["docx_path"] = "./static/Txt/" + task["docx_name"]
+
+
+async def main():
+    """Insert every document listed in ``tasks`` into its workspace.
+
+    For each task, a RAG instance is created via ``initialize_pg_rag``, the
+    docx text is read with ``get_docx_content_by_pandoc`` and passed to
+    ``rag.ainsert``; ``rag.finalize_storages`` is awaited afterwards, even
+    when the insert fails.
+    """
+    total = len(tasks)
+    for position, task in enumerate(tasks, start=1):
+        workspace = task["workspace"]
+        docx_name = task["docx_name"]
+        docx_path = task["docx_path"]
+        # Lazy %-style args: a '%' inside docx_name can no longer corrupt the
+        # format string, and enumerate() avoids the tasks.index() lookup.
+        logger.info("开始处理文档: %s,共%s个文档,当前是第%s个。", docx_name, total, position)
+        # Bind rag before the try so that `finally` never hits an unbound
+        # name when initialize_pg_rag itself raises.
+        rag = None
+        try:
+            rag = await initialize_pg_rag(WORKING_DIR=WORKING_DIR, workspace=workspace)
+            # Read the text content of the docx file.
+            content = get_docx_content_by_pandoc(docx_path)
+            # Pass the document name as the source (file_paths) argument.
+            await rag.ainsert(input=content, file_paths=[docx_name])
+        finally:
+            if rag is not None:
+                await rag.finalize_storages()
+
+
+if __name__ == "__main__":
+    asyncio.run(main())