"""Insert DOCX documents into LightRAG workspaces (PG storage).

For every entry in ``tasks`` the script reads the DOCX content via
``get_docx_content_by_pandoc`` and inserts it into the RAG instance returned
by ``initialize_pg_rag`` for that entry's workspace.
"""
import asyncio
import logging

from Util.DocxUtil import get_docx_content_by_pandoc
from Util.LightRagUtil import initialize_pg_rag

# Alternative setup for finer-grained control over the 'lightrag' log output.
# NOTE(review): records logged on 'lightrag' presumably also propagate to the
# root handler installed by basicConfig below, so they may be printed twice in
# two formats - confirm whether that is intended.
logger = logging.getLogger('lightrag')
logger.setLevel(logging.INFO)
handler = logging.StreamHandler()
handler.setFormatter(logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s'))
logger.addHandler(handler)

logging.basicConfig(format="%(levelname)s:%(message)s", level=logging.INFO)

# With the PG backend this directory is not actually used, but the current
# project code requires the argument, so a placeholder value is supplied.
WORKING_DIR = "../output"

#### Fill in the following two fields carefully ####
# 1. workspace: the knowledge-base name.
# 2. docx_name: the document name. Duplicates are not allowed, because the
#    name is later used as a query condition.
tasks = [
    # {  # Su Shi
    #     "workspace": "SuShi", "docx_name": "SuShi.docx",
    # },
    # {  # Chemistry
    #     "workspace": "Chemistry", "docx_name": "Chemistry.docx",
    # },
    # {  # Math
    #     "workspace": "Math", "docx_name": "Math.docx",
    # },
    {  # Geometry
        "workspace": "JiHe", "docx_name": "JiHe.docx",
    },
    # {  # Records of the Grand Historian (Shiji)
    #     "workspace": "ShiJi", "docx_name": "ShiJi.docx",
    # },
    # {  # Introduction to Changchun first-batch senior high schools
    #     "workspace": "ChangChun", "docx_name": "ChangChun_3.docx",
    # },
    # {  # 2024 admission score lines of 43 Changchun senior high schools
    #     "workspace": "ChangChun", "docx_name": "ChangChun_1.docx",
    # },
    # {  # Changchun 2025 minimum admission control lines per batch
    #     "workspace": "ChangChun", "docx_name": "ChangChun_2.docx",
    # }
]

# 3. Document path: derived from docx_name. The dicts are mutated in place,
#    so main() sees the added "docx_path" key.
for task in tasks:
    task["docx_path"] = "./static/Txt/" + task["docx_name"]


async def main():
    """Insert every configured document into its workspace, one at a time.

    For each task a RAG instance is created, the DOCX content is read and
    inserted with the document name as its source, and the instance's
    storages are finalized afterwards - also when reading or inserting fails.
    Any exception propagates to the caller and aborts the remaining tasks.
    """
    total = len(tasks)
    # enumerate() gives the 1-based position directly, avoiding a linear
    # tasks.index() scan that would also misreport duplicate entries.
    for position, task in enumerate(tasks, start=1):
        workspace = task["workspace"]
        docx_name = task["docx_name"]
        docx_path = task["docx_path"]

        # All values are passed as lazy %-args so that a '%' inside the
        # document name cannot corrupt the format string.
        logger.info("开始处理文档: %s,共%s个文档,当前是第%s个。", docx_name, total, position)

        # Pre-bind so the finally clause is safe when initialization itself
        # raises; otherwise an UnboundLocalError would mask the real error.
        rag = None
        try:
            rag = await initialize_pg_rag(WORKING_DIR=WORKING_DIR, workspace=workspace)
            # Read the content of the docx file.
            content = get_docx_content_by_pandoc(docx_path)
            # file_paths records the source document of the inserted content.
            await rag.ainsert(input=content, file_paths=[docx_name])
        finally:
            if rag is not None:
                await rag.finalize_storages()


if __name__ == "__main__":
    asyncio.run(main())