You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

69 lines
2.6 KiB

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

import asyncio
import logging
from Util.DocxUtil import get_docx_content_by_pandoc
from Util.LightRagUtil import initialize_pg_rag
# Dedicated console handler for the "lightrag" logger, for finer control over
# its output format than the root configuration set up at the end of this block.
logger = logging.getLogger('lightrag')
logger.setLevel(logging.INFO)
handler = logging.StreamHandler()
handler.setFormatter(logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s'))
logger.addHandler(handler)
# The handler above already prints every "lightrag" record. Stop propagation so
# the root handler installed by basicConfig() does not print it a second time.
logger.propagate = False
logging.basicConfig(format="%(levelname)s:%(message)s", level=logging.INFO)
# With the PG database backend this directory is not actually used, but the
# current project code still requires the argument, so a value is supplied.
WORKING_DIR = "./output"
#### Fill in the two fields below carefully ####
# 1. workspace: the workspace (knowledge-base name).
# 2. docx_name: the document name (must be unique, because it is later used as
#    a query condition).
# Commented-out entries are disabled; only active entries end up in the list.
tasks = [
    # {"workspace": "SuShi", "docx_name": "苏轼.docx"},  # Su Shi
    # {"workspace": "Chemistry", "docx_name": "Chemistry.docx"},  # Chemistry
    # {"workspace": "Math", "docx_name": "Math.docx"},  # Math
    {"workspace": "JiHe", "docx_name": "JiHe.docx"},  # Geometry
    # {"workspace": "ShiJi", "docx_name": "少年读史记张嘉骅.docx"},  # Records of the Grand Historian
    # Introduction to Changchun first-batch high schools
    # {"workspace": "ChangChun", "docx_name": "长春市一批次高中学校介绍.docx"},
    # 2024 admission score lines of 43 Changchun high schools
    # {"workspace": "ChangChun", "docx_name": "2024长春43所高中录取分数线.docx"},
    # Changchun 2025 high-school entrance exam: minimum admission lines per batch
    # {"workspace": "ChangChun", "docx_name": "长春市2025年中考各批次录取最低控制线.docx"},
]
# 3. docx_path: the document path, derived from the document name. Each `task`
#    is the very dict stored in `tasks`, so updating it here changes the list
#    entries in place.
for task in tasks:
    task.update(docx_path="./static/Txt/" + task["docx_name"])
async def main() -> None:
    """Insert every document listed in ``tasks`` into its workspace.

    For each task a RAG instance is created with ``initialize_pg_rag``, the
    document text is extracted with ``get_docx_content_by_pandoc`` and handed
    to ``rag.ainsert``. The instance's storages are finalized afterwards, even
    when one of those steps fails.

    Raises:
        Exception: Whatever initialization, extraction or insertion raises is
            propagated unchanged; the remaining tasks are then not processed.
    """
    total = len(tasks)
    # enumerate() yields the position directly; tasks.index() is a linear
    # search and reports the wrong position when two task dicts compare equal.
    for position, task in enumerate(tasks, start=1):
        workspace = task["workspace"]
        docx_name = task["docx_name"]
        docx_path = task["docx_path"]
        # The name is passed as a %-argument instead of being spliced into the
        # format string, so a '%' in the file name cannot break formatting.
        logger.info("开始处理文档: %s,共%s个文档,当前是第%s个。", docx_name, total, position)
        # Reset per task so `finally` never sees an unbound name or the
        # already-finalized instance left over from the previous iteration.
        rag = None
        try:
            rag = await initialize_pg_rag(WORKING_DIR=WORKING_DIR, workspace=workspace)
            # Get the content of the docx file.
            content = get_docx_content_by_pandoc(docx_path)
            # Pass the document name as the source parameter.
            await rag.ainsert(input=content, file_paths=[docx_name])
        finally:
            if rag is not None:
                await rag.finalize_storages()


if __name__ == "__main__":
    asyncio.run(main())