Files
dsProject/dsLightRag/T1_Train.py
2025-08-14 15:45:08 +08:00

70 lines
2.3 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import asyncio
import logging
import os
from Util.DocxUtil import get_docx_content_by_pandoc
from Util.LightRagUtil import initialize_rag
# Whether to delete the previously generated storage files and rebuild them.
IS_CLEAR = False

# Finer-grained control over log output: build the handler (with its record
# format) first, then attach it to the 'lightrag' logger at INFO level.
handler = logging.StreamHandler()
handler.setFormatter(
    logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
)

logger = logging.getLogger('lightrag')
logger.setLevel(logging.INFO)
logger.addHandler(handler)
async def main():
    """Optionally reset the working directory, then insert the subject's text files.

    Reads the module-level names ``IS_CLEAR``, ``WORKING_DIR``, ``docx_file``
    and ``KEMU``; they must be assigned before this coroutine is run.

    Steps:
      1. When ``IS_CLEAR`` is true, remove the listed storage files from
         ``WORKING_DIR`` (files that do not exist are skipped).
      2. Create the RAG instance via ``initialize_rag(WORKING_DIR)``.
      3. For every entry in ``docx_file`` whose name starts with ``KEMU`` and
         ends with ``.txt``, read it as UTF-8 and pass it to ``rag.ainsert``.

    Any exception raised during steps 2-3 is logged and swallowed; storages
    are finalized afterwards whenever the RAG instance was created.
    """
    # Clear previously generated files so they are regenerated from scratch.
    if IS_CLEAR:
        files_to_delete = [
            "graph_chunk_entity_relation.graphml",
            "kv_store_doc_status.json",
            "kv_store_full_docs.json",
            "kv_store_text_chunks.json",
            "vdb_chunks.json",
            "vdb_entities.json",
            "vdb_relationships.json",
        ]
        for file in files_to_delete:
            file_path = os.path.join(WORKING_DIR, file)
            if os.path.exists(file_path):
                os.remove(file_path)
                logger.info(f"删除的文件:: {file_path}")

    # Bind `rag` before the try block so the finally clause can tell whether
    # initialization succeeded; otherwise a failure in initialize_rag() would
    # surface as an UnboundLocalError that hides the real error.
    rag = None
    try:
        # Original author's note: the default settings use NetworkX.
        rag = await initialize_rag(WORKING_DIR)

        # Walk the source directory and pick the files belonging to this subject.
        for filename in os.listdir(docx_file):
            if not filename.startswith(KEMU):
                continue
            # Only plain-text files are ingested. .docx files are deliberately
            # skipped (extraction through get_docx_content_by_pandoc is
            # disabled), and any other extension is skipped too so that no
            # undefined or stale content from a previous file gets inserted.
            if not filename.endswith(".txt"):
                continue

            file_path = os.path.join(docx_file, filename)
            with open(file_path, "r", encoding="utf-8") as f:
                content = f.read()

            await rag.ainsert(content, file_paths=[filename])
            logger.info(f"Inserted content from {filename}")
    except Exception as e:
        logger.error(f"An error occurred: {e}")
    finally:
        if rag is not None:
            await rag.finalize_storages()
# Subjects to process. The longer list is kept for reference:
# KEMUS=['JiHe','Math','SuShi','Chemistry','ShiJi','ChangChun']
KEMUS = ['Math2']

for KEMU in KEMUS:
    # main() reads these module-level names, so assign them before each run.
    docx_file = 'static/Txt/'
    WORKING_DIR = f"./Topic/{KEMU}"
    asyncio.run(main())