70 lines
2.3 KiB
Python
70 lines
2.3 KiB
Python
import asyncio
|
||
import logging
|
||
import os
|
||
|
||
from Util.DocxUtil import get_docx_content_by_pandoc
|
||
from Util.LightRagUtil import initialize_rag
|
||
|
||
# Whether to wipe the existing storage files and regenerate them from scratch.
IS_CLEAR = False

# Finer-grained control over log output: the 'lightrag' logger gets its own
# stream handler with a timestamped format and is set to INFO level.
logger = logging.getLogger('lightrag')
handler = logging.StreamHandler()
_formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
handler.setFormatter(_formatter)
logger.addHandler(handler)
logger.setLevel(logging.INFO)
|
||
|
||
|
||
async def main():
    """Insert the current subject's ``.txt`` files into its RAG store.

    Reads these module-level globals, which must be set before the call:

    * ``WORKING_DIR`` -- directory passed to ``initialize_rag`` and, when
      ``IS_CLEAR`` is true, the directory whose storage files are deleted.
    * ``docx_file`` -- directory that is listed for input files.
    * ``KEMU`` -- filename prefix a file must have to be processed.

    Only files ending in ``.txt`` are inserted; every other extension
    (including ``.docx``) is skipped. Any exception raised during
    initialisation or insertion is logged with its traceback and not
    re-raised. Storages are finalised only if initialisation succeeded.
    """
    # Optionally clear the persisted storage files first.
    if IS_CLEAR:
        # Original author's note: comment out or delete this cleanup code.
        files_to_delete = [
            "graph_chunk_entity_relation.graphml",
            "kv_store_doc_status.json",
            "kv_store_full_docs.json",
            "kv_store_text_chunks.json",
            "vdb_chunks.json",
            "vdb_entities.json",
            "vdb_relationships.json",
        ]

        # Delete each storage file that is present.
        for file in files_to_delete:
            file_path = os.path.join(WORKING_DIR, file)
            if os.path.exists(file_path):
                os.remove(file_path)
                logger.info(f"删除的文件:: {file_path}")

    # Bound before the try block so that `finally` never reads an unassigned
    # name when initialize_rag() itself raises.
    rag = None
    try:
        # Original author's note: the default settings use NetworkX.
        rag = await initialize_rag(WORKING_DIR)

        # Walk the input directory and pick the files starting with KEMU.
        for filename in os.listdir(docx_file):
            if not filename.startswith(KEMU):
                continue
            # .docx conversion (get_docx_content_by_pandoc) is disabled, and
            # no other extension has a reader, so only .txt is ingested.
            # Skipping here prevents inserting stale or undefined content.
            if not filename.endswith(".txt"):
                continue

            file_path = os.path.join(docx_file, filename)
            with open(file_path, "r", encoding="utf-8") as f:
                content = f.read()

            await rag.ainsert(content, file_paths=[filename])
            logger.info(f"Inserted content from {filename}")
    except Exception as e:
        # Same message as before; logger.exception also records the traceback.
        logger.exception(f"An error occurred: {e}")
    finally:
        if rag is not None:
            await rag.finalize_storages()
|
||
|
||
|
||
# Subjects to process. The longer list is kept from the original for reference:
# KEMUS=['JiHe','Math','SuShi','Chemistry','ShiJi','ChangChun']
KEMUS = ['Math2']

# main() takes no arguments and reads KEMU, WORKING_DIR and docx_file as
# module-level globals, so they are (re)assigned here before every run.
for KEMU in KEMUS:
    # Directory that is scanned for the input files.
    docx_file = 'static/Txt/'
    # Per-subject working directory handed to the RAG initialiser.
    WORKING_DIR = "./Topic/" + KEMU
    asyncio.run(main())
|