|
|
import asyncio
|
|
|
import os
|
|
|
import shutil
|
|
|
|
|
|
from raganything import RAGAnything, RAGAnythingConfig
|
|
|
from Util.RagUtil import create_llm_model_func, create_vision_model_func, create_embedding_func
|
|
|
|
|
|
import logging
|
|
|
|
|
|
# Configure logging once at program start-up: the root logger emits INFO and
# above using a timestamp / logger-name / level / message layout.
logging.basicConfig(
    level=logging.INFO,  # root log level
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)

# Finer-grained control for the 'lightrag' logger: give it its own level and a
# dedicated stream handler that uses the same layout as the root configuration.
# NOTE(review): if the 'lightrag' logger propagates to the root logger, each
# record will be printed twice (root handler + this handler) -- confirm and
# drop one of the two configurations if so.
logger = logging.getLogger('lightrag')
logger.setLevel(logging.INFO)
handler = logging.StreamHandler()
handler.setFormatter(logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s'))
logger.addHandler(handler)
|
|
|
|
|
|
|
|
|
def _markdown_output_path(output_dir, file_path, parse_method="auto"):
    """Return ``<output_dir>/<stem>/<parse_method>/<stem>.md`` for *file_path*.

    ``stem`` is the input file name without its extension. This is the same
    directory layout as the absolute path this script used to hard-code, but
    anchored at the ``output_dir`` that is actually passed to the parser.
    """
    stem = os.path.splitext(os.path.basename(file_path))[0]
    return os.path.join(output_dir, stem, parse_method, stem + ".md")


def _replace_textcircled(markdown_path):
    r"""Rewrite ``\textcircled`` as ``\enclose{circle}`` inside *markdown_path*.

    Circled digits written as ``\textcircled{1}`` do not render in Typora or
    PyCharm; ``\enclose{circle}{1}`` is the more widely supported form.

    Returns True when the file was modified, False when nothing needed
    replacing. Raises ``FileNotFoundError`` if the file does not exist.
    """
    with open(markdown_path, 'r', encoding='utf-8') as f:
        original = f.read()

    patched = original.replace(r'\textcircled', r'\enclose{circle}')
    if patched == original:
        # Nothing to fix: leave the file untouched.
        return False

    with open(markdown_path, 'w', encoding='utf-8') as f:
        f.write(patched)
    return True


async def main():
    r"""Parse one DOCX file with RAGAnything, index it, and patch the Markdown.

    Steps (all relative paths resolve against the current working directory):

    1. Delete and recreate the parser output directory and the RAG working
       directory, so every run starts from a clean state.
    2. Build a ``RAGAnything`` instance from the project's LLM, vision and
       embedding factory functions.
    3. Await ``process_document_complete`` on the input document.
    4. In the generated Markdown, replace ``\textcircled`` with
       ``\enclose{circle}``.

    Raises:
        FileNotFoundError: if the expected Markdown file is missing after
            processing.
    """
    # Document to process.
    file_path = "Docx/《动能定理》巩固练习.docx"
    working_dir = "../../Topic/MathOcr"
    output_dir = "../../output"
    parse_method = "auto"

    # Remove everything under the output directory, then recreate it empty.
    shutil.rmtree(output_dir, ignore_errors=True)
    os.makedirs(output_dir, exist_ok=True)

    # Same for the working directory that receives the generated index.
    shutil.rmtree(working_dir, ignore_errors=True)
    os.makedirs(working_dir, exist_ok=True)

    # Point the index at the working directory and enable all modal processors.
    config = RAGAnythingConfig(
        working_dir=working_dir,
        mineru_parse_method=parse_method,
        enable_image_processing=True,  # process images
        enable_table_processing=True,  # process tables
        enable_equation_processing=True,  # process equations
    )

    # Project-specific LLM function.
    llm_model_func = create_llm_model_func()
    # Project-specific vision-model function (built on top of the LLM function).
    vision_model_func = create_vision_model_func(llm_model_func)
    # Project-specific embedding function.
    embedding_func = create_embedding_func()

    rag = RAGAnything(
        config=config,
        llm_model_func=llm_model_func,
        vision_model_func=vision_model_func,
        embedding_func=embedding_func,
    )

    await rag.process_document_complete(
        file_path=file_path,
        output_dir=output_dir,
        parse_method=parse_method,
        # MinerU-specific keyword arguments:
        lang="ch",  # document-language hint (e.g. "ch", "en", "ja")
        # device="cuda:0",  # inference device: "cpu", "cuda", "cuda:0", "npu", "mps"
        # start_page=0,  # first page (0-based, PDF only)
        # end_page=10,  # last page (0-based, PDF only)
        formula=True,  # enable formula parsing
        table=True,  # enable table parsing
        backend="pipeline",  # parsing backend: "pipeline", "vlm-transformers", ...
        source="local",  # model source: "huggingface", "modelscope", "local"
        # Standard RAGAnything arguments:
        display_stats=True,  # print content statistics
        split_by_character=None,  # optional character to split text on
        doc_id=None,  # optional document id
    )

    # Patch the Markdown that the parser wrote under output_dir. The path is
    # derived from output_dir instead of a machine-specific absolute path, so
    # the script edits the file this run produced wherever it is executed.
    markdown_path = _markdown_output_path(output_dir, file_path, parse_method)
    _replace_textcircled(markdown_path)
|
|
|
|
|
|
|
|
|
# Script entry point: run the async pipeline only when executed directly,
# not when this module is imported.
if __name__ == "__main__":
    asyncio.run(main())
|