You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

93 lines
3.5 KiB

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

import asyncio
import os
import shutil
from raganything import RAGAnything, RAGAnythingConfig
from Util.RagUtil import create_llm_model_func, create_vision_model_func, create_embedding_func
import logging
# Logging is configured once, at import time, before any pipeline code runs.
# A single format string is shared by the root logger and the 'lightrag' logger.
_LOG_FORMAT = '%(asctime)s - %(name)s - %(levelname)s - %(message)s'

# Root logger: INFO level, timestamp / logger name / level / message.
logging.basicConfig(format=_LOG_FORMAT, level=logging.INFO)

# Finer-grained control over the 'lightrag' logger: it gets its own stream
# handler with the same format and is pinned to INFO.
# NOTE(review): if the 'lightrag' logger still propagates to the root logger,
# each of its records is printed twice (once per handler) - confirm.
handler = logging.StreamHandler()
handler.setFormatter(logging.Formatter(_LOG_FORMAT))
logger = logging.getLogger('lightrag')
logger.setLevel(logging.INFO)
logger.addHandler(handler)
def _fix_circled_numbers(markdown: str) -> str:
    r"""Return *markdown* with every ``\textcircled`` turned into ``\enclose{circle}``.

    The LaTeX produced for circled digits, e.g. ``\textcircled{1}``, does not
    render in Typora or PyCharm; ``\enclose{circle}{1}`` is the more widely
    supported spelling. Text without ``\textcircled`` is returned unchanged.
    """
    return markdown.replace(r'\textcircled', r'\enclose{circle}')


async def main() -> None:
    """Parse one .docx file with RAGAnything and patch the generated Markdown.

    Steps:
      1. Wipe and recreate the output directory and the RAG working directory.
      2. Build a RAGAnything instance with the project's LLM, vision and
         embedding functions.
      3. Run ``process_document_complete`` on the document.
      4. Rewrite circled-number LaTeX in the generated Markdown file in place.

    Raises:
        FileNotFoundError: if the expected Markdown file was not produced.
    """
    # Document to process.
    file_path = "Docx/《动能定理》巩固练习.docx"
    WORKING_DIR = "../../Topic/MathOcr"
    # Used for the config, the parse call and the output sub-directory name.
    parse_method = "auto"

    # Remove everything under the output directory.
    output_dir = "../../output"
    shutil.rmtree(output_dir, ignore_errors=True)
    os.makedirs(output_dir, exist_ok=True)

    # Remove everything under WORKING_DIR.
    shutil.rmtree(WORKING_DIR, ignore_errors=True)
    os.makedirs(WORKING_DIR, exist_ok=True)

    # Point the index at its final directory and enable all content processors.
    config = RAGAnythingConfig(
        working_dir=WORKING_DIR,
        mineru_parse_method=parse_method,
        enable_image_processing=True,  # process images
        enable_table_processing=True,  # process tables
        enable_equation_processing=True,  # process equations
    )

    # Project-specific LLM function.
    llm_model_func = create_llm_model_func()
    # Project-specific vision model function (built on top of the LLM function).
    vision_model_func = create_vision_model_func(llm_model_func)
    # Project-specific embedding function.
    embedding_func = create_embedding_func()

    rag = RAGAnything(
        config=config,
        llm_model_func=llm_model_func,
        vision_model_func=vision_model_func,
        embedding_func=embedding_func,
    )

    await rag.process_document_complete(
        file_path=file_path,
        output_dir=output_dir,
        parse_method=parse_method,
        # MinerU-specific keyword arguments
        lang="ch",  # document language hint (e.g. "ch", "en", "ja")
        # device="cuda:0",  # inference device: "cpu", "cuda", "cuda:0", "npu", "mps"
        # start_page=0,  # first page, 0-based (PDF only)
        # end_page=10,  # last page, 0-based (PDF only)
        formula=True,  # enable formula parsing
        table=True,  # enable table parsing
        backend="pipeline",  # parsing backend: "pipeline", "vlm-transformers", ...
        source="local",  # model source: "huggingface", "modelscope", "local"
        # RAGAnything standard arguments
        display_stats=True,  # print content statistics
        split_by_character=None,  # optional character to split text on
        doc_id=None,  # optional document id
    )

    # Patch the Markdown generated by MinerU: circled numbers written as
    # \textcircled{1} do not render in Typora or PyCharm, so they are rewritten
    # to the more compatible \enclose{circle}{1}.
    #
    # The path is derived from output_dir instead of being a hard-coded
    # absolute Windows path, so it always points at what this run produced.
    # Assumes the layout <output_dir>/<stem>/<parse_method>/<stem>.md, which is
    # the layout of the previously hard-coded path - confirm if the parser or
    # parse method changes.
    doc_stem = os.path.splitext(os.path.basename(file_path))[0]
    final_name = os.path.join(output_dir, doc_stem, parse_method, doc_stem + ".md")

    with open(final_name, 'r', encoding='utf-8') as f:
        content = f.read()
    content = _fix_circled_numbers(content)
    with open(final_name, 'w', encoding='utf-8') as f:
        f.write(content)
if __name__ == "__main__":
    # Script entry point: run the async pipeline to completion on a new event loop.
    asyncio.run(main())