|
|
|
@ -56,25 +56,25 @@ async def main():
|
|
|
|
|
embedding_func=embedding_func,
|
|
|
|
|
)
|
|
|
|
|
# 需要注意:注释掉将整理出来的文档内容插入到LightRAG的代码。
|
|
|
|
|
await rag.process_document_complete(
|
|
|
|
|
file_path=file_path,
|
|
|
|
|
output_dir=output_dir,
|
|
|
|
|
parse_method="auto",
|
|
|
|
|
# MinerU特殊参数 - 支持的所有kwargs:
|
|
|
|
|
lang="ch", # 文档语言优化(如:"ch", "en", "ja")
|
|
|
|
|
# device="cuda:0", # 推理设备:"cpu", "cuda", "cuda:0", "npu", "mps"
|
|
|
|
|
# start_page=0, # 起始页码(0为基准,适用于PDF)
|
|
|
|
|
# end_page=10, # 结束页码(0为基准,适用于PDF)
|
|
|
|
|
formula=True, # 启用公式解析
|
|
|
|
|
table=True, # 启用表格解析
|
|
|
|
|
backend="pipeline", # 解析后端:"pipeline", "vlm-transformers"等
|
|
|
|
|
source="local", # 模型源:"huggingface", "modelscope", "local"
|
|
|
|
|
|
|
|
|
|
# RAGAnything标准参数
|
|
|
|
|
display_stats=True, # 显示内容统计信息
|
|
|
|
|
split_by_character=None, # 可选的文本分割字符
|
|
|
|
|
doc_id=None, # 可选的文档ID
|
|
|
|
|
)
|
|
|
|
|
# await rag.process_document_complete(
|
|
|
|
|
# file_path=file_path,
|
|
|
|
|
# output_dir=output_dir,
|
|
|
|
|
# parse_method="auto",
|
|
|
|
|
# # MinerU特殊参数 - 支持的所有kwargs:
|
|
|
|
|
# lang="ch", # 文档语言优化(如:"ch", "en", "ja")
|
|
|
|
|
# # device="cuda:0", # 推理设备:"cpu", "cuda", "cuda:0", "npu", "mps"
|
|
|
|
|
# # start_page=0, # 起始页码(0为基准,适用于PDF)
|
|
|
|
|
# # end_page=10, # 结束页码(0为基准,适用于PDF)
|
|
|
|
|
# formula=True, # 启用公式解析
|
|
|
|
|
# table=True, # 启用表格解析
|
|
|
|
|
# backend="pipeline", # 解析后端:"pipeline", "vlm-transformers"等
|
|
|
|
|
# source="local", # 模型源:"huggingface", "modelscope", "local"
|
|
|
|
|
#
|
|
|
|
|
# # RAGAnything标准参数
|
|
|
|
|
# display_stats=True, # 显示内容统计信息
|
|
|
|
|
# split_by_character=None, # 可选的文本分割字符
|
|
|
|
|
# doc_id=None, # 可选的文档ID
|
|
|
|
|
# )
|
|
|
|
|
|
|
|
|
|
"""
|
|
|
|
|
修正一下MinerU生成的Latex中,如果是数字加圆圈的样式 \textcircled{1},
|
|
|
|
@ -91,14 +91,12 @@ async def main():
|
|
|
|
|
|
|
|
|
|
# 按 【题型】 分隔开
|
|
|
|
|
content = content.replace("\n\n", "\n")
|
|
|
|
|
questions = content.split('【题型】')
|
|
|
|
|
# 从头开始找,找到第一个【题型】
|
|
|
|
|
content = content[content.find('【题型】'):]
|
|
|
|
|
|
|
|
|
|
questions = content.split('【题型】')
|
|
|
|
|
idx = 0
|
|
|
|
|
for q in questions:
|
|
|
|
|
# 干掉 【题型】前面的文档标题,比如: # 《动能定理》巩固练习
|
|
|
|
|
if idx == 0 and q != "":
|
|
|
|
|
idx = idx + 1
|
|
|
|
|
continue
|
|
|
|
|
# 干掉空行
|
|
|
|
|
if q.strip() == "" or q == '\n':
|
|
|
|
|
continue
|
|
|
|
@ -117,6 +115,7 @@ async def main():
|
|
|
|
|
with open(path + r'/测试.md', 'w', encoding='utf-8') as f:
|
|
|
|
|
f.write(formatted_content)
|
|
|
|
|
# 将path目录下的images目录,整体拷贝到 output下
|
|
|
|
|
if os.path.exists(output_dir + r'/images'):
|
|
|
|
|
shutil.rmtree(output_dir + r'/images')
|
|
|
|
|
shutil.copytree(path + r'/images', output_dir + r'/images')
|
|
|
|
|
# 删除path目录下
|
|
|
|
|