main
HuangHai 2 weeks ago
parent fbc6939207
commit a72a1a67cc

@ -56,25 +56,25 @@ async def main():
embedding_func=embedding_func, embedding_func=embedding_func,
) )
# 需要注意注释掉将整理出来的文档内容插入到LightRAG的代码。 # 需要注意注释掉将整理出来的文档内容插入到LightRAG的代码。
await rag.process_document_complete( # await rag.process_document_complete(
file_path=file_path, # file_path=file_path,
output_dir=output_dir, # output_dir=output_dir,
parse_method="auto", # parse_method="auto",
# MinerU特殊参数 - 支持的所有kwargs # # MinerU特殊参数 - 支持的所有kwargs
lang="ch", # 文档语言优化(如:"ch", "en", "ja" # lang="ch", # 文档语言优化(如:"ch", "en", "ja"
# device="cuda:0", # 推理设备:"cpu", "cuda", "cuda:0", "npu", "mps" # # device="cuda:0", # 推理设备:"cpu", "cuda", "cuda:0", "npu", "mps"
# start_page=0, # 起始页码0为基准适用于PDF # # start_page=0, # 起始页码0为基准适用于PDF
# end_page=10, # 结束页码0为基准适用于PDF # # end_page=10, # 结束页码0为基准适用于PDF
formula=True, # 启用公式解析 # formula=True, # 启用公式解析
table=True, # 启用表格解析 # table=True, # 启用表格解析
backend="pipeline", # 解析后端:"pipeline", "vlm-transformers"等 # backend="pipeline", # 解析后端:"pipeline", "vlm-transformers"等
source="local", # 模型源:"huggingface", "modelscope", "local" # source="local", # 模型源:"huggingface", "modelscope", "local"
#
# RAGAnything标准参数 # # RAGAnything标准参数
display_stats=True, # 显示内容统计信息 # display_stats=True, # 显示内容统计信息
split_by_character=None, # 可选的文本分割字符 # split_by_character=None, # 可选的文本分割字符
doc_id=None, # 可选的文档ID # doc_id=None, # 可选的文档ID
) # )
""" """
修正一下MinerU生成的Latex中如果是数字加圆圈的样式 \textcircled{1} 修正一下MinerU生成的Latex中如果是数字加圆圈的样式 \textcircled{1}
@ -91,14 +91,12 @@ async def main():
# 按 【题型】 分隔开 # 按 【题型】 分隔开
content = content.replace("\n\n", "\n") content = content.replace("\n\n", "\n")
questions = content.split('【题型】') # 从头开始找,找到第一个【题型】
content = content[content.find('【题型】'):]
questions = content.split('【题型】')
idx = 0 idx = 0
for q in questions: for q in questions:
# 干掉 【题型】前面的文档标题,比如: # 《动能定理》巩固练习
if idx == 0 and q != "":
idx = idx + 1
continue
# 干掉空行 # 干掉空行
if q.strip() == "" or q == '\n': if q.strip() == "" or q == '\n':
continue continue
@ -117,7 +115,8 @@ async def main():
with open(path + r'/测试.md', 'w', encoding='utf-8') as f: with open(path + r'/测试.md', 'w', encoding='utf-8') as f:
f.write(formatted_content) f.write(formatted_content)
# 将path目录下的images目录整体拷贝到 output下 # 将path目录下的images目录整体拷贝到 output下
shutil.rmtree(output_dir + r'/images') if os.path.exists(output_dir + r'/images'):
shutil.rmtree(output_dir + r'/images')
shutil.copytree(path + r'/images', output_dir + r'/images') shutil.copytree(path + r'/images', output_dir + r'/images')
# 删除path目录下 # 删除path目录下
# shutil.rmtree(path) # shutil.rmtree(path)

Binary file not shown.

After

Width:  |  Height:  |  Size: 3.4 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 3.2 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 7.8 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 3.1 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 4.1 KiB

Loading…
Cancel
Save