diff --git a/dsLightRag/ShiTi/T3_DocxToMd.py b/dsLightRag/ShiTi/T3_DocxToMd.py index 1a04227d..1da4c67d 100644 --- a/dsLightRag/ShiTi/T3_DocxToMd.py +++ b/dsLightRag/ShiTi/T3_DocxToMd.py @@ -56,25 +56,25 @@ async def main(): embedding_func=embedding_func, ) # 需要注意:注释掉将整理出来的文档内容插入到LightRAG的代码。 - await rag.process_document_complete( - file_path=file_path, - output_dir=output_dir, - parse_method="auto", - # MinerU特殊参数 - 支持的所有kwargs: - lang="ch", # 文档语言优化(如:"ch", "en", "ja") - # device="cuda:0", # 推理设备:"cpu", "cuda", "cuda:0", "npu", "mps" - # start_page=0, # 起始页码(0为基准,适用于PDF) - # end_page=10, # 结束页码(0为基准,适用于PDF) - formula=True, # 启用公式解析 - table=True, # 启用表格解析 - backend="pipeline", # 解析后端:"pipeline", "vlm-transformers"等 - source="local", # 模型源:"huggingface", "modelscope", "local" - - # RAGAnything标准参数 - display_stats=True, # 显示内容统计信息 - split_by_character=None, # 可选的文本分割字符 - doc_id=None, # 可选的文档ID - ) + # await rag.process_document_complete( + # file_path=file_path, + # output_dir=output_dir, + # parse_method="auto", + # # MinerU特殊参数 - 支持的所有kwargs: + # lang="ch", # 文档语言优化(如:"ch", "en", "ja") + # # device="cuda:0", # 推理设备:"cpu", "cuda", "cuda:0", "npu", "mps" + # # start_page=0, # 起始页码(0为基准,适用于PDF) + # # end_page=10, # 结束页码(0为基准,适用于PDF) + # formula=True, # 启用公式解析 + # table=True, # 启用表格解析 + # backend="pipeline", # 解析后端:"pipeline", "vlm-transformers"等 + # source="local", # 模型源:"huggingface", "modelscope", "local" + # + # # RAGAnything标准参数 + # display_stats=True, # 显示内容统计信息 + # split_by_character=None, # 可选的文本分割字符 + # doc_id=None, # 可选的文档ID + # ) """ 修正一下MinerU生成的Latex中,如果是数字加圆圈的样式 \textcircled{1}, @@ -91,14 +91,12 @@ async def main(): # 按 【题型】 分隔开 content = content.replace("\n\n", "\n") - questions = content.split('【题型】') + # 从头开始找,找到第一个【题型】 + content = content[content.find('【题型】'):] + questions = content.split('【题型】') idx = 0 for q in questions: - # 干掉 【题型】前面的文档标题,比如: # 《动能定理》巩固练习 - if idx == 0 and q != "": - idx = idx + 1 - continue # 干掉空行 if q.strip() == "" or q == '\n': continue @@ -117,7 +115,8 @@ async def main(): with open(path + r'/测试.md', 'w', encoding='utf-8') as f: f.write(formatted_content) # 将path目录下的images目录,整体拷贝到 output下 - shutil.rmtree(output_dir + r'/images') + if os.path.exists(output_dir + r'/images'): + shutil.rmtree(output_dir + r'/images') shutil.copytree(path + r'/images', output_dir + r'/images') # 删除path目录下 # shutil.rmtree(path) diff --git a/dsLightRag/output/images/3c8ac458bd79159775b913e34a837d0e9c91577fe0a74b8ae68835b0ded1d414.jpg b/dsLightRag/output/images/3c8ac458bd79159775b913e34a837d0e9c91577fe0a74b8ae68835b0ded1d414.jpg new file mode 100644 index 00000000..7f2e807c Binary files /dev/null and b/dsLightRag/output/images/3c8ac458bd79159775b913e34a837d0e9c91577fe0a74b8ae68835b0ded1d414.jpg differ diff --git a/dsLightRag/output/images/5a12714ef5828c67f0701aefae5809ef5ad7f3026cfb57a9e3d4d64f54b9be05.jpg b/dsLightRag/output/images/5a12714ef5828c67f0701aefae5809ef5ad7f3026cfb57a9e3d4d64f54b9be05.jpg new file mode 100644 index 00000000..8041435a Binary files /dev/null and b/dsLightRag/output/images/5a12714ef5828c67f0701aefae5809ef5ad7f3026cfb57a9e3d4d64f54b9be05.jpg differ diff --git a/dsLightRag/output/images/7e9856155509f95de8dd3fa506f755e02a7e1eda34f34a3619585680dc6766eb.jpg b/dsLightRag/output/images/7e9856155509f95de8dd3fa506f755e02a7e1eda34f34a3619585680dc6766eb.jpg new file mode 100644 index 00000000..0e0b8995 Binary files /dev/null and b/dsLightRag/output/images/7e9856155509f95de8dd3fa506f755e02a7e1eda34f34a3619585680dc6766eb.jpg differ diff --git a/dsLightRag/output/images/8c28b5a46563ff2020d41cce8b32fb69eb424f3618432d20fad2d63f0d2fedc8.jpg b/dsLightRag/output/images/8c28b5a46563ff2020d41cce8b32fb69eb424f3618432d20fad2d63f0d2fedc8.jpg new file mode 100644 index 00000000..c13f8a0c Binary files /dev/null and b/dsLightRag/output/images/8c28b5a46563ff2020d41cce8b32fb69eb424f3618432d20fad2d63f0d2fedc8.jpg differ diff --git a/dsLightRag/output/images/c8b3db3caf82268ada8444b624155fee10ffc24e897a05987301b22effd94d92.jpg b/dsLightRag/output/images/c8b3db3caf82268ada8444b624155fee10ffc24e897a05987301b22effd94d92.jpg new file mode 100644 index 00000000..afa10948 Binary files /dev/null and b/dsLightRag/output/images/c8b3db3caf82268ada8444b624155fee10ffc24e897a05987301b22effd94d92.jpg differ