'commit'

2 weeks ago · a72a1a67cc
parent fbc6939207
commit a72a1a67cc
6 changed files with 24 additions and 25 deletions
--- a/dsLightRag/ShiTi/T3_DocxToMd.py
+++ b/dsLightRag/ShiTi/T3_DocxToMd.py
@ -56,25 +56,25 @@ async def main():
        embedding_func=embedding_func,
    )
    #  需要注意：注释掉将整理出来的文档内容插入到LightRAG的代码。
-    await rag.process_document_complete(
-        file_path=file_path,
-        output_dir=output_dir,
-        parse_method="auto",
-        # MinerU特殊参数 - 支持的所有kwargs：
-        lang="ch",  # 文档语言优化（如："ch", "en", "ja"）
-        # device="cuda:0",  # 推理设备："cpu", "cuda", "cuda:0", "npu", "mps"
-        # start_page=0,  # 起始页码（0为基准，适用于PDF）
-        # end_page=10,  # 结束页码（0为基准，适用于PDF）
-        formula=True,  # 启用公式解析
-        table=True,  # 启用表格解析
-        backend="pipeline",  # 解析后端："pipeline", "vlm-transformers"等
-        source="local",  # 模型源："huggingface", "modelscope", "local"
-
-        # RAGAnything标准参数
-        display_stats=True,  # 显示内容统计信息
-        split_by_character=None,  # 可选的文本分割字符
-        doc_id=None,  # 可选的文档ID
-    )
+    # await rag.process_document_complete(
+    #     file_path=file_path,
+    #     output_dir=output_dir,
+    #     parse_method="auto",
+    #     # MinerU特殊参数 - 支持的所有kwargs：
+    #     lang="ch",  # 文档语言优化（如："ch", "en", "ja"）
+    #     # device="cuda:0",  # 推理设备："cpu", "cuda", "cuda:0", "npu", "mps"
+    #     # start_page=0,  # 起始页码（0为基准，适用于PDF）
+    #     # end_page=10,  # 结束页码（0为基准，适用于PDF）
+    #     formula=True,  # 启用公式解析
+    #     table=True,  # 启用表格解析
+    #     backend="pipeline",  # 解析后端："pipeline", "vlm-transformers"等
+    #     source="local",  # 模型源："huggingface", "modelscope", "local"
+    #
+    #     # RAGAnything标准参数
+    #     display_stats=True,  # 显示内容统计信息
+    #     split_by_character=None,  # 可选的文本分割字符
+    #     doc_id=None,  # 可选的文档ID
+    # )

    """
    修正一下MinerU生成的Latex中，如果是数字加圆圈的样式  \textcircled{1}，
@ -91,14 +91,12 @@ async def main():

        # 按 【题型】 分隔开
        content = content.replace("\n\n", "\n")
-        questions = content.split('【题型】')
+        # 从头开始找，找到第一个【题型】
+        content = content[content.find('【题型】'):]

+        questions = content.split('【题型】')
        idx = 0
        for q in questions:
-            # 干掉 【题型】前面的文档标题，比如: # 《动能定理》巩固练习
-            if idx == 0 and q != "":
-                idx = idx + 1
-                continue
            # 干掉空行
            if q.strip() == "" or q == '\n':
                continue
@ -117,6 +115,7 @@ async def main():
        with open(path + r'/测试.md', 'w', encoding='utf-8') as f:
            f.write(formatted_content)
    # 将path目录下的images目录，整体拷贝到 output下
+    if os.path.exists(output_dir + r'/images'):
        shutil.rmtree(output_dir + r'/images')
    shutil.copytree(path + r'/images', output_dir + r'/images')
    # 删除path目录下
--- a/dsLightRag/output/images/3c8ac458bd79159775b913e34a837d0e9c91577fe0a74b8ae68835b0ded1d414.jpg
+++ b/dsLightRag/output/images/3c8ac458bd79159775b913e34a837d0e9c91577fe0a74b8ae68835b0ded1d414.jpg
--- a/dsLightRag/output/images/5a12714ef5828c67f0701aefae5809ef5ad7f3026cfb57a9e3d4d64f54b9be05.jpg
+++ b/dsLightRag/output/images/5a12714ef5828c67f0701aefae5809ef5ad7f3026cfb57a9e3d4d64f54b9be05.jpg
--- a/dsLightRag/output/images/7e9856155509f95de8dd3fa506f755e02a7e1eda34f34a3619585680dc6766eb.jpg
+++ b/dsLightRag/output/images/7e9856155509f95de8dd3fa506f755e02a7e1eda34f34a3619585680dc6766eb.jpg
--- a/dsLightRag/output/images/8c28b5a46563ff2020d41cce8b32fb69eb424f3618432d20fad2d63f0d2fedc8.jpg
+++ b/dsLightRag/output/images/8c28b5a46563ff2020d41cce8b32fb69eb424f3618432d20fad2d63f0d2fedc8.jpg
--- a/dsLightRag/output/images/c8b3db3caf82268ada8444b624155fee10ffc24e897a05987301b22effd94d92.jpg
+++ b/dsLightRag/output/images/c8b3db3caf82268ada8444b624155fee10ffc24e897a05987301b22effd94d92.jpg