'commit'

2 weeks ago · 186639de89
parent a72a1a67cc
commit 186639de89
4 changed files with 60 additions and 127 deletions
--- a/dsLightRag/ShiTi/T1_UploadImage.py
+++ b/dsLightRag/ShiTi/T1_UploadImage.py
--- a/dsLightRag/ShiTi/T2_ImageToMd.py
+++ b/dsLightRag/ShiTi/T2_ImageToMd.py
@@ -29,5 +29,5 @@ completion = client.chat.completions.create(
 ocr_text = completion.choices[0].message.content

 # 二、调用格式化函数处理内容
-format_exam_content(raw_text=ocr_text, output_path="../output/数学OCR整理后的结果.md")
+format_exam_content(raw_text=ocr_text, output_path="./output/数学OCR整理后的结果.md")
 print("保存成功！")
--- a/dsLightRag/ST3_DocxToMd.py
+++ b/dsLightRag/ST3_DocxToMd.py
@@ -0,0 +1,59 @@
+import asyncio
+
+from Util.DocxUtil import get_docx_content_by_pandoc
+
+
+async def main():
+    # Path of the file to process
+    file_path = "ShiTi/Docx/《动能定理》巩固练习.docx"
+
+    get_docx_content_by_pandoc(file_path)
+
+    # """
+    # Fix the LaTeX generated by MinerU: the circled-number style \textcircled{1}
+    # cannot be displayed in Typora or PyCharm, so switch to the more compatible \enclose{circle}{1}
+    # """
+    # path = r'../output/' + fileName + '/auto'
+    # finalName = path + r'/' + fileName + '.md'
+    # formatted_content = ''
+    # with open(finalName, 'r', encoding='utf-8') as f:
+    #     content = f.read()
+    #     content = content.replace(r'\textcircled', r'\enclose{circle}')
+    #     # Split the questions on the 【题型】 (question type) marker
+    #     question_types = ["不定项选择", "单选题", "多选题", "填空题", "判断题", "完型填空题", "计算题"]
+    #
+    #     # Separate on the 【题型】 marker
+    #     content = content.replace("\n\n", "\n")
+    #     # Search from the beginning and locate the first 【题型】
+    #     content = content[content.find('【题型】'):]
+    #
+    #     questions = content.split('【题型】')
+    #     idx = 0
+    #     for q in questions:
+    #         # Drop empty entries
+    #         if q.strip() == "" or q == '\n':
+    #             continue
+    #         # If q starts with one of the strings in question_types, emit a newline right after that string
+    #         for x in question_types:
+    #             if q.startswith(x):
+    #                 q = q.replace(" ", "")
+    #                 # Check whether the first character after x in q is a newline \n; if not, add one
+    #                 if q[q.index(x) + len(x)] != '\n':
+    #                     q = q.replace(x, x + '\n')
+    #                 break
+    #
+    #         q = '【题型】' + q
+    #         formatted_content = formatted_content + q + '\n'
+    #
+    #     with open(path + r'/测试.md', 'w', encoding='utf-8') as f:
+    #         f.write(formatted_content)
+    # # Copy the images directory under path, as a whole, into output
+    # if os.path.exists(output_dir + r'/images'):
+    #     shutil.rmtree(output_dir + r'/images')
+    # shutil.copytree(path + r'/images', output_dir + r'/images')
+    # # Delete the path directory
+    # # shutil.rmtree(path)
+
+
+if __name__ == "__main__":
+    asyncio.run(main())
--- a/dsLightRag/ShiTi/T3_DocxToMd.py
+++ b/dsLightRag/ShiTi/T3_DocxToMd.py
@@ -1,126 +0,0 @@
-import asyncio
-import os
-import shutil
-from raganything import RAGAnything, RAGAnythingConfig
-from Util.LightRagUtil import create_llm_model_func, create_embedding_func, create_vision_model_func, \
-    format_exam_content
-import logging
-
-# 在程序开始时添加以下配置
-logging.basicConfig(
-    level=logging.INFO,  # 设置日志级别为INFO
-    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
-)
-
-# 更详细地控制日志输出
-logger = logging.getLogger('lightrag')
-logger.setLevel(logging.INFO)
-handler = logging.StreamHandler()
-handler.setFormatter(logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s'))
-logger.addHandler(handler)
-
-
-async def main():
-    # 要处理的文件路径
-    file_path = "Docx/《动能定理》巩固练习.docx"
-    WORKING_DIR = "../Topic/WuLi"
-    fileName = file_path.split('/')[-1].replace(".docx", "").replace(".doc", "")
-
-    # 删除output目录下的所有文件
-    output_dir = "../output"
-    if not os.path.exists(output_dir):
-        os.makedirs(output_dir, exist_ok=True)
-
-    # 删除WORKING_DIR下的所有文件
-    shutil.rmtree(WORKING_DIR, ignore_errors=True)
-    os.makedirs(WORKING_DIR, exist_ok=True)
-
-    # 指定最终的索引生成目录，启动索引生成
-    config = RAGAnythingConfig(
-        working_dir=WORKING_DIR,
-        mineru_parse_method="auto",
-        enable_image_processing=True,  # 处理图片
-        enable_table_processing=True,  # 处理表格
-        enable_equation_processing=True,  # 处理公式
-    )
-    # 自定义的大模型函数
-    llm_model_func = create_llm_model_func()
-    # 自定义的可视模型函数
-    vision_model_func = create_vision_model_func(llm_model_func)
-    # 自定义的嵌入函数
-    embedding_func = create_embedding_func()
-    rag = RAGAnything(
-        config=config,
-        llm_model_func=llm_model_func,
-        vision_model_func=vision_model_func,
-        embedding_func=embedding_func,
-    )
-    #  需要注意：注释掉将整理出来的文档内容插入到LightRAG的代码。
-    # await rag.process_document_complete(
-    #     file_path=file_path,
-    #     output_dir=output_dir,
-    #     parse_method="auto",
-    #     # MinerU特殊参数 - 支持的所有kwargs：
-    #     lang="ch",  # 文档语言优化（如："ch", "en", "ja"）
-    #     # device="cuda:0",  # 推理设备："cpu", "cuda", "cuda:0", "npu", "mps"
-    #     # start_page=0,  # 起始页码（0为基准，适用于PDF）
-    #     # end_page=10,  # 结束页码（0为基准，适用于PDF）
-    #     formula=True,  # 启用公式解析
-    #     table=True,  # 启用表格解析
-    #     backend="pipeline",  # 解析后端："pipeline", "vlm-transformers"等
-    #     source="local",  # 模型源："huggingface", "modelscope", "local"
-    #
-    #     # RAGAnything标准参数
-    #     display_stats=True,  # 显示内容统计信息
-    #     split_by_character=None,  # 可选的文本分割字符
-    #     doc_id=None,  # 可选的文档ID
-    # )
-
-    """
-    修正一下MinerU生成的Latex中，如果是数字加圆圈的样式  \textcircled{1}，
-    无法在Typora或者PyCharm中显示的问题,改成兼容性更强的 \enclose{circle}{1}
-    """
-    path = r'../output/' + fileName + '/auto'
-    finalName = path + r'/' + fileName + '.md'
-    formatted_content = ''
-    with open(finalName, 'r', encoding='utf-8') as f:
-        content = f.read()
-        content = content.replace(r'\textcircled', r'\enclose{circle}')
-        # 按【题型】分割试题
-        question_types = ["不定项选择", "单选题", "多选题", "填空题", "判断题", "完型填空题", "计算题"]
-
-        # 按 【题型】 分隔开
-        content = content.replace("\n\n", "\n")
-        # 从头开始找，找到第一个【题型】
-        content = content[content.find('【题型】'):]
-
-        questions = content.split('【题型】')
-        idx = 0
-        for q in questions:
-            # 干掉空行
-            if q.strip() == "" or q == '\n':
-                continue
-            # 如果q是以   question_types 中某个字符开头的，则在完成这个字符串后，换行输出
-            for x in question_types:
-                if q.startswith(x):
-                    q = q.replace(" ", "")
-                    # q的x后面第一个字符是不是换行符\n,如果 不是，则添加一个\n
-                    if q[q.index(x) + len(x)] != '\n':
-                        q = q.replace(x, x + '\n')
-                    break
-
-            q = '【题型】' + q
-            formatted_content = formatted_content + q + '\n'
-
-        with open(path + r'/测试.md', 'w', encoding='utf-8') as f:
-            f.write(formatted_content)
-    # 将path目录下的images目录，整体拷贝到 output下
-    if os.path.exists(output_dir + r'/images'):
-        shutil.rmtree(output_dir + r'/images')
-    shutil.copytree(path + r'/images', output_dir + r'/images')
-    # 删除path目录下
-    # shutil.rmtree(path)
-
-
-if __name__ == "__main__":
-    asyncio.run(main())