diff --git a/dsLightRag/ShiTi/T1_UploadImage.py b/dsLightRag/ST1_UploadImage.py similarity index 100% rename from dsLightRag/ShiTi/T1_UploadImage.py rename to dsLightRag/ST1_UploadImage.py diff --git a/dsLightRag/ShiTi/T2_ImageToMd.py b/dsLightRag/ST2_ImageToMd.py similarity index 87% rename from dsLightRag/ShiTi/T2_ImageToMd.py rename to dsLightRag/ST2_ImageToMd.py index 2066762a..9f838ff3 100644 --- a/dsLightRag/ShiTi/T2_ImageToMd.py +++ b/dsLightRag/ST2_ImageToMd.py @@ -29,5 +29,5 @@ completion = client.chat.completions.create( ocr_text = completion.choices[0].message.content # 二、调用格式化函数处理内容 -format_exam_content(raw_text=ocr_text, output_path="../output/数学OCR整理后的结果.md") +format_exam_content(raw_text=ocr_text, output_path="./output/数学OCR整理后的结果.md") print("保存成功!") diff --git a/dsLightRag/ST3_DocxToMd.py b/dsLightRag/ST3_DocxToMd.py new file mode 100644 index 00000000..9556d635 --- /dev/null +++ b/dsLightRag/ST3_DocxToMd.py @@ -0,0 +1,59 @@ +import asyncio + +from Util.DocxUtil import get_docx_content_by_pandoc + + +async def main(): + # 要处理的文件路径 + file_path = "ShiTi/Docx/《动能定理》巩固练习.docx" + + get_docx_content_by_pandoc(file_path) + + # """ + # 修正一下MinerU生成的Latex中,如果是数字加圆圈的样式 \textcircled{1}, + # 无法在Typora或者PyCharm中显示的问题,改成兼容性更强的 \enclose{circle}{1} + # """ + # path = r'../output/' + fileName + '/auto' + # finalName = path + r'/' + fileName + '.md' + # formatted_content = '' + # with open(finalName, 'r', encoding='utf-8') as f: + # content = f.read() + # content = content.replace(r'\textcircled', r'\enclose{circle}') + # # 按【题型】分割试题 + # question_types = ["不定项选择", "单选题", "多选题", "填空题", "判断题", "完型填空题", "计算题"] + # + # # 按 【题型】 分隔开 + # content = content.replace("\n\n", "\n") + # # 从头开始找,找到第一个【题型】 + # content = content[content.find('【题型】'):] + # + # questions = content.split('【题型】') + # idx = 0 + # for q in questions: + # # 干掉空行 + # if q.strip() == "" or q == '\n': + # continue + # # 如果q是以 question_types 中某个字符开头的,则在完成这个字符串后,换行输出 + # for x in question_types: + # if q.startswith(x): + # q = q.replace(" ", "") + # # q的x后面第一个字符是不是换行符\n,如果 不是,则添加一个\n + # if q[q.index(x) + len(x)] != '\n': + # q = q.replace(x, x + '\n') + # break + # + # q = '【题型】' + q + # formatted_content = formatted_content + q + '\n' + # + # with open(path + r'/测试.md', 'w', encoding='utf-8') as f: + # f.write(formatted_content) + # # 将path目录下的images目录,整体拷贝到 output下 + # if os.path.exists(output_dir + r'/images'): + # shutil.rmtree(output_dir + r'/images') + # shutil.copytree(path + r'/images', output_dir + r'/images') + # # 删除path目录下 + # # shutil.rmtree(path) + + +if __name__ == "__main__": + asyncio.run(main()) diff --git a/dsLightRag/ShiTi/T3_DocxToMd.py b/dsLightRag/ShiTi/T3_DocxToMd.py deleted file mode 100644 index 1da4c67d..00000000 --- a/dsLightRag/ShiTi/T3_DocxToMd.py +++ /dev/null @@ -1,126 +0,0 @@ -import asyncio -import os -import shutil -from raganything import RAGAnything, RAGAnythingConfig -from Util.LightRagUtil import create_llm_model_func, create_embedding_func, create_vision_model_func, \ - format_exam_content -import logging - -# 在程序开始时添加以下配置 -logging.basicConfig( - level=logging.INFO, # 设置日志级别为INFO - format='%(asctime)s - %(name)s - %(levelname)s - %(message)s' -) - -# 更详细地控制日志输出 -logger = logging.getLogger('lightrag') -logger.setLevel(logging.INFO) -handler = logging.StreamHandler() -handler.setFormatter(logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')) -logger.addHandler(handler) - - -async def main(): - # 要处理的文件路径 - file_path = "Docx/《动能定理》巩固练习.docx" - WORKING_DIR = "../Topic/WuLi" - fileName = file_path.split('/')[-1].replace(".docx", "").replace(".doc", "") - - # 删除output目录下的所有文件 - output_dir = "../output" - if not os.path.exists(output_dir): - os.makedirs(output_dir, exist_ok=True) - - # 删除WORKING_DIR下的所有文件 - shutil.rmtree(WORKING_DIR, ignore_errors=True) - os.makedirs(WORKING_DIR, exist_ok=True) - - # 指定最终的索引生成目录,启动索引生成 - config = RAGAnythingConfig( - working_dir=WORKING_DIR, - mineru_parse_method="auto", - enable_image_processing=True, # 处理图片 - enable_table_processing=True, # 处理表格 - enable_equation_processing=True, # 处理公式 - ) - # 自定义的大模型函数 - llm_model_func = create_llm_model_func() - # 自定义的可视模型函数 - vision_model_func = create_vision_model_func(llm_model_func) - # 自定义的嵌入函数 - embedding_func = create_embedding_func() - rag = RAGAnything( - config=config, - llm_model_func=llm_model_func, - vision_model_func=vision_model_func, - embedding_func=embedding_func, - ) - # 需要注意:注释掉将整理出来的文档内容插入到LightRAG的代码。 - # await rag.process_document_complete( - # file_path=file_path, - # output_dir=output_dir, - # parse_method="auto", - # # MinerU特殊参数 - 支持的所有kwargs: - # lang="ch", # 文档语言优化(如:"ch", "en", "ja") - # # device="cuda:0", # 推理设备:"cpu", "cuda", "cuda:0", "npu", "mps" - # # start_page=0, # 起始页码(0为基准,适用于PDF) - # # end_page=10, # 结束页码(0为基准,适用于PDF) - # formula=True, # 启用公式解析 - # table=True, # 启用表格解析 - # backend="pipeline", # 解析后端:"pipeline", "vlm-transformers"等 - # source="local", # 模型源:"huggingface", "modelscope", "local" - # - # # RAGAnything标准参数 - # display_stats=True, # 显示内容统计信息 - # split_by_character=None, # 可选的文本分割字符 - # doc_id=None, # 可选的文档ID - # ) - - """ - 修正一下MinerU生成的Latex中,如果是数字加圆圈的样式 \textcircled{1}, - 无法在Typora或者PyCharm中显示的问题,改成兼容性更强的 \enclose{circle}{1} - """ - path = r'../output/' + fileName + '/auto' - finalName = path + r'/' + fileName + '.md' - formatted_content = '' - with open(finalName, 'r', encoding='utf-8') as f: - content = f.read() - content = content.replace(r'\textcircled', r'\enclose{circle}') - # 按【题型】分割试题 - question_types = ["不定项选择", "单选题", "多选题", "填空题", "判断题", "完型填空题", "计算题"] - - # 按 【题型】 分隔开 - content = content.replace("\n\n", "\n") - # 从头开始找,找到第一个【题型】 - content = content[content.find('【题型】'):] - - questions = content.split('【题型】') - idx = 0 - for q in questions: - # 干掉空行 - if q.strip() == "" or q == '\n': - continue - # 如果q是以 question_types 中某个字符开头的,则在完成这个字符串后,换行输出 - for x in question_types: - if q.startswith(x): - q = q.replace(" ", "") - # q的x后面第一个字符是不是换行符\n,如果 不是,则添加一个\n - if q[q.index(x) + len(x)] != '\n': - q = q.replace(x, x + '\n') - break - - q = '【题型】' + q - formatted_content = formatted_content + q + '\n' - - with open(path + r'/测试.md', 'w', encoding='utf-8') as f: - f.write(formatted_content) - # 将path目录下的images目录,整体拷贝到 output下 - if os.path.exists(output_dir + r'/images'): - shutil.rmtree(output_dir + r'/images') - shutil.copytree(path + r'/images', output_dir + r'/images') - # 删除path目录下 - # shutil.rmtree(path) - - -if __name__ == "__main__": - asyncio.run(main())