# dsProject/dsLightRag/ShiTi/T3_DocxToMd.py

import asyncio
import os
import shutil
from raganything import RAGAnything, RAGAnythingConfig
from Util.LightRagUtil import create_llm_model_func, create_embedding_func, create_vision_model_func, \
    format_exam_content
import logging

# Root logging configuration, applied once when the program starts.
logging.basicConfig(
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    level=logging.INFO,  # root logger level: INFO
)

# Finer-grained control over the 'lightrag' logger: it gets its own stream
# handler with the same record layout, and its level is set to INFO.
handler = logging.StreamHandler()
handler.setFormatter(
    logging.Formatter(fmt='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
)
logger = logging.getLogger('lightrag')
logger.addHandler(handler)
logger.setLevel(logging.INFO)


def _split_questions(content, question_types):
    """Split Markdown text into one block per question.

    A new block starts at every line that begins with ``【题型】 <type>`` for
    one of ``question_types``; that header line is wrapped in ``**`` (bold).
    Lines before the first header are discarded. Each block is stripped of
    leading/trailing whitespace.

    Args:
        content: Full Markdown text.
        question_types: Iterable of question-type names to recognise.

    Returns:
        List of question blocks in document order (empty if no header found).
    """
    # str.startswith accepts a tuple, so build the candidate prefixes once.
    prefixes = tuple(f"【题型】 {t}" for t in question_types)

    questions = []
    current = None  # lines of the block being collected; None until the first header
    for line in content.split('\n'):
        if line.startswith(prefixes):
            if current is not None:
                questions.append("\n".join(current).strip())
            current = ["**" + line + "**"]  # bold the header line
        elif current is not None:
            current.append(line)

    if current is not None:
        questions.append("\n".join(current).strip())
    return questions


async def main():
    """Parse one exam .docx with RAGAnything and post-process the Markdown.

    Steps:
      1. Ensure the output directory exists and recreate the working
         directory from scratch.
      2. Run ``RAGAnything.process_document_complete`` on the document.
      3. Read the generated Markdown, replace ``\\textcircled`` with
         ``\\enclose{circle}``, split it into questions by type header and
         write the result to ``测试.md`` next to the generated Markdown.
      4. Copy the generated ``images`` directory to the output directory,
         replacing any previous copy.

    Raises:
        FileNotFoundError: If the expected Markdown file or the generated
            ``images`` directory does not exist after parsing.
    """
    # Document to process.
    file_path = "Docx/《动能定理》巩固练习.docx"
    working_dir = "../Topic/WuLi"
    output_dir = "../output"
    # Base name without its extension; used below to locate the parser output.
    file_stem = os.path.splitext(os.path.basename(file_path))[0]

    # Make sure the output directory exists (existing content is kept).
    os.makedirs(output_dir, exist_ok=True)

    # Start from an empty working directory.
    shutil.rmtree(working_dir, ignore_errors=True)
    os.makedirs(working_dir, exist_ok=True)

    # Point the index at the working directory and enable all modal processors.
    config = RAGAnythingConfig(
        working_dir=working_dir,
        mineru_parse_method="auto",
        enable_image_processing=True,  # process images
        enable_table_processing=True,  # process tables
        enable_equation_processing=True,  # process equations
    )
    # Custom LLM function.
    llm_model_func = create_llm_model_func()
    # Custom vision-model function, built on top of the LLM function.
    vision_model_func = create_vision_model_func(llm_model_func)
    # Custom embedding function.
    embedding_func = create_embedding_func()
    rag = RAGAnything(
        config=config,
        llm_model_func=llm_model_func,
        vision_model_func=vision_model_func,
        embedding_func=embedding_func,
    )
    # NOTE: per the original author, the library code that inserts the parsed
    # content into LightRAG is expected to be commented out for this workflow.
    await rag.process_document_complete(
        file_path=file_path,
        output_dir=output_dir,
        parse_method="auto",
        # MinerU-specific kwargs. Other options noted by the original author:
        #   device="cuda:0"  (inference device: "cpu", "cuda", "cuda:0", "npu", "mps")
        #   start_page=0     (0-based first page, PDF only)
        #   end_page=10      (0-based last page, PDF only)
        lang="ch",  # document language hint (e.g. "ch", "en", "ja")
        formula=True,  # enable formula parsing
        table=True,  # enable table parsing
        backend="pipeline",  # parsing backend: "pipeline", "vlm-transformers", ...
        source="local",  # model source: "huggingface", "modelscope", "local"

        # Standard RAGAnything parameters.
        display_stats=True,  # show content statistics
        split_by_character=None,  # optional split character
        doc_id=None,  # optional document id
    )

    # MinerU emits circled numbers as \textcircled{1}, which Typora and PyCharm
    # cannot render; rewrite them to the more widely supported
    # \enclose{circle}{1}.
    parsed_dir = os.path.join(output_dir, file_stem, 'auto')
    md_path = os.path.join(parsed_dir, file_stem + '.md')
    with open(md_path, 'r', encoding='utf-8') as src:
        content = src.read()
    content = content.replace(r'\textcircled', r'\enclose{circle}')

    # Split the exam into questions on the "【题型】 <type>" header lines.
    question_types = ["不定项选择", "单选题", "多选题", "填空题", "判断题", "完型填空题", "计算题"]
    formatted_content = "\n\n".join(_split_questions(content, question_types))

    with open(os.path.join(parsed_dir, '测试.md'), 'w', encoding='utf-8') as dst:
        dst.write(formatted_content)

    # Replace output/images with the images produced for this document.
    # ignore_errors=True: the destination does not exist on the first run.
    images_dst = os.path.join(output_dir, 'images')
    shutil.rmtree(images_dst, ignore_errors=True)
    shutil.copytree(os.path.join(parsed_dir, 'images'), images_dst)
    # The parsed directory is intentionally kept; call shutil.rmtree(parsed_dir)
    # here if it should be cleaned up.


# Script entry point: build the pipeline coroutine and run it to completion.
if __name__ == "__main__":
    pipeline = main()
    asyncio.run(pipeline)