You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

128 lines
5.1 KiB

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

import asyncio
import os
import shutil
from raganything import RAGAnything, RAGAnythingConfig
from Util.LightRagUtil import create_llm_model_func, create_embedding_func, create_vision_model_func, \
format_exam_content
import logging
# Logging setup, executed once when the module is loaded.
# Root logger: INFO and above, laid out as "time - logger - level - message".
logging.basicConfig(
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)

# Finer-grained control for the 'lightrag' logger: it gets its own stream
# handler that uses the same record layout as the root configuration.
# NOTE(review): if the 'lightrag' logger also propagates to the root logger,
# each record would be printed twice -- confirm against the library's setup.
handler = logging.StreamHandler()
handler.setFormatter(
    logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
)
logger = logging.getLogger('lightrag')
logger.addHandler(handler)
logger.setLevel(logging.INFO)
def _format_question(segment: str, question_types: list) -> str:
    """Normalise one question segment and put the '【题型】' marker back in front.

    If ``segment`` starts with one of ``question_types``, every space in the
    segment is removed and the type name is forced onto a line of its own.
    A segment that matches no known type is returned unchanged apart from the
    re-attached marker.

    Args:
        segment: The text that followed one '【题型】' marker.
        question_types: Known question-type names, checked in order.

    Returns:
        The marker followed by the normalised segment.
    """
    for type_name in question_types:
        if not segment.startswith(type_name):
            continue
        segment = segment.replace(" ", "")
        # Slice rather than index so a segment consisting only of the type
        # name cannot raise IndexError, and rebuild from the prefix so that
        # later mentions of the type name inside the question stay untouched.
        remainder = segment[len(type_name):]
        if not remainder.startswith("\n"):
            segment = type_name + "\n" + remainder
        break
    return "【题型】" + segment


def _split_questions(content: str, question_types: list) -> list:
    """Split markdown text into formatted questions, one per '【题型】' marker.

    Args:
        content: Full markdown text of the parsed document.
        question_types: Known question-type names, passed to
            ``_format_question``.

    Returns:
        The formatted questions in document order. Text before the first
        marker (the document title) and blank segments are dropped.
    """
    content = content.replace("\n\n", "\n")
    segments = content.split("【题型】")
    # segments[0] is whatever precedes the first marker (the title, or an empty
    # string when the file starts with a marker); it is never a question, so
    # drop it unconditionally instead of skipping "the first non-empty" piece.
    return [
        _format_question(segment, question_types)
        for segment in segments[1:]
        if segment.strip()
    ]


async def main():
    """Prepare RAGAnything for one exam document and print its questions.

    Resets the working (index) directory, builds the RAGAnything instance,
    reads the markdown that a previous parsing run left under
    ``<output_dir>/<document name>/auto``, prints every question found in it
    and copies the extracted images into ``<output_dir>/images``.

    Raises:
        FileNotFoundError: If the parsed markdown file does not exist.
    """
    # Document to process and the directories involved.
    file_path = "Docx/《动能定理》巩固练习.docx"
    working_dir = "../Topic/WuLi"
    output_dir = "../output"
    # Document name without directory or extension.
    doc_name = os.path.splitext(os.path.basename(file_path))[0]

    # Make sure the output directory exists (its contents are left alone).
    os.makedirs(output_dir, exist_ok=True)
    # Start from an empty working directory.
    shutil.rmtree(working_dir, ignore_errors=True)
    os.makedirs(working_dir, exist_ok=True)

    # Index is generated into working_dir.
    config = RAGAnythingConfig(
        working_dir=working_dir,
        mineru_parse_method="auto",
        enable_image_processing=True,  # process images
        enable_table_processing=True,  # process tables
        enable_equation_processing=True,  # process equations
    )
    # Custom LLM, vision-model and embedding callables.
    llm_model_func = create_llm_model_func()
    vision_model_func = create_vision_model_func(llm_model_func)
    embedding_func = create_embedding_func()
    rag = RAGAnything(
        config=config,
        llm_model_func=llm_model_func,
        vision_model_func=vision_model_func,
        embedding_func=embedding_func,
    )

    # Parsing step, currently disabled. Original author's note: take care to
    # comment out the code that inserts the extracted document content into
    # LightRAG.
    # await rag.process_document_complete(
    #     file_path=file_path,
    #     output_dir=output_dir,
    #     parse_method="auto",
    #     # MinerU-specific kwargs
    #     lang="ch",  # document language hint (e.g. "ch", "en", "ja")
    #     # device="cuda:0",  # inference device: "cpu", "cuda", "cuda:0", "npu", "mps"
    #     # start_page=0,  # first page, 0-based, PDF only
    #     # end_page=10,  # last page, 0-based, PDF only
    #     formula=True,  # enable formula parsing
    #     table=True,  # enable table parsing
    #     backend="pipeline",  # parsing backend: "pipeline", "vlm-transformers", ...
    #     source="local",  # model source: "huggingface", "modelscope", "local"
    #
    #     # Standard RAGAnything parameters
    #     display_stats=True,  # show content statistics
    #     split_by_character=None,  # optional character to split text on
    #     doc_id=None,  # optional document id
    # )

    parsed_dir = os.path.join(output_dir, doc_name, "auto")
    markdown_path = os.path.join(parsed_dir, doc_name + ".md")
    with open(markdown_path, 'r', encoding='utf-8') as f:
        content = f.read()

    # MinerU emits circled numbers as \textcircled{1}, which does not render
    # in Typora or PyCharm; switch to the more widely supported
    # \enclose{circle}{1}.
    content = content.replace(r'\textcircled', r'\enclose{circle}')

    # Split the document on the question-type marker and print each question.
    question_types = ["不定项选择", "单选题", "多选题", "填空题", "判断题", "完型填空题", "计算题"]
    for question in _split_questions(content, question_types):
        print(question)
        print("\n")

    # Mirror the parsed document's images directory into output_dir/images.
    images_src = os.path.join(parsed_dir, "images")
    images_dst = os.path.join(output_dir, "images")
    # On a first run there is no previous copy to remove.
    shutil.rmtree(images_dst, ignore_errors=True)
    if os.path.isdir(images_src):
        shutil.copytree(images_src, images_dst)
    else:
        logging.getLogger(__name__).warning(
            "No images directory found at %s; nothing copied", images_src
        )
    # The parsed directory is kept; to clean it up afterwards:
    # shutil.rmtree(parsed_dir)
# Start the pipeline only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    asyncio.run(main())