main
HuangHai 2 weeks ago
parent a72a1a67cc
commit 186639de89

@ -29,5 +29,5 @@ completion = client.chat.completions.create(
# Take the message text of the first choice in the chat-completion response.
ocr_text = completion.choices[0].message.content
# Step 2: call the formatting function to process the content.
# NOTE(review): the two calls below differ only in output_path ("../output" vs "./output");
# this looks like the before/after pair of a diff - confirm whether both writes are intended.
format_exam_content(raw_text=ocr_text, output_path="../output/数学OCR整理后的结果.md")
format_exam_content(raw_text=ocr_text, output_path="./output/数学OCR整理后的结果.md")
print("保存成功!")

@ -0,0 +1,59 @@
import asyncio
from Util.DocxUtil import get_docx_content_by_pandoc
async def main():
    """Pass the hard-coded sample .docx path to ``get_docx_content_by_pandoc``.

    The path is relative, so it is resolved against the current working
    directory of the process.
    """
    # Path of the document to process.
    file_path = "ShiTi/Docx/《动能定理》巩固练习.docx"
    # NOTE(review): the return value is discarded here - confirm that the
    # helper persists or prints its result itself.
    get_docx_content_by_pandoc(file_path)


if __name__ == "__main__":
    asyncio.run(main())

@ -1,126 +0,0 @@
import asyncio
import os
import shutil
from raganything import RAGAnything, RAGAnythingConfig
from Util.LightRagUtil import create_llm_model_func, create_embedding_func, create_vision_model_func, \
format_exam_content
import logging
# Logging configuration applied once at program start.
LOG_FORMAT = '%(asctime)s - %(name)s - %(levelname)s - %(message)s'

logging.basicConfig(
    level=logging.INFO,  # root logger emits INFO and above
    format=LOG_FORMAT,
)

# Finer-grained control over the 'lightrag' logger: it gets its own stream
# handler with the same format.
logger = logging.getLogger('lightrag')
logger.setLevel(logging.INFO)
handler = logging.StreamHandler()
handler.setFormatter(logging.Formatter(LOG_FORMAT))
logger.addHandler(handler)
# The root logger already has a handler from basicConfig(); without this,
# every 'lightrag' record would be printed twice (once per handler).
logger.propagate = False
def format_question_sections(content):
    r"""Normalise exam markdown into one section per question-type marker.

    Steps, in order:
      1. Replace ``\textcircled`` with ``\enclose{circle}``.
      2. Collapse each blank line (two consecutive newlines) into one newline.
      3. Drop everything before the first question-type marker and split the
         rest on that marker.
      4. For every non-blank section that starts with a known question type,
         remove all spaces and make sure the type name is followed by a
         newline.

    Args:
        content: Raw markdown text.

    Returns:
        The re-assembled text, each section prefixed with the marker and
        followed by a newline. An empty string is returned when the text
        contains no marker at all.
    """
    marker = '【题型】'
    question_types = ("不定项选择", "单选题", "多选题", "填空题", "判断题", "完型填空题", "计算题")

    content = content.replace(r'\textcircled', r'\enclose{circle}')
    content = content.replace("\n\n", "\n")

    start = content.find(marker)
    if start == -1:
        # str.find() returns -1 when the marker is absent; slicing with it
        # would keep only the last character, so bail out explicitly.
        return ''

    sections = []
    for section in content[start:].split(marker):
        # Skip empty / whitespace-only pieces (e.g. the part before the first marker).
        if not section.strip():
            continue
        for type_name in question_types:
            if section.startswith(type_name):
                section = section.replace(" ", "")
                rest = section[len(type_name):]
                # Only the leading heading gets a newline; later mentions of
                # the type name inside the question text stay untouched, and
                # a section consisting solely of the type name is handled too.
                if not rest.startswith('\n'):
                    section = type_name + '\n' + rest
                break
        sections.append(marker + section + '\n')
    return ''.join(sections)


async def main():
    """Prepare the working directories, build the RAGAnything instance and
    post-process the markdown found under the output directory.

    All paths are hard-coded and relative to the current working directory.
    The document-parsing call itself is currently disabled (see the
    commented-out block below), so the markdown file read afterwards must
    already exist.
    """
    # Path of the document to process.
    file_path = "Docx/《动能定理》巩固练习.docx"
    working_dir = "../Topic/WuLi"
    # File name without directory and extension.
    file_stem = os.path.splitext(os.path.basename(file_path))[0]

    # Make sure the output directory exists (nothing is deleted here).
    output_dir = "../output"
    os.makedirs(output_dir, exist_ok=True)

    # Start from an empty working directory.
    shutil.rmtree(working_dir, ignore_errors=True)
    os.makedirs(working_dir, exist_ok=True)

    # Index directory and processing switches.
    config = RAGAnythingConfig(
        working_dir=working_dir,
        mineru_parse_method="auto",
        enable_image_processing=True,  # process images
        enable_table_processing=True,  # process tables
        enable_equation_processing=True,  # process equations
    )
    # Custom LLM function.
    llm_model_func = create_llm_model_func()
    # Custom vision-model function.
    vision_model_func = create_vision_model_func(llm_model_func)
    # Custom embedding function.
    embedding_func = create_embedding_func()

    # Only referenced by the disabled parse step below.
    rag = RAGAnything(
        config=config,
        llm_model_func=llm_model_func,
        vision_model_func=vision_model_func,
        embedding_func=embedding_func,
    )

    # Disabled parse step. Original author's note: take care to comment out
    # the code that inserts the extracted document content into LightRAG.
    # await rag.process_document_complete(
    #     file_path=file_path,
    #     output_dir=output_dir,
    #     parse_method="auto",
    #     # MinerU-specific kwargs
    #     lang="ch",  # document language hint (e.g. "ch", "en", "ja")
    #     # device="cuda:0",  # inference device: "cpu", "cuda", "cuda:0", "npu", "mps"
    #     # start_page=0,  # zero-based first page (PDF only)
    #     # end_page=10,  # zero-based last page (PDF only)
    #     formula=True,  # enable formula parsing
    #     table=True,  # enable table parsing
    #     backend="pipeline",  # parsing backend: "pipeline", "vlm-transformers", ...
    #     source="local",  # model source: "huggingface", "modelscope", "local"
    #
    #     # Standard RAGAnything parameters
    #     display_stats=True,  # show content statistics
    #     split_by_character=None,  # optional split character
    #     doc_id=None,  # optional document id
    # )

    # Per the original note: MinerU emits circled digits as \textcircled{1},
    # which Typora / PyCharm cannot render; format_question_sections() rewrites
    # them to the more widely supported \enclose{circle}{1}.
    path = r'../output/' + file_stem + '/auto'
    final_name = path + r'/' + file_stem + '.md'
    with open(final_name, 'r', encoding='utf-8') as f:
        content = f.read()

    formatted_content = format_question_sections(content)

    with open(path + r'/测试.md', 'w', encoding='utf-8') as f:
        f.write(formatted_content)

    # Copy the whole images directory from `path` into the output directory,
    # replacing any previous copy.
    if os.path.exists(output_dir + r'/images'):
        shutil.rmtree(output_dir + r'/images')
    shutil.copytree(path + r'/images', output_dir + r'/images')
    # Optionally remove the intermediate directory:
    # shutil.rmtree(path)


if __name__ == "__main__":
    asyncio.run(main())
Loading…
Cancel
Save