You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

61 lines
2.0 KiB

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

import os
import re
import subprocess
import tempfile
import uuid
# Path to the mtef-go executable (extracts MathType formulas from a .docx).
mtef = r'D:\dsWork\dsProject\dsRag\mtef-go-3\mtef-go.exe'
# Path to the .docx file to process.
# Raw string: the plain literal contained "\d", "\s", "\T" and "\化", which are
# invalid escape sequences (a SyntaxWarning on Python 3.12+). The resulting
# value is unchanged.
docx_file = r'D:\dsWork\dsProject\dsRag\static\Txt\化学方程式_CHEMISTRY_1.docx'
# Path intended for the final processed output.
# NOTE(review): not referenced by the code visible in this file - confirm
# whether it is still needed.
t1 = "c:/final.txt"
# Matches the Markdown image reference that stands in for an embedded formula
# in pandoc's output, e.g. "![](media/image3.wmf)".
_MATHTYPE_IMAGE_RE = re.compile(r'!\[]\(media/image\d+\.\w+\)')


def get_docx_content_by_pandoc(f, formula_list):
    """Merge pandoc's Markdown rendering of a .docx with extracted formulas.

    The document is converted to Markdown with pandoc. Every non-blank line
    of that Markdown is then either copied (stripped of surrounding
    whitespace) or, when it contains an image reference of the form
    ``![](media/imageN.ext)``, replaced as a whole by the next unused entry
    of ``formula_list``.

    Args:
        f: Path of the .docx file to convert.
        formula_list: Formula strings, in the order the corresponding image
            references appear in the document.

    Returns:
        A list of strings: the merged, non-blank lines in document order.

    Raises:
        subprocess.CalledProcessError: If pandoc exits with a non-zero status.
        FileNotFoundError: If the pandoc executable cannot be found.
        IndexError: If the Markdown contains more formula lines than
            ``formula_list`` has entries.
    """
    merged = []
    # A private temporary directory is removed even when conversion or
    # parsing fails, and does not depend on the TEMP environment variable.
    with tempfile.TemporaryDirectory() as work_dir:
        temp_markdown = os.path.join(work_dir, 'pandoc_output.md')
        # check=True reports a failed conversion directly instead of letting
        # it show up later as a missing output file.
        subprocess.run(
            ['pandoc', f, '-f', 'docx', '-t', 'markdown', '-o', temp_markdown],
            check=True,
        )
        idx = 0
        with open(temp_markdown, 'r', encoding='utf-8') as markdown_file:
            for line in markdown_file:
                text = line.strip()
                if not text:
                    continue
                if _MATHTYPE_IMAGE_RE.search(line):
                    # NOTE: the whole line is replaced and exactly one formula
                    # is consumed, even if the line holds several image
                    # references or additional text.
                    if idx >= len(formula_list):
                        raise IndexError(
                            'formula_list has %d entries but the document '
                            'contains more formula images'
                            % len(formula_list)
                        )
                    merged.append(formula_list[idx])
                    idx += 1
                else:
                    merged.append(text)
    return merged
def get_MathType_by_mtef(docx_file):
    """Run mtef-go on a .docx file and return the lines it writes.

    The executable named by the module-level ``mtef`` is invoked with
    ``-w <docx_file> -o <output>``; the output file is then read back as
    UTF-8 text.

    Args:
        docx_file: Path of the .docx file to process.

    Returns:
        A list with one whitespace-stripped string per line of the tool's
        output file (presumably one formula per line - confirm against the
        tool's documentation).

    Raises:
        FileNotFoundError: If the executable cannot be started or it did not
            produce the output file.
    """
    # A private temporary directory guarantees the output file is removed
    # afterwards (the original never deleted it) and avoids depending on the
    # TEMP environment variable.
    with tempfile.TemporaryDirectory() as work_dir:
        output = os.path.join(work_dir, 'formulas.txt')
        # An argument list (no shell) keeps paths containing spaces or shell
        # metacharacters intact. As before, the exit status is not checked;
        # a failed run surfaces when the output file cannot be opened.
        subprocess.run([mtef, '-w', docx_file, '-o', output])
        with open(output, 'r', encoding='utf-8') as file:
            return [line.strip() for line in file]
if __name__ == '__main__':
    # Step 1: extract the formulas embedded in the document.
    latex_formulas = get_MathType_by_mtef(docx_file)
    # Step 2: convert the document body and splice the formulas in.
    merged_lines = get_docx_content_by_pandoc(docx_file, latex_formulas)
    # Step 3: emit the merged result, one entry per line.
    for entry in merged_lines:
        print(entry)