You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

71 lines
2.6 KiB

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

import os
import re
import subprocess
import tempfile
import uuid
def get_latex_list(docx_file):
    """Return the LaTeX text produced by mtef-go for a .docx file.

    Runs the bundled converter, expected at
    ``<parent of this module's directory>/mtef-go-3/mtef-go.exe``, as
    ``mtef-go.exe -w <docx_file> -o <temp file>`` and reads the temp file
    back as UTF-8.

    Args:
        docx_file: Path of the Word document to scan.

    Returns:
        A list of stripped output lines in file order (presumably one
        MathType formula per line -- confirm against mtef-go). An empty
        list is returned when the converter cannot be started or does not
        create the output file.
    """
    # The converter is located relative to the parent of this module's directory.
    module_dir = os.path.dirname(os.path.abspath(__file__))
    base_dir = os.path.dirname(module_dir)
    mtef = os.path.join(base_dir, 'mtef-go-3', 'mtef-go.exe')

    # Prefer %TEMP% as before, but do not crash when it is not defined.
    temp_dir = os.environ.get('TEMP') or tempfile.gettempdir()
    output = os.path.join(temp_dir, uuid.uuid4().hex + '.txt')

    try:
        try:
            # An argument list (no shell) keeps paths containing spaces or
            # shell metacharacters intact and avoids command injection.
            subprocess.run([mtef, '-w', docx_file, '-o', output])
        except OSError:
            # Converter missing or not executable: same "no formulas" result
            # as when it runs but produces nothing.
            return []

        # The converter did not produce its output file.
        if not os.path.exists(output):
            return []

        with open(output, 'r', encoding='utf-8') as file:
            return [line.strip() for line in file]
    finally:
        # Always remove the temporary output so repeated calls do not
        # accumulate files in the temp directory.
        if os.path.exists(output):
            os.remove(output)
def get_docx_content_by_pandoc(docx_file):
    """Convert a .docx file to text, merging pandoc and mtef-go output.

    The document is converted to markdown with ``pandoc`` into a temporary
    file, which is then read line by line:

    * blank lines and image ``height=...in"}`` continuation lines are dropped;
    * a line referencing a ``media/image*.wmf`` picture is replaced by the
      next entry of the formula list returned by ``get_latex_list``;
    * a line referencing a ``.png``/``.jpg``/``.jpeg`` picture is replaced
      by a numbered image placeholder;
    * every other line is kept, stripped of surrounding whitespace.

    Args:
        docx_file: Path of the Word document to convert.

    Returns:
        The merged text, one output line per kept input line, each
        terminated by ``"\\n"``.

    Raises:
        FileNotFoundError: If ``pandoc`` cannot be started or did not write
            the markdown file.
    """
    # Step 1: LaTeX formulas, in document order.
    formula_list = get_latex_list(docx_file)

    # Prefer %TEMP% as before, but do not crash when it is not defined.
    temp_dir = os.environ.get('TEMP') or tempfile.gettempdir()
    temp_markdown = os.path.join(temp_dir, uuid.uuid4().hex + '.md')

    image_prefix = "![](media/image"
    raster_exts = (".png", ".jpg", ".jpeg")

    parts = []   # output lines, joined once at the end
    wmf_idx = 0  # index of the next formula to consume
    img_idx = 0  # running number of raster images seen
    try:
        # Step 2: let pandoc convert the docx file to markdown.
        subprocess.run(['pandoc', docx_file, '-f', 'docx', '-t', 'markdown', '-o', temp_markdown])

        # Step 3: rewrite the markdown line by line.
        with open(temp_markdown, 'r', encoding='utf-8') as f:
            for raw_line in f:
                line = raw_line.strip()
                if not line:
                    continue
                # Skip the wrapped second line of an image attribute block,
                # e.g.  height="3.694634733158355in"}
                if line.startswith('height=') and line.endswith('in"}'):
                    continue

                has_image = image_prefix in line
                is_wmf = has_image and line.find(".wmf") > 0
                is_img = has_image and any(line.find(ext) > 0 for ext in raster_exts)

                if is_wmf:
                    # NOTE(review): the whole line is replaced by a single
                    # formula; a line holding several inline .wmf images
                    # would consume only one entry -- confirm this is intended.
                    if wmf_idx < len(formula_list):
                        parts.append(formula_list[wmf_idx])
                    else:
                        # Fewer formulas than .wmf images (e.g. the converter
                        # failed): keep the markdown line rather than raising
                        # IndexError.
                        parts.append(line)
                    wmf_idx += 1
                elif is_img:
                    # Example input:
                    #   ![](media/image3.png){width="3.1251607611548557in"
                    #   height="3.694634733158355in"}
                    img_idx += 1
                    # NOTE(review): the placeholder has no closing bracket;
                    # kept as-is in case consumers rely on it -- confirm.
                    parts.append("【图片" + str(img_idx))
                else:
                    parts.append(line)
    finally:
        # Remove the temporary markdown file even when processing failed.
        if os.path.exists(temp_markdown):
            os.remove(temp_markdown)

    return "".join(part + "\n" for part in parts)