You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

37 lines
1.3 KiB

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

import re
import subprocess
import os
def html_to_word_pandoc(html_file, output_file):
    """Convert an HTML file to another document format by invoking pandoc.

    Runs ``pandoc <html_file> -o <output_file>``. No ``-f``/``-t`` options are
    passed, so the choice of input and output formats is left to pandoc.

    Args:
        html_file: Path of the HTML file to convert.
        output_file: Path of the file pandoc should write.

    Raises:
        subprocess.CalledProcessError: If pandoc exits with a non-zero status.
        FileNotFoundError: If the ``pandoc`` executable cannot be found.
    """
    # check=True: surface a failed conversion here instead of letting callers
    # continue with a missing or stale output file.
    subprocess.run(['pandoc', html_file, '-o', output_file], check=True)
def docx_to_markdown_pandoc(docx_file, output_file):
    """Convert a .docx file to Markdown by invoking pandoc.

    Runs ``pandoc <docx_file> -f docx -t markdown -o <output_file>``.

    Args:
        docx_file: Path of the Word document to convert.
        output_file: Path of the Markdown file pandoc should write.

    Raises:
        subprocess.CalledProcessError: If pandoc exits with a non-zero status.
        FileNotFoundError: If the ``pandoc`` executable cannot be found.
    """
    command = ['pandoc', docx_file, '-f', 'docx', '-t', 'markdown', '-o', output_file]
    # check=True: surface a failed conversion here instead of letting callers
    # go on to read an output file that was never produced.
    subprocess.run(command, check=True)
# Raw string: the Windows path contains sequences such as "\d", "\s" and "\T"
# that are invalid escapes in a normal string literal (they trigger a warning
# on current Python versions). The resulting value is unchanged.
docx_file = r'D:\dsWork\dsProject\dsRag\static\Txt\化学方程式_CHEMISTRY_1.docx'
output_file = 'c:/output.md'
docx_to_markdown_pandoc(docx_file, output_file)
finalFile = "c:/new.txt"

# Markdown image reference of the form "![](media/image<N>.<ext>)"; the script
# treats such images as MathType formulas. One unanchored search suffices: the
# former second pattern (the same reference wrapped in literal dots) could
# only match on lines this pattern already matches.
_MATHTYPE_IMAGE = re.compile(r'!\[]\(media/image\d+\.\w+\)')

# Read the converted Markdown, rewrite its content, and write it to a new file:
# blank lines are dropped, every line containing an image reference becomes a
# numbered placeholder, and all other lines are written stripped.
idx = 0
with open(finalFile, 'w', encoding='utf-8') as f1, \
        open(output_file, 'r', encoding='utf-8') as f:
    for line in f:
        text = line.strip()
        if not text:
            continue
        if _MATHTYPE_IMAGE.search(line):
            idx += 1
            # NOTE(review): the placeholder has no closing "】"; kept exactly
            # as in the original -- confirm whether that is intended.
            f1.write("【MathType" + str(idx) + "\n")
        else:
            f1.write(text + "\n")

# Delete the temporary file output_file (both handles are closed by now).
os.remove(output_file)

# Print the contents of finalFile.
with open(finalFile, 'r', encoding='utf-8') as f:
    print(f.read())