You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.
import re
import subprocess
def html_to_word_pandoc(html_file: str, output_file: str) -> None:
    """Convert ``html_file`` to ``output_file`` by invoking the pandoc CLI.

    No explicit input/output format is passed, so the formats are left to
    pandoc (which is expected to infer them from the file extensions --
    e.g. give ``output_file`` a ``.docx`` suffix for Word output).

    Args:
        html_file: Path of the HTML document to convert.
        output_file: Path of the file pandoc should write.

    Raises:
        FileNotFoundError: If the ``pandoc`` executable is not on PATH.
        subprocess.CalledProcessError: If pandoc exits with a non-zero status.
    """
    # The command is passed as a list (no shell), so paths containing spaces
    # or non-ASCII characters need no quoting.  check=True surfaces a failed
    # conversion instead of silently leaving a missing or stale output file.
    subprocess.run(['pandoc', html_file, '-o', output_file], check=True)
def docx_to_markdown_pandoc(docx_file: str, output_file: str) -> None:
    """Convert a DOCX document to Markdown by invoking the pandoc CLI.

    Runs ``pandoc <docx_file> -f docx -t markdown -o <output_file>``.

    Args:
        docx_file: Path of the ``.docx`` document to convert.
        output_file: Path of the Markdown file pandoc should write.

    Raises:
        FileNotFoundError: If the ``pandoc`` executable is not on PATH.
        subprocess.CalledProcessError: If pandoc exits with a non-zero status.
    """
    command = [
        'pandoc', docx_file,
        '-f', 'docx',
        '-t', 'markdown',
        '-o', output_file,
    ]
    # check=True surfaces a failed conversion instead of silently leaving a
    # missing or stale output file for later steps to read.
    subprocess.run(command, check=True)
# Hard-coded conversion job: source DOCX -> intermediate Markdown -> final text.
# Raw string so the Windows backslashes are never treated as escape sequences.
docx_file = r'D:\dsWork\dsProject\dsRag\static\Txt\化学方程式_CHEMISTRY_1.docx'
output_file = 'c:/output.md'
docx_to_markdown_pandoc(docx_file, output_file)
finalFile = "c:/new.txt"

# Markdown image reference with empty alt text, e.g. "![](media/image12.wmf)".
# The script treats such a reference as an embedded MathType formula.
# Compiled once here instead of on every loop iteration.
mathtype_image = re.compile(r'!\[]\(media/image\d+\.\w+\)')

# Read the Markdown, rewrite its content, and write the result to a new file.
idx = 0  # running number of formula placeholders written so far
with open(finalFile, 'w', encoding='utf-8') as f1, \
        open(output_file, 'r', encoding='utf-8') as f:
    for line in f:
        if not line.strip():
            continue  # blank lines are dropped from the output
        if mathtype_image.search(line):
            # The whole line is replaced by one numbered placeholder, even if
            # it holds other text or more than one image reference.
            idx += 1
            f1.write("【MathType" + str(idx) + "】\n")
        else:
            f1.write(line.strip() + "\n")

# Print the final file.
with open(finalFile, 'r', encoding='utf-8') as f:
    print(f.read())