"""Turn a .docx with MathType equations into a list of text lines.

Pandoc converts the document to markdown, the external ``mtef-go`` tool dumps
the MathType equations as text (LaTeX, per the tool's intended use), and the
two results are merged by position: the n-th markdown line that holds an
embedded-image placeholder is replaced by the n-th extracted formula.
"""
import os
import re
import subprocess
import tempfile
import uuid

# Path to the mtef-go executable.
mtef = r'D:\dsWork\dsProject\dsRag\mtef-go-3\mtef-go.exe'
# Path to the input docx. Raw string: the backslashes are path separators,
# not escape sequences.
docx_file = r'D:\dsWork\dsProject\dsRag\static\Txt\化学方程式_CHEMISTRY_1.docx'
# Path for the final processed file (not referenced by the code below).
t1 = "c:/final.txt"

# Markdown placeholder pandoc emits for an embedded image, e.g.
# ``![](media/image3.wmf)``; it may sit anywhere inside a line.
_IMAGE_PLACEHOLDER = re.compile(r'!\[]\(media/image\d+\.\w+\)')


def _temp_path(suffix):
    """Return a unique, not-yet-created file path in the temp directory."""
    return os.path.join(tempfile.gettempdir(), uuid.uuid4().hex + suffix)


def _remove_if_exists(path):
    """Delete *path*, ignoring the case where it was never created."""
    try:
        os.remove(path)
    except FileNotFoundError:
        # The external tool failed before writing its output; nothing to do.
        pass


def merge_formulas(lines, formula_list):
    """Merge markdown lines with the extracted formulas.

    Args:
        lines: iterable of markdown text lines (an open file works).
        formula_list: formulas in document order.

    Returns:
        A list of stripped, non-blank lines in which every line containing an
        image placeholder has been replaced by the next unused formula.

    If there are more placeholder lines than formulas, the surplus lines are
    kept unchanged instead of raising ``IndexError``.
    """
    merged = []
    formulas = iter(formula_list)
    for line in lines:
        text = line.strip()
        if not text:
            continue
        if _IMAGE_PLACEHOLDER.search(text):
            # NOTE(review): the whole line is replaced, so any text that
            # shares the line with the placeholder is dropped and several
            # placeholders on one line consume a single formula.
            merged.append(next(formulas, text))
        else:
            merged.append(text)
    return merged


def get_docx_content_by_pandoc(f, formula_list):
    """Convert the docx at path *f* to text lines with formulas merged in.

    Args:
        f: path of the .docx file to convert.
        formula_list: formulas in document order, as returned by
            ``get_MathType_by_mtef``.

    Returns:
        The merged list of lines (see ``merge_formulas``).

    Raises:
        subprocess.CalledProcessError: pandoc exited with a non-zero status.
        FileNotFoundError: the ``pandoc`` executable could not be found.
    """
    temp_markdown = _temp_path('.md')
    try:
        subprocess.run(
            ['pandoc', f, '-f', 'docx', '-t', 'markdown', '-o', temp_markdown],
            check=True,
        )
        with open(temp_markdown, 'r', encoding='utf-8') as markdown:
            return merge_formulas(markdown, formula_list)
    finally:
        # Clean up even when pandoc or the merge step fails.
        _remove_if_exists(temp_markdown)


def get_MathType_by_mtef(docx_file):
    """Extract the MathType formulas of *docx_file* using mtef-go.

    Args:
        docx_file: path of the .docx file to scan.

    Returns:
        One stripped string per line of the tool's output file. Blank lines
        are kept so that positions stay aligned with the tool's output.

    Raises:
        subprocess.CalledProcessError: mtef-go exited with a non-zero status.
        FileNotFoundError: the mtef-go executable could not be found.
    """
    output = _temp_path('.txt')
    try:
        # An argument list (no shell) keeps paths with spaces or shell
        # metacharacters intact.
        subprocess.run([mtef, '-w', docx_file, '-o', output], check=True)
        with open(output, 'r', encoding='utf-8') as file:
            return [line.strip() for line in file]
    finally:
        _remove_if_exists(output)


if __name__ == '__main__':
    # Step 1: extract the formulas for the MathType objects.
    formula_list = get_MathType_by_mtef(docx_file)

    # Step 2: get the docx content with the formulas merged in.
    content = get_docx_content_by_pandoc(docx_file, formula_list)

    for x in content:
        print(x)