'commit'

3 weeks ago · a0ab4deb3e
parent f5bb278a8d
commit a0ab4deb3e
2 changed files with 47 additions and 53 deletions
--- a/dsRag/Test/TestPandoc.py
+++ b/dsRag/Test/TestPandoc.py
@@ -1,36 +1,61 @@
 import re
 import subprocess
 import os
+import uuid

-def html_to_word_pandoc(html_file, output_file):
-    subprocess.run(['pandoc', html_file, '-o', output_file])
-
-# docx 转 markdown
-def docx_to_markdown_pandoc(docx_file, output_file):
-    subprocess.run(['pandoc', docx_file, '-f', 'docx', '-t', 'markdown', '-o', output_file])
+# 可执行文件路径
+mtef = r'D:\dsWork\dsProject\dsRag\mtef-go-3\mtef-go.exe'
+# docx文件路径
+docx_file = 'D:\dsWork\dsProject\dsRag\static\Txt\化学方程式_CHEMISTRY_1.docx'
+# 处理完成后的文件路径
+t1 = "c:/final.txt"


-docx_file = 'D:\dsWork\dsProject\dsRag\static\Txt\化学方程式_CHEMISTRY_1.docx'
-output_file = 'c:/output.md'
-docx_to_markdown_pandoc(docx_file, output_file)
-finalFile = "c:/new.txt"
-
-# 读取然后修改内容，输出到新的文件
-idx = 0
-with open(finalFile, 'w', encoding='utf-8') as f1:
-    with open(output_file, 'r', encoding='utf-8') as f:
+# 结合Pandoc和mtef-go的结果，合并成最终的输出文本
+def get_docx_content_by_pandoc(f, formula_list):
+    # StringBuilder结果
+    sb = []
+    # output_file 设置为临时目录下的uuid.md
+    temp_markdown = os.path.join(os.environ['TEMP'], uuid.uuid4().hex + '.md')
+    # 调用pandoc将docx文件转换成markdown
+    subprocess.run(['pandoc', f, '-f', 'docx', '-t', 'markdown', '-o', temp_markdown])
+    # 读取然后修改内容，输出到新的文件
+    idx = 0
+    with open(temp_markdown, 'r', encoding='utf-8') as f:
        for line in f:
            if line.strip():
                # 改进后的正则表达式，匹配更多格式的MathType公式
                if re.search(r'!\[]\(media/image\d+\.\w+\)', line) or \
                        re.search(r'\.!\[]\(media/image\d+\.\w+\)\.', line):
+                    sb.append(formula_list[idx])
                    idx = idx + 1
-                    f1.write("【MathType" + str(idx) + "】\n")
                else:
-                    f1.write(line.strip() + "\n")
-# 删除临时文件 output_file
-os.remove(output_file)
+                    sb.append(line.strip())
+
+    # 删除临时文件 output_file
+    os.remove(temp_markdown)
+    return sb
+
+
+# 获取MathType对应的Latex公式
+def get_MathType_by_mtef(docx_file):
+    res = []
+    output = os.path.join(os.environ['TEMP'], uuid.uuid4().hex + '.txt')
+    command = mtef + r" -w " + docx_file + " -o " + output
+    os.system(command)
+    with open(output, 'r', encoding='utf-8') as file:
+        for i, line in enumerate(file):
+            res.append(line.strip())
+    return res
+
+
+if __name__ == '__main__':
+    # 一、输出MathType对应的Latex公式
+    formula_list = get_MathType_by_mtef(docx_file)
+    # print(formula_list)
+
+    # 二、获取docx文件的内容
+    sb = get_docx_content_by_pandoc(docx_file, formula_list)

-# 输出finalFile
-with open(finalFile, 'r', encoding='utf-8') as f:
-    print(f.read())
+    for x in sb:
+        print(x)
--- a/dsRag/Test/TestReadSpire.py
+++ b/dsRag/Test/TestReadSpire.py
@@ -1,31 +0,0 @@
-# https://www.e-iceblue.cn/doc_python_other/python-insert-or-extract-ole-objects-in-word.html
-# pip install Spire.Doc
-from spire.doc import *
-
-# 创建Document类的对象
-doc = Document()
-# 加载Word文档
-doc.LoadFromFile(r'D:\dsWork\dsProject\dsRag\static\Txt\化学方程式_CHEMISTRY_1.docx')
-
-i = 1
-# 遍历Word文档的所有节
-for k in range(doc.Sections.Count):
-    sec = doc.Sections.get_Item(k)
-    # 遍历每个节的所有子对象
-    for j in range(sec.Body.ChildObjects.Count):
-        obj = sec.Body.ChildObjects.get_Item(j)
-        # 检查子对象是否为段落
-        if isinstance(obj, Paragraph):
-            par = obj if isinstance(obj, Paragraph) else None
-            # 遍历段落中的子对象
-            for m in range(par.ChildObjects.Count):
-                o = par.ChildObjects.get_Item(m)
-                # 检查子对象是否为OLE对象
-                if o.DocumentObjectType == DocumentObjectType.OleObject:
-                    ole = o if isinstance(o, DocOleObject) else None
-                    s = ole.ObjectType
-                    if s.startswith("Equation.DSMT4"):
-                        ext = ".mathtype"
-                        print("equation")
-
-doc.Close()