'commit'

4 weeks ago · 558afe48f8
parent 5efa0d7564
commit 558afe48f8
8 changed files with 55 additions and 53 deletions
--- a/dsRag/Doc/MathType/MathType-win-zh-7.8.2.441.exe
+++ b/dsRag/Doc/MathType/MathType-win-zh-7.8.2.441.exe
--- a/dsRag/Doc/MathType/MathType_v7.x_Patch.exe
+++ b/dsRag/Doc/MathType/MathType_v7.x_Patch.exe
--- a/dsRag/Doc/MathType/录入化学方程式的办法.md
+++ b/dsRag/Doc/MathType/录入化学方程式的办法.md
--- a/dsRag/Doc/MathType/激活MathType.docx
+++ b/dsRag/Doc/MathType/激活MathType.docx
--- a/dsRag/Test/TestPandoc.py
+++ b/dsRag/Test/TestPandoc.py
@ -1,59 +1,12 @@
-import re
-import subprocess
-import os
-import uuid
-
-# 可执行文件路径
-mtef = r'D:\dsWork\dsProject\dsRag\mtef-go-3\mtef-go.exe'
-# docx文件路径
-docx_file = 'D:\dsWork\dsProject\dsRag\static\Txt\化学方程式_CHEMISTRY_1.docx'
-
-
-# 结合Pandoc和mtef-go的结果，合并成最终的输出文本
-def get_docx_content_by_pandoc(f, formula_list):
-    # StringBuilder结果
-    sb = []
-    # output_file 设置为临时目录下的uuid.md
-    temp_markdown = os.path.join(os.environ['TEMP'], uuid.uuid4().hex + '.md')
-    # 调用pandoc将docx文件转换成markdown
-    subprocess.run(['pandoc', f, '-f', 'docx', '-t', 'markdown', '-o', temp_markdown])
-    # 读取然后修改内容，输出到新的文件
-    idx = 0
-    with open(temp_markdown, 'r', encoding='utf-8') as f:
-        for line in f:
-            if line.strip():
-                # 改进后的正则表达式，匹配更多格式的MathType公式
-                if re.search(r'!\[]\(media/image\d+\.\w+\)', line) or \
-                        re.search(r'\.!\[]\(media/image\d+\.\w+\)\.', line):
-                    sb.append(formula_list[idx])
-                    idx = idx + 1
-                else:
-                    sb.append(line.strip())
-
-    # 删除临时文件 output_file
-    os.remove(temp_markdown)
-    return sb
-
-
-# 获取MathType对应的Latex公式
-def getLatexList(docx_file):
-    res = []
-    output = os.path.join(os.environ['TEMP'], uuid.uuid4().hex + '.txt')
-    command = mtef + r" -w " + docx_file + " -o " + output
-    os.system(command)
-    with open(output, 'r', encoding='utf-8') as file:
-        for i, line in enumerate(file):
-            res.append(line.strip())
-    return res
-
+from Util.DocxUtil import *

 if __name__ == '__main__':
-    # 一、获取Latex公式列表
-    formula_list = getLatexList(docx_file)
+    # Path of the docx file to convert. Raw string: the backslashes are
+    # literal path separators, not escape sequences.
+    docx_file = r'D:\dsWork\dsProject\dsRag\static\Txt\化学方程式_CHEMISTRY_1.docx'

-    # 二、整合最终的拼接完的文本
-    sb = get_docx_content_by_pandoc(docx_file, formula_list)
+    # Build the final list of text lines with the formulas merged in
+    sb = get_docx_content_by_pandoc(docx_file)

-    # 三、输出
+    # 输出
    for x in sb:
        print(x)
--- a/dsRag/Util/DocxUtil.py
+++ b/dsRag/Util/DocxUtil.py
@ -0,0 +1,49 @@
+import os
+import re
+import subprocess
+import tempfile
+import uuid
+
+def get_latex_list(docx_file):
+    """Return the LaTeX text of the MathType formulas found in a .docx file.
+
+    Runs the bundled ``mtef-go.exe`` converter, located in the ``mtef-go-3``
+    folder inside the parent of this file's directory, and reads back the
+    text file it writes.
+
+    Args:
+        docx_file: Path of the .docx document handed to the converter.
+
+    Returns:
+        A list holding one whitespace-stripped string per line of the
+        converter's output file.
+
+    Raises:
+        FileNotFoundError: If the converter cannot be started or it did not
+            produce an output file.
+    """
+    # Parent of the directory that contains this module.
+    project_root = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+    mtef = os.path.join(project_root, 'mtef-go-3', 'mtef-go.exe')
+
+    # A private temporary directory is deleted when the block exits, so the
+    # converter output no longer piles up in the temp folder, and no TEMP
+    # environment variable is required.
+    with tempfile.TemporaryDirectory() as tmp_dir:
+        output = os.path.join(tmp_dir, 'formulas.txt')
+        # An argument list (no shell) keeps paths containing spaces or shell
+        # metacharacters intact. The exit status is deliberately not checked,
+        # as before: a failed run surfaces when the output file is opened.
+        subprocess.run([mtef, '-w', docx_file, '-o', output])
+        with open(output, 'r', encoding='utf-8') as file:
+            return [line.strip() for line in file]
+
+def get_docx_content_by_pandoc(docx_file):
+    """Return the text lines of a .docx file with formula images replaced by LaTeX.
+
+    The document is converted to Markdown with ``pandoc``. Every non-blank
+    Markdown line that contains an image reference of the form
+    ``![](media/image<N>.<ext>)`` is replaced, as a whole line, by the next
+    entry of ``get_latex_list(docx_file)``; every other non-blank line is
+    kept with surrounding whitespace stripped.
+
+    Args:
+        docx_file: Path of the .docx document to convert.
+
+    Returns:
+        A list of strings, one per non-blank Markdown line, in document order.
+
+    Raises:
+        IndexError: If the Markdown contains more image lines than there are
+            extracted formulas.
+        FileNotFoundError: If ``pandoc`` cannot be started or it did not
+            produce an output file.
+    """
+    # Step 1: LaTeX formulas, consumed in order as image lines are found.
+    formula_list = get_latex_list(docx_file)
+
+    # Compiled once, outside the loop. The former second pattern (the same
+    # reference wrapped in dots) could only match where this one already
+    # matches, so a single search is equivalent.
+    image_pattern = re.compile(r'!\[]\(media/image\d+\.\w+\)')
+
+    sb = []
+    idx = 0
+    # The temporary directory is removed even when reading or formula lookup
+    # raises, so the intermediate Markdown file is never left behind.
+    with tempfile.TemporaryDirectory() as tmp_dir:
+        temp_markdown = os.path.join(tmp_dir, 'content.md')
+        # Step 2: let pandoc convert the docx file to Markdown.
+        subprocess.run(['pandoc', docx_file, '-f', 'docx', '-t', 'markdown', '-o', temp_markdown])
+        # Step 3: walk the Markdown; the handle gets its own name instead of
+        # shadowing the docx_file parameter.
+        with open(temp_markdown, 'r', encoding='utf-8') as markdown_file:
+            for line in markdown_file:
+                text = line.strip()
+                if not text:
+                    continue
+                if image_pattern.search(line):
+                    sb.append(formula_list[idx])
+                    idx += 1
+                else:
+                    sb.append(text)
+    return sb
+
+
--- a/dsRag/Util/__pycache__/DocxUtil.cpython-310.pyc
+++ b/dsRag/Util/__pycache__/DocxUtil.cpython-310.pyc
--- a/dsRag/Util/__pycache__/__init__.cpython-310.pyc
+++ b/dsRag/Util/__pycache__/__init__.cpython-310.pyc