From 558afe48f817b0c03001f85b8524f39424e9f176 Mon Sep 17 00:00:00 2001 From: HuangHai <10402852@qq.com> Date: Mon, 30 Jun 2025 19:48:17 +0800 Subject: [PATCH] 'commit' --- .../MathType/MathType-win-zh-7.8.2.441.exe | Bin .../MathType/MathType_v7.x_Patch.exe | Bin .../录入化学方程式的办法.md | 0 dsRag/{ => Doc}/MathType/激活MathType.docx | Bin dsRag/Test/TestPandoc.py | 59 ++---------------- dsRag/Util/DocxUtil.py | 49 +++++++++++++++ .../Util/__pycache__/DocxUtil.cpython-310.pyc | Bin 0 -> 1398 bytes .../Util/__pycache__/__init__.cpython-310.pyc | Bin 135 -> 135 bytes 8 files changed, 55 insertions(+), 53 deletions(-) rename dsRag/{ => Doc}/MathType/MathType-win-zh-7.8.2.441.exe (100%) rename dsRag/{ => Doc}/MathType/MathType_v7.x_Patch.exe (100%) rename dsRag/{ => Doc}/MathType/录入化学方程式的办法.md (100%) rename dsRag/{ => Doc}/MathType/激活MathType.docx (100%) create mode 100644 dsRag/Util/DocxUtil.py create mode 100644 dsRag/Util/__pycache__/DocxUtil.cpython-310.pyc diff --git a/dsRag/MathType/MathType-win-zh-7.8.2.441.exe b/dsRag/Doc/MathType/MathType-win-zh-7.8.2.441.exe similarity index 100% rename from dsRag/MathType/MathType-win-zh-7.8.2.441.exe rename to dsRag/Doc/MathType/MathType-win-zh-7.8.2.441.exe diff --git a/dsRag/MathType/MathType_v7.x_Patch.exe b/dsRag/Doc/MathType/MathType_v7.x_Patch.exe similarity index 100% rename from dsRag/MathType/MathType_v7.x_Patch.exe rename to dsRag/Doc/MathType/MathType_v7.x_Patch.exe diff --git a/dsRag/MathType/录入化学方程式的办法.md b/dsRag/Doc/MathType/录入化学方程式的办法.md similarity index 100% rename from dsRag/MathType/录入化学方程式的办法.md rename to dsRag/Doc/MathType/录入化学方程式的办法.md diff --git a/dsRag/MathType/激活MathType.docx b/dsRag/Doc/MathType/激活MathType.docx similarity index 100% rename from dsRag/MathType/激活MathType.docx rename to dsRag/Doc/MathType/激活MathType.docx diff --git a/dsRag/Test/TestPandoc.py b/dsRag/Test/TestPandoc.py index 4cb01dd7..ad71fa35 100644 --- a/dsRag/Test/TestPandoc.py +++ b/dsRag/Test/TestPandoc.py @@ -1,59 +1,12 @@ -import re -import subprocess -import os -import uuid - -# 可执行文件路径 -mtef = r'D:\dsWork\dsProject\dsRag\mtef-go-3\mtef-go.exe' -# docx文件路径 -docx_file = 'D:\dsWork\dsProject\dsRag\static\Txt\化学方程式_CHEMISTRY_1.docx' - - -# 结合Pandoc和mtef-go的结果,合并成最终的输出文本 -def get_docx_content_by_pandoc(f, formula_list): - # StringBuilder结果 - sb = [] - # output_file 设置为临时目录下的uuid.md - temp_markdown = os.path.join(os.environ['TEMP'], uuid.uuid4().hex + '.md') - # 调用pandoc将docx文件转换成markdown - subprocess.run(['pandoc', f, '-f', 'docx', '-t', 'markdown', '-o', temp_markdown]) - # 读取然后修改内容,输出到新的文件 - idx = 0 - with open(temp_markdown, 'r', encoding='utf-8') as f: - for line in f: - if line.strip(): - # 改进后的正则表达式,匹配更多格式的MathType公式 - if re.search(r'!\[]\(media/image\d+\.\w+\)', line) or \ - re.search(r'\.!\[]\(media/image\d+\.\w+\)\.', line): - sb.append(formula_list[idx]) - idx = idx + 1 - else: - sb.append(line.strip()) - - # 删除临时文件 output_file - os.remove(temp_markdown) - return sb - - -# 获取MathType对应的Latex公式 -def getLatexList(docx_file): - res = [] - output = os.path.join(os.environ['TEMP'], uuid.uuid4().hex + '.txt') - command = mtef + r" -w " + docx_file + " -o " + output - os.system(command) - with open(output, 'r', encoding='utf-8') as file: - for i, line in enumerate(file): - res.append(line.strip()) - return res - +from Util.DocxUtil import * if __name__ == '__main__': - # 一、获取Latex公式列表 - formula_list = getLatexList(docx_file) + # docx文件路径 + docx_file = 'D:\dsWork\dsProject\dsRag\static\Txt\化学方程式_CHEMISTRY_1.docx' - # 二、整合最终的拼接完的文本 - sb = get_docx_content_by_pandoc(docx_file, formula_list) + # 整合最终的拼接完的文本 + sb = get_docx_content_by_pandoc(docx_file) - # 三、输出 + # 输出 for x in sb: print(x) diff --git a/dsRag/Util/DocxUtil.py b/dsRag/Util/DocxUtil.py new file mode 100644 index 00000000..83946eff --- /dev/null +++ b/dsRag/Util/DocxUtil.py @@ -0,0 +1,49 @@ +import re +import subprocess +import os +import uuid + +# 获取MathType对应的Latex公式 +def get_latex_list(docx_file): + # 获取当前目录的父级目录 + current_dir = os.path.dirname(os.path.abspath(__file__)) + current_dir = os.path.dirname(current_dir) + mtef = os.path.join(current_dir, 'mtef-go-3', 'mtef-go.exe') + res = [] + output = os.path.join(os.environ['TEMP'], uuid.uuid4().hex + '.txt') + command = mtef + r" -w " + docx_file + " -o " + output + os.system(command) + with open(output, 'r', encoding='utf-8') as file: + for i, line in enumerate(file): + res.append(line.strip()) + return res + +# 结合Pandoc和mtef-go的结果,合并成最终的输出文本 +def get_docx_content_by_pandoc(docx_file): + # 一、获取Latex公式列表 + formula_list = get_latex_list(docx_file) + + # StringBuilder结果 + sb = [] + # output_file 设置为临时目录下的uuid.md + temp_markdown = os.path.join(os.environ['TEMP'], uuid.uuid4().hex + '.md') + # 调用pandoc将docx文件转换成markdown + subprocess.run(['pandoc', docx_file, '-f', 'docx', '-t', 'markdown', '-o', temp_markdown]) + # 读取然后修改内容,输出到新的文件 + idx = 0 + with open(temp_markdown, 'r', encoding='utf-8') as docx_file: + for line in docx_file: + if line.strip(): + # 改进后的正则表达式,匹配更多格式的MathType公式 + if re.search(r'!\[]\(media/image\d+\.\w+\)', line) or \ + re.search(r'\.!\[]\(media/image\d+\.\w+\)\.', line): + sb.append(formula_list[idx]) + idx = idx + 1 + else: + sb.append(line.strip()) + + # 删除临时文件 output_file + os.remove(temp_markdown) + return sb + + diff --git a/dsRag/Util/__pycache__/DocxUtil.cpython-310.pyc b/dsRag/Util/__pycache__/DocxUtil.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..fe0e16eede2dbb1c2107097bfc9e2a2989d312e3 GIT binary patch literal 1398 zcmaKr-EQ1O6vyYwUhijCC=jKTG*t!aQe^?r3xp7=M7V>fBA}|S%gWiAY@Dt=+wr*B ztZWG>+#+7Vd)fEk5xAOLuKEV35(?)yMT8q>H2-tv%=gT1bidyrkRG2Oo$R*=`2#E0 z6NAbFDEWO5NhF<<@XFGVHgy)V#`2I$Ci%}K6jDeJtS9Lq8TqeZBsrK;G;dNHYyjme zDEV(7mYmX>HOj46(>1N=RjmBi_ev_MWNTir8L4>F?t%J}Zxh?9gEcvAOR+`PtP&-Y z-WDmz7Lh(!h*r?I?v&n|QewNMkS!U=_AmU`?aH^k(!ZelIM?U3K5MW3^OHeE;>g4_9Jr zmv$v~_s(`v^{%7lwO82Vz0Y1!7pQEiWs=QC8;