From eaa82ea6490f602857f40a0514a965e50de0635d Mon Sep 17 00:00:00 2001 From: HuangHai <10402852@qq.com> Date: Mon, 30 Jun 2025 18:34:07 +0800 Subject: [PATCH] 'commit' --- dsRag/Doc/8、Pandoc下载.md | 5 ++++- dsRag/Test/TestPandoc.py | 31 +++++++++++++++++++++++++++++-- dsRag/Test/TestReadSpire.py | 1 - 3 files changed, 33 insertions(+), 4 deletions(-) diff --git a/dsRag/Doc/8、Pandoc下载.md b/dsRag/Doc/8、Pandoc下载.md index 674e9ba8..a365178c 100644 --- a/dsRag/Doc/8、Pandoc下载.md +++ b/dsRag/Doc/8、Pandoc下载.md @@ -1,2 +1,5 @@ https://github.com/jgm/pandoc/releases/tag/3.7.0.2 -https://objects.githubusercontent.com/github-production-release-asset-2e65be/571770/92882bf5-3b76-4345-b08a-9d9badc74957?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=releaseassetproduction%2F20250626%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20250626T231242Z&X-Amz-Expires=1800&X-Amz-Signature=55493529bc6e5a3779e95bcdd9f33cf09477d3e47f9a441b9412b5b193d788db&X-Amz-SignedHeaders=host&response-content-disposition=attachment%3B%20filename%3Dpandoc-3.7.0.2-windows-x86_64.msi&response-content-type=application%2Foctet-stream \ No newline at end of file +https://objects.githubusercontent.com/github-production-release-asset-2e65be/571770/92882bf5-3b76-4345-b08a-9d9badc74957?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=releaseassetproduction%2F20250626%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20250626T231242Z&X-Amz-Expires=1800&X-Amz-Signature=55493529bc6e5a3779e95bcdd9f33cf09477d3e47f9a441b9412b5b193d788db&X-Amz-SignedHeaders=host&response-content-disposition=attachment%3B%20filename%3Dpandoc-3.7.0.2-windows-x86_64.msi&response-content-type=application%2Foctet-stream + + +pandoc -f docx -t markdown -o c:/aaa.md D:\dsWork\dsProject\dsRag\static\Txt\化学方程式_CHEMISTRY_1.docx \ No newline at end of file diff --git a/dsRag/Test/TestPandoc.py b/dsRag/Test/TestPandoc.py index e004310c..7fcdd4a9 100644 --- a/dsRag/Test/TestPandoc.py +++ b/dsRag/Test/TestPandoc.py @@ -1,7 +1,34 @@ +import re import subprocess + def html_to_word_pandoc(html_file, output_file): subprocess.run(['pandoc', html_file, '-o', output_file]) -# 使用示例 -html_to_word_pandoc('../static/1.html', '../static/output.docx') \ No newline at end of file + +def docx_to_markdown_pandoc(docx_file, output_file): + subprocess.run(['pandoc', docx_file, '-f', 'docx', '-t', 'markdown', '-o', output_file]) + + +docx_file = 'D:\dsWork\dsProject\dsRag\static\Txt\化学方程式_CHEMISTRY_1.docx' +output_file = 'c:/output.md' +docx_to_markdown_pandoc(docx_file, output_file) +finalFile = "c:/new.txt" + +# 读取然后修改内容,输出到新的文件 +idx=0 +with open(finalFile, 'w', encoding='utf-8') as f1: + with open(output_file, 'r', encoding='utf-8') as f: + for line in f: + if line.strip(): + # 改进后的正则表达式,匹配更多格式的MathType公式 + if re.search(r'!\[]\(media/image\d+\.\w+\)', line) or \ + re.search(r'\.!\[]\(media/image\d+\.\w+\)\.', line): + idx=idx+1 + f1.write("【MathType"+str(idx)+"】\n") + else: + f1.write(line.strip()+"\n") + +# 输出finalFile +with open(finalFile, 'r', encoding='utf-8') as f: + print(f.read()) \ No newline at end of file diff --git a/dsRag/Test/TestReadSpire.py b/dsRag/Test/TestReadSpire.py index 3b418278..4c18a1bd 100644 --- a/dsRag/Test/TestReadSpire.py +++ b/dsRag/Test/TestReadSpire.py @@ -1,7 +1,6 @@ # https://www.e-iceblue.cn/doc_python_other/python-insert-or-extract-ole-objects-in-word.html # pip install Spire.Doc from spire.doc import * -from spire.doc.common import * # 创建Document类的对象 doc = Document()