main
HuangHai 3 weeks ago
parent 3ce3f1afc3
commit eaa82ea649

@ -1,2 +1,5 @@
https://github.com/jgm/pandoc/releases/tag/3.7.0.2
https://objects.githubusercontent.com/github-production-release-asset-2e65be/571770/92882bf5-3b76-4345-b08a-9d9badc74957?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=releaseassetproduction%2F20250626%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20250626T231242Z&X-Amz-Expires=1800&X-Amz-Signature=55493529bc6e5a3779e95bcdd9f33cf09477d3e47f9a441b9412b5b193d788db&X-Amz-SignedHeaders=host&response-content-disposition=attachment%3B%20filename%3Dpandoc-3.7.0.2-windows-x86_64.msi&response-content-type=application%2Foctet-stream
https://objects.githubusercontent.com/github-production-release-asset-2e65be/571770/92882bf5-3b76-4345-b08a-9d9badc74957?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=releaseassetproduction%2F20250626%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20250626T231242Z&X-Amz-Expires=1800&X-Amz-Signature=55493529bc6e5a3779e95bcdd9f33cf09477d3e47f9a441b9412b5b193d788db&X-Amz-SignedHeaders=host&response-content-disposition=attachment%3B%20filename%3Dpandoc-3.7.0.2-windows-x86_64.msi&response-content-type=application%2Foctet-stream
pandoc -f docx -t markdown -o c:/aaa.md D:\dsWork\dsProject\dsRag\static\Txt\化学方程式_CHEMISTRY_1.docx

@ -1,7 +1,34 @@
import re
import subprocess
def html_to_word_pandoc(html_file, output_file):
    """Convert an HTML file into the given output file by invoking pandoc.

    No ``-f``/``-t`` options are passed, so format detection is left to
    pandoc itself.

    Args:
        html_file: Path of the HTML file handed to pandoc as its input.
        output_file: Path handed to pandoc's ``-o`` option.

    Raises:
        subprocess.CalledProcessError: If pandoc exits with a non-zero status.
    """
    # check=True surfaces a failed conversion right here instead of letting
    # the caller carry on with a missing or stale output file.
    subprocess.run(['pandoc', html_file, '-o', output_file], check=True)
# Example usage: convert the sample HTML page into a Word document
html_to_word_pandoc(
    html_file='../static/1.html',
    output_file='../static/output.docx',
)
def docx_to_markdown_pandoc(docx_file, output_file):
    """Convert a .docx file to Markdown by invoking pandoc.

    Args:
        docx_file: Path of the Word document handed to pandoc as its input
            (read with ``-f docx``).
        output_file: Path handed to pandoc's ``-o`` option; written with
            ``-t markdown``.

    Raises:
        subprocess.CalledProcessError: If pandoc exits with a non-zero status.
    """
    command = [
        'pandoc', docx_file,
        '-f', 'docx',
        '-t', 'markdown',
        '-o', output_file,
    ]
    # check=True surfaces a failed conversion right here instead of letting
    # the caller go on to read a missing or stale Markdown file.
    subprocess.run(command, check=True)
# Raw string: the Windows path is full of backslashes ("\d", "\s", "\T", ...)
# that a normal string literal would treat as invalid escape sequences.
docx_file = r'D:\dsWork\dsProject\dsRag\static\Txt\化学方程式_CHEMISTRY_1.docx'
output_file = 'c:/output.md'
docx_to_markdown_pandoc(docx_file, output_file)
finalFile = "c:/new.txt"

# Matches image references of the form "![](media/image3.wmf)". One search is
# enough: a reference wrapped in dots (".![](...).") contains this same
# pattern, so a separate check for that form adds nothing. Compiled once so
# the loop below does not look the pattern up again for every line.
mathtype_image_re = re.compile(r'!\[]\(media/image\d+\.\w+\)')

# Read the converted Markdown, rewrite it, and write the result to a new file.
idx = 0
with open(finalFile, 'w', encoding='utf-8') as dst, \
        open(output_file, 'r', encoding='utf-8') as src:
    for line in src:
        if not line.strip():
            # Blank lines are dropped from the output.
            continue
        if mathtype_image_re.search(line):
            # The whole line is replaced by a numbered placeholder.
            idx += 1
            dst.write(f"【MathType{idx}\n")
        else:
            dst.write(line.strip() + "\n")

# Print the contents of finalFile
with open(finalFile, 'r', encoding='utf-8') as result:
    print(result.read())

@ -1,7 +1,6 @@
# https://www.e-iceblue.cn/doc_python_other/python-insert-or-extract-ole-objects-in-word.html
# pip install Spire.Doc
from spire.doc import *
from spire.doc.common import *
# Create an instance of the Document class
doc = Document()

Loading…
Cancel
Save