main
HuangHai 3 weeks ago
parent 7356d165ba
commit 41655b2b91

@ -30,16 +30,29 @@ def get_docx_content_by_pandoc(docx_file):
temp_markdown = os.path.join(os.environ['TEMP'], uuid.uuid4().hex + '.md')
# 调用pandoc将docx文件转换成markdown
subprocess.run(['pandoc', docx_file, '-f', 'docx', '-t', 'markdown', '-o', temp_markdown])
# 打印 temp_markdown 文件
# with open(temp_markdown, 'r', encoding='utf-8') as f:
# print(f.read())
# 读取然后修改内容,输出到新的文件
idx = 0
with open(temp_markdown, 'r', encoding='utf-8') as docx_file:
for line in docx_file:
wmf_idx = 0 # wmf索引
img_idx = 0 # 图片索引
with open(temp_markdown, 'r', encoding='utf-8') as f:
for line in f:
if line.strip():
# 改进后的正则表达式匹配更多格式的MathType公式
if re.search(r'!\[]\(media/image\d+\.\w+\)', line) or \
re.search(r'\.!\[]\(media/image\d+\.\w+\)\.', line):
content = content + formula_list[idx] + "\n"
idx = idx + 1
"""
![](media/image1.wmf)
问题2 氢气与氧气燃烧的方程式
.![](media/image2.wmf).
问题3 我是一个图片
![](media/image3.png){width="3.1251607611548557in" height="3.694634733158355in"}
"""
if line.index("![](media/image") >= 0 and line.index(".wmf") > 0:
content = content + formula_list[wmf_idx] + "\n"
wmf_idx = wmf_idx + 1
elif line.index("![](media/image") >= 0 and (
line.index(".png") > 0 or line.index(".jpg") > 0 or line.index(".jpeg") > 0):
content = content + "【图片" + str(img_idx) + "\n"
else:
content = content + line.strip() + "\n"

Loading…
Cancel
Save