diff --git a/dsRag/Util/DocxUtil.py b/dsRag/Util/DocxUtil.py index fdad080e..d4b7b7c7 100644 --- a/dsRag/Util/DocxUtil.py +++ b/dsRag/Util/DocxUtil.py @@ -30,16 +30,29 @@ def get_docx_content_by_pandoc(docx_file): temp_markdown = os.path.join(os.environ['TEMP'], uuid.uuid4().hex + '.md') # 调用pandoc将docx文件转换成markdown subprocess.run(['pandoc', docx_file, '-f', 'docx', '-t', 'markdown', '-o', temp_markdown]) + # 打印 temp_markdown 文件 + # with open(temp_markdown, 'r', encoding='utf-8') as f: + # print(f.read()) # 读取然后修改内容,输出到新的文件 - idx = 0 - with open(temp_markdown, 'r', encoding='utf-8') as docx_file: - for line in docx_file: + wmf_idx = 0 # wmf索引 + img_idx = 0 # 图片索引 + with open(temp_markdown, 'r', encoding='utf-8') as f: + for line in f: if line.strip(): # 改进后的正则表达式,匹配更多格式的MathType公式 - if re.search(r'!\[]\(media/image\d+\.\w+\)', line) or \ - re.search(r'\.!\[]\(media/image\d+\.\w+\)\.', line): - content = content + formula_list[idx] + "\n" - idx = idx + 1 + """ + ![](media/image1.wmf) + 问题2 氢气与氧气燃烧的方程式 + .![](media/image2.wmf). + 问题3 我是一个图片 + ![](media/image3.png){width="3.1251607611548557in" height="3.694634733158355in"} + """ + if line.index("![](media/image") >= 0 and line.index(".wmf") > 0: + content = content + formula_list[wmf_idx] + "\n" + wmf_idx = wmf_idx + 1 + elif line.index("![](media/image") >= 0 and ( + line.index(".png") > 0 or line.index(".jpg") > 0 or line.index(".jpeg") > 0): + content = content + "【图片" + str(img_idx) + "】\n" else: content = content + line.strip() + "\n" diff --git a/dsRag/Util/__pycache__/DocxUtil.cpython-310.pyc b/dsRag/Util/__pycache__/DocxUtil.cpython-310.pyc index ae1b8836..bcca2d20 100644 Binary files a/dsRag/Util/__pycache__/DocxUtil.cpython-310.pyc and b/dsRag/Util/__pycache__/DocxUtil.cpython-310.pyc differ diff --git a/dsRag/static/Txt/化学方程式_CHEMISTRY_1.docx b/dsRag/static/Txt/化学方程式_CHEMISTRY_1.docx index 4852a2a3..04d1a434 100644 Binary files a/dsRag/static/Txt/化学方程式_CHEMISTRY_1.docx and b/dsRag/static/Txt/化学方程式_CHEMISTRY_1.docx differ