diff --git a/dsRag/Util/DocxUtil.py b/dsRag/Util/DocxUtil.py index d4b7b7c7..bb9fcfb5 100644 --- a/dsRag/Util/DocxUtil.py +++ b/dsRag/Util/DocxUtil.py @@ -38,23 +38,29 @@ def get_docx_content_by_pandoc(docx_file): img_idx = 0 # 图片索引 with open(temp_markdown, 'r', encoding='utf-8') as f: for line in f: - if line.strip(): - # 改进后的正则表达式,匹配更多格式的MathType公式 - """ - ![](media/image1.wmf) - 问题2 氢气与氧气燃烧的方程式 - .![](media/image2.wmf). - 问题3 我是一个图片 - ![](media/image3.png){width="3.1251607611548557in" height="3.694634733158355in"} - """ - if line.index("![](media/image") >= 0 and line.index(".wmf") > 0: - content = content + formula_list[wmf_idx] + "\n" - wmf_idx = wmf_idx + 1 - elif line.index("![](media/image") >= 0 and ( - line.index(".png") > 0 or line.index(".jpg") > 0 or line.index(".jpeg") > 0): - content = content + "【图片" + str(img_idx) + "】\n" - else: - content = content + line.strip() + "\n" + line = line.strip() + if not line: + continue + # 跳过图片高度描述行 + if line.startswith('height=') and line.endswith('in"}'): + continue + # 使用find()方法安全地检查图片模式 + is_wmf = line.find("![](media/image") >= 0 and line.find(".wmf") > 0 + is_img = line.find("![](media/image") >= 0 and ( + line.find(".png") > 0 or + line.find(".jpg") > 0 or + line.find(".jpeg") > 0 + ) + if is_wmf: + content += formula_list[wmf_idx] + "\n" + wmf_idx += 1 + elif is_img: + #![](media/image3.png){width="3.1251607611548557in" + # height="3.694634733158355in"} + img_idx += 1 + content += "【图片" + str(img_idx) + "】\n" + else: + content += line.strip() + "\n" # 删除临时文件 output_file os.remove(temp_markdown) diff --git a/dsRag/Util/__pycache__/DocxUtil.cpython-310.pyc b/dsRag/Util/__pycache__/DocxUtil.cpython-310.pyc index bcca2d20..b8e44d42 100644 Binary files a/dsRag/Util/__pycache__/DocxUtil.cpython-310.pyc and b/dsRag/Util/__pycache__/DocxUtil.cpython-310.pyc differ