'commit'

4 weeks ago · 7fc19bd56e
parent 41655b2b91
commit 7fc19bd56e
2 changed files with 23 additions and 17 deletions
--- a/dsRag/Util/DocxUtil.py
+++ b/dsRag/Util/DocxUtil.py
@@ -38,23 +38,29 @@ def get_docx_content_by_pandoc(docx_file):
    img_idx = 0  # 图片索引
    with open(temp_markdown, 'r', encoding='utf-8') as f:
        for line in f:
-            if line.strip():
+            line = line.strip()
-                # 改进后的正则表达式，匹配更多格式的MathType公式
+            if not line:
-                """
+                continue
-                ![](media/image1.wmf)
+            # 跳过图片高度描述行
-                问题2 氢气与氧气燃烧的方程式
+            if line.startswith('height=') and line.endswith('in"}'):
-                .![](media/image2.wmf).
+                continue
-                问题3 我是一个图片
+            # 使用find()方法安全地检查图片模式
-                ![](media/image3.png){width="3.1251607611548557in" height="3.694634733158355in"}
+            is_wmf = line.find("![](media/image") >= 0 and line.find(".wmf") > 0
-                """
+            is_img = line.find("![](media/image") >= 0 and (
-                if line.index("![](media/image") >= 0 and line.index(".wmf") > 0:
+                    line.find(".png") > 0 or
-                    content = content + formula_list[wmf_idx] + "\n"
+                    line.find(".jpg") > 0 or
-                    wmf_idx = wmf_idx + 1
+                    line.find(".jpeg") > 0
-                elif line.index("![](media/image") >= 0 and (
+            )
-                        line.index(".png") > 0 or line.index(".jpg") > 0 or line.index(".jpeg") > 0):
+            if is_wmf:
-                    content = content + "【图片" + str(img_idx) + "】\n"
+                content += formula_list[wmf_idx] + "\n"
                wmf_idx += 1
            elif is_img:
                #![](media/image3.png){width="3.1251607611548557in"
                # height="3.694634733158355in"}
                img_idx += 1
                content += "【图片" + str(img_idx) + "】\n"
            else:
-                    content = content + line.strip() + "\n"
+                content += line.strip() + "\n"
    # 删除临时文件 output_file
    os.remove(temp_markdown)
--- a/dsRag/Util/__pycache__/DocxUtil.cpython-310.pyc
+++ b/dsRag/Util/__pycache__/DocxUtil.cpython-310.pyc