From 7fc19bd56e690c6d8d0c5b779eb18f0c0da09af5 Mon Sep 17 00:00:00 2001 From: HuangHai <10402852@qq.com> Date: Mon, 30 Jun 2025 20:14:00 +0800 Subject: [PATCH] 'commit' --- dsRag/Util/DocxUtil.py | 40 ++++++++++-------- .../Util/__pycache__/DocxUtil.cpython-310.pyc | Bin 1519 -> 1631 bytes 2 files changed, 23 insertions(+), 17 deletions(-) diff --git a/dsRag/Util/DocxUtil.py b/dsRag/Util/DocxUtil.py index d4b7b7c7..bb9fcfb5 100644 --- a/dsRag/Util/DocxUtil.py +++ b/dsRag/Util/DocxUtil.py @@ -38,23 +38,29 @@ def get_docx_content_by_pandoc(docx_file): img_idx = 0 # 图片索引 with open(temp_markdown, 'r', encoding='utf-8') as f: for line in f: - if line.strip(): - # 改进后的正则表达式,匹配更多格式的MathType公式 - """ - ![](media/image1.wmf) - 问题2 氢气与氧气燃烧的方程式 - .![](media/image2.wmf). - 问题3 我是一个图片 - ![](media/image3.png){width="3.1251607611548557in" height="3.694634733158355in"} - """ - if line.index("![](media/image") >= 0 and line.index(".wmf") > 0: - content = content + formula_list[wmf_idx] + "\n" - wmf_idx = wmf_idx + 1 - elif line.index("![](media/image") >= 0 and ( - line.index(".png") > 0 or line.index(".jpg") > 0 or line.index(".jpeg") > 0): - content = content + "【图片" + str(img_idx) + "】\n" - else: - content = content + line.strip() + "\n" + line = line.strip() + if not line: + continue + # 跳过图片高度描述行 + if line.startswith('height=') and line.endswith('in"}'): + continue + # 使用find()方法安全地检查图片模式 + is_wmf = line.find("![](media/image") >= 0 and line.find(".wmf") > 0 + is_img = line.find("![](media/image") >= 0 and ( + line.find(".png") > 0 or + line.find(".jpg") > 0 or + line.find(".jpeg") > 0 + ) + if is_wmf: + content += formula_list[wmf_idx] + "\n" + wmf_idx += 1 + elif is_img: + #![](media/image3.png){width="3.1251607611548557in" + # height="3.694634733158355in"} + img_idx += 1 + content += "【图片" + str(img_idx) + "】\n" + else: + content += line.strip() + "\n" # 删除临时文件 output_file os.remove(temp_markdown) diff --git a/dsRag/Util/__pycache__/DocxUtil.cpython-310.pyc b/dsRag/Util/__pycache__/DocxUtil.cpython-310.pyc index bcca2d20732f632e8ac312e51b43efd48f1979f7..b8e44d4234614e2f2913e2eacb4b2f5cdba10525 100644 GIT binary patch delta 511 zcmY+Azfapx5XawL?j^>=#!w(o5Ca>CR$=Lnp+``-JZ_jmSB-RU`wEqHw$UvysmaDJg%VBvf0 z$%*-mr{h6Z)zy~x(#~qSb~l@gA}8+$O{ThXbBJ49Iclz`%C4)t9F%<}dk3tzYQ7}O ztdgFzZb=JMtv{s~&uFor@fBq=+KMxc8Z@m6wbT%AMTVNxGH2o|XX5W#mLxT*MmneE z4K##WnNIQq@@4$=r@e#bkx+uzZYm%mYgZ)s5I_`m%$~xXrz<&kw8TTPwQ6ko?AAJd z7&ouub+nhe9VGqLBfq5sJt&e=9g@haU5=jzxV6hP( qEF!`>y5K?`@S%?QpQHyiYgT0$77*{(AM7=Pc*oqz4cIY1<(o$Ypna48 delta 417 zcmcc5^PZbGpO=@50SG?VCS`DK6f6iW)f?fGnOAzJ-h_T-mHeL0}#~n8yO-G1V~Eut+k9Go%QJGt@HGu%rkAX_i6* z6kU~Io)AP=aUPf_jKUKE@zk?fiYjZEQbffWN_cBnn;C(|ih+y;8pj0G1auJCqREUb zVzNEujEoE*7zxBBj0^Z`m=^*?YM6o@=w0Z;v&kL znU|7Uaf`XQr05n~QEF~}S*j+-WJgxl$