From 87e02601c8d56029ed57257184b4be30f0d7e3a9 Mon Sep 17 00:00:00 2001 From: HuangHai <10402852@qq.com> Date: Mon, 30 Jun 2025 07:53:08 +0800 Subject: [PATCH] 'commit' --- dsRag/ElasticSearch/T2_SplitTxt.py | 9 +-- dsRag/Util/WordImageUtil.py | 72 ------------------ .../__pycache__/WordImageUtil.cpython-310.pyc | Bin 1766 -> 0 bytes 3 files changed, 3 insertions(+), 78 deletions(-) delete mode 100644 dsRag/Util/WordImageUtil.py delete mode 100644 dsRag/Util/__pycache__/WordImageUtil.cpython-310.pyc diff --git a/dsRag/ElasticSearch/T2_SplitTxt.py b/dsRag/ElasticSearch/T2_SplitTxt.py index 2d1c224f..2ecc8660 100644 --- a/dsRag/ElasticSearch/T2_SplitTxt.py +++ b/dsRag/ElasticSearch/T2_SplitTxt.py @@ -1,13 +1,10 @@ -import re -import warnings - -import docx - import os +import re import shutil -import uuid +import warnings import zipfile +import docx from docx import Document from docx.oxml.ns import nsmap diff --git a/dsRag/Util/WordImageUtil.py b/dsRag/Util/WordImageUtil.py deleted file mode 100644 index a887f482..00000000 --- a/dsRag/Util/WordImageUtil.py +++ /dev/null @@ -1,72 +0,0 @@ -import os -import shutil -import uuid -import zipfile - -from docx import Document -from docx.oxml.ns import nsmap - - -def extract_images_from_docx(docx_path, output_folder): - """ - 从docx提取图片并记录位置 - :param docx_path: Word文档路径 - :param output_folder: 图片输出文件夹 - :return: 包含图片路径和位置的列表 - """ - # 创建一个List 记录每个图片的名称和序号 - image_data = [] - # 创建临时解压目录 - temp_dir = os.path.join(output_folder, "temp_docx") - os.makedirs(temp_dir, exist_ok=True) - - # 解压docx文件 - with zipfile.ZipFile(docx_path, 'r') as zip_ref: - zip_ref.extractall(temp_dir) - - # 读取主文档关系 - with open(os.path.join(temp_dir, 'word', '_rels', 'document.xml.rels'), 'r') as rels_file: - rels_content = rels_file.read() - - # 加载主文档 - doc = Document(docx_path) - img_counter = 1 - - # 遍历所有段落 - for para_idx, paragraph in enumerate(doc.paragraphs): - for run_idx, run in enumerate(paragraph.runs): - # 检查运行中的图形 - for element in run._element: - if element.tag.endswith('drawing'): - # 提取图片关系ID - blip = element.find('.//a:blip', namespaces=nsmap) - if blip is not None: - embed_id = blip.get('{%s}embed' % nsmap['r']) - - # 从关系文件中获取图片文件名 - rel_entry = f'`xYNL#B)WLls zFvU5^G>|l*Q9`wA1S78ZO9C#-$xbcPh2AdAK?e}=%S+y@?Z#}!A%#CT5V_zcP#g*+-6fYpR3i-wrEOy*EF z`CrCjvOhr^bRsAHNjjP1T+614-6=7p;wt!Q#nURBoaQtA*&KD|vKcY6i^TLUx=FHG zLHhGzR$+bM8G}T!IsQ`rkeCv4IaEg&wE8y6j0@`*oC_br&dbB<2m(zp)%^piY%aif zaQAB(x3(VcTp#S*80_AA^wpKYo!k4jejn_8_h9eZqrF>&*-GLvH(XGrPU7;7m4&l0 zqYrOfd6@sW|L1RmdzYR!i@QAOa%VjbsK-_ohHm%oeLJ}F7aVzT_x9kYJ41Ws@h*$N z*@DR<)Y60?(1--MuK z$zfiQ_Qt`c==bJU7&MjYF?2PO70TRAKWg`?&E;ixJ&`T4y<)yvq=dY%j z7oPKIuW|IW7r5MyqjbYh7EaJ(i%;-k&p6|e&ET9?I*_-V|rVhxh?Jnfz;!~i=@sxmooJ%Wz~xydggMl zQ{cCmn{0q@){RnWI9}kXw=6a8wgH8RrklRHbls0A)06`Z0+bczB>^NQ%R$PSpU83n z7Nv)Vp0DKvVRH)Hb{=S#}c$hRSlf(d9nG){2vhvdluxJa1`UR-d(jU3;O_ zw&{m%+jFSPUE3HI%4fF99BQ&|S5*}p1%hoAY%7j9iMK= z>a*_OF?0xN)n#bMFU>erBd07N(Hq>k&Fg%ABRubV&(UB?ZrflXrJ zIx%nqTbhAQJcccphc%0g5w&OHI<|;G$QR^3`P*F7F?$OfE>+nmEbMhfUQuKMh}2ab uhuvl=wi&CJ5~W4|6bFkGpCfOoW5&@irrp4MM?I&s4#YqPR?~tIEbU)ev?d4u