'commit'

3 weeks ago · 87e02601c8
parent 77234939e1
commit 87e02601c8
3 changed files with 3 additions and 78 deletions
--- a/dsRag/ElasticSearch/T2_SplitTxt.py
+++ b/dsRag/ElasticSearch/T2_SplitTxt.py
@@ -1,13 +1,10 @@
-import re
-import warnings
-
-import docx
-
 import os
+import re
 import shutil
-import uuid
+import warnings
 import zipfile

+import docx
 from docx import Document
 from docx.oxml.ns import nsmap

--- a/dsRag/Util/WordImageUtil.py
+++ b/dsRag/Util/WordImageUtil.py
@@ -1,72 +0,0 @@
-import os
-import shutil
-import uuid
-import zipfile
-
-from docx import Document
-from docx.oxml.ns import nsmap
-
-
-def extract_images_from_docx(docx_path, output_folder):
-    """
-    从docx提取图片并记录位置
-    :param docx_path: Word文档路径
-    :param output_folder: 图片输出文件夹
-    :return: 包含图片路径和位置的列表
-    """
-    # 创建一个List<String> 记录每个图片的名称和序号
-    image_data = []
-    # 创建临时解压目录
-    temp_dir = os.path.join(output_folder, "temp_docx")
-    os.makedirs(temp_dir, exist_ok=True)
-
-    # 解压docx文件
-    with zipfile.ZipFile(docx_path, 'r') as zip_ref:
-        zip_ref.extractall(temp_dir)
-
-    # 读取主文档关系
-    with open(os.path.join(temp_dir, 'word', '_rels', 'document.xml.rels'), 'r') as rels_file:
-        rels_content = rels_file.read()
-
-    # 加载主文档
-    doc = Document(docx_path)
-    img_counter = 1
-
-    # 遍历所有段落
-    for para_idx, paragraph in enumerate(doc.paragraphs):
-        for run_idx, run in enumerate(paragraph.runs):
-            # 检查运行中的图形
-            for element in run._element:
-                if element.tag.endswith('drawing'):
-                    # 提取图片关系ID
-                    blip = element.find('.//a:blip', namespaces=nsmap)
-                    if blip is not None:
-                        embed_id = blip.get('{%s}embed' % nsmap['r'])
-
-                        # 从关系文件中获取图片文件名
-                        rel_entry = f'<Relationship Id="{embed_id}"'
-                        if rel_entry in rels_content:
-                            start = rels_content.find(rel_entry)
-                            target_start = rels_content.find('Target="', start) + 8
-                            target_end = rels_content.find('"', target_start)
-                            image_path = rels_content[target_start:target_end]
-
-                            # 构建图片源路径
-                            src_path = os.path.join(temp_dir, 'word', image_path.replace('..', '').lstrip('/'))
-
-                            if os.path.exists(src_path):
-                                # 创建输出文件名
-                                ext = os.path.splitext(src_path)[1]
-                                # 名称为uuid
-                                fileName=uuid.uuid4().hex
-                                img_name = f"{fileName}{ext}"
-                                image_data.append(img_name)
-                                dest_path = os.path.join(output_folder, img_name)
-                                # 复制图片
-                                shutil.copy(src_path, dest_path)
-
-                                img_counter += 1
-
-    # 清理临时目录
-    shutil.rmtree(temp_dir)
-    return image_data
--- a/dsRag/Util/pycache/WordImageUtil.cpython-310.pyc
+++ b/dsRag/Util/pycache/WordImageUtil.cpython-310.pyc