diff --git a/dsRag/ElasticSearch/T2_SplitTxt.py b/dsRag/ElasticSearch/T2_SplitTxt.py index 2d1c224f..2ecc8660 100644 --- a/dsRag/ElasticSearch/T2_SplitTxt.py +++ b/dsRag/ElasticSearch/T2_SplitTxt.py @@ -1,13 +1,10 @@ -import re -import warnings - -import docx - import os +import re import shutil -import uuid +import warnings import zipfile +import docx from docx import Document from docx.oxml.ns import nsmap diff --git a/dsRag/Util/WordImageUtil.py b/dsRag/Util/WordImageUtil.py deleted file mode 100644 index a887f482..00000000 --- a/dsRag/Util/WordImageUtil.py +++ /dev/null @@ -1,72 +0,0 @@ -import os -import shutil -import uuid -import zipfile - -from docx import Document -from docx.oxml.ns import nsmap - - -def extract_images_from_docx(docx_path, output_folder): - """ - 从docx提取图片并记录位置 - :param docx_path: Word文档路径 - :param output_folder: 图片输出文件夹 - :return: 包含图片路径和位置的列表 - """ - # 创建一个List 记录每个图片的名称和序号 - image_data = [] - # 创建临时解压目录 - temp_dir = os.path.join(output_folder, "temp_docx") - os.makedirs(temp_dir, exist_ok=True) - - # 解压docx文件 - with zipfile.ZipFile(docx_path, 'r') as zip_ref: - zip_ref.extractall(temp_dir) - - # 读取主文档关系 - with open(os.path.join(temp_dir, 'word', '_rels', 'document.xml.rels'), 'r') as rels_file: - rels_content = rels_file.read() - - # 加载主文档 - doc = Document(docx_path) - img_counter = 1 - - # 遍历所有段落 - for para_idx, paragraph in enumerate(doc.paragraphs): - for run_idx, run in enumerate(paragraph.runs): - # 检查运行中的图形 - for element in run._element: - if element.tag.endswith('drawing'): - # 提取图片关系ID - blip = element.find('.//a:blip', namespaces=nsmap) - if blip is not None: - embed_id = blip.get('{%s}embed' % nsmap['r']) - - # 从关系文件中获取图片文件名 - rel_entry = f'