diff --git a/dsRag/Test/TestReadWordImage.py b/dsRag/Test/TestReadWordImage.py index 172eac70..ee3389db 100644 --- a/dsRag/Test/TestReadWordImage.py +++ b/dsRag/Test/TestReadWordImage.py @@ -1,31 +1,99 @@ -import os -import uuid +word_path = "d:\\dsWork\\dsProject\\dsRag\\Test\\带图的WORD文档.docx" + from docx import Document +from docx.oxml import parse_xml +from docx.oxml.ns import nsmap +import os +import zipfile +import shutil -def extract_images_from_word(word_path, output_dir): + +def extract_images_from_docx(docx_path, output_folder): """ - 从Word文档中提取图片并保存到指定目录 - :param word_path: Word文档路径 - :param output_dir: 图片输出目录 + 从docx提取图片并记录位置 + :param docx_path: Word文档路径 + :param output_folder: 图片输出文件夹 + :return: 包含图片路径和位置的列表 """ - doc = Document(word_path) - - # 确保输出目录存在 - os.makedirs(output_dir, exist_ok=True) - - # 获取文档中的所有图片 - for rel in doc.part.rels.values(): - if "image" in rel.target_ref: - img_data = rel.target_part.blob - - # 使用UUID命名图片 - output_path = os.path.join(output_dir, f"{uuid.uuid4()}.jpg") - with open(output_path, "wb") as f: - f.write(img_data) - print(f"图片已保存到: {output_path}") + # 创建临时解压目录 + temp_dir = os.path.join(output_folder, "temp_docx") + os.makedirs(temp_dir, exist_ok=True) + + # 解压docx文件 + with zipfile.ZipFile(docx_path, 'r') as zip_ref: + zip_ref.extractall(temp_dir) + + # 读取主文档关系 + with open(os.path.join(temp_dir, 'word', '_rels', 'document.xml.rels'), 'r') as rels_file: + rels_content = rels_file.read() + + # 加载主文档 + doc = Document(docx_path) + image_data = [] + img_counter = 1 + + # 遍历所有段落 + for para_idx, paragraph in enumerate(doc.paragraphs): + for run_idx, run in enumerate(paragraph.runs): + # 检查运行中的图形 + for element in run._element: + if element.tag.endswith('drawing'): + # 提取图片关系ID + blip = element.find('.//a:blip', namespaces=nsmap) + if blip is not None: + embed_id = blip.get('{%s}embed' % nsmap['r']) + + # 从关系文件中获取图片文件名 + rel_entry = f'