main
HuangHai 4 weeks ago
parent 77234939e1
commit 87e02601c8

@ -1,13 +1,10 @@
import re
import warnings
import docx
import os import os
import re
import shutil import shutil
import uuid import warnings
import zipfile import zipfile
import docx
from docx import Document from docx import Document
from docx.oxml.ns import nsmap from docx.oxml.ns import nsmap

@ -1,72 +0,0 @@
import os
import shutil
import uuid
import zipfile
from docx import Document
from docx.oxml.ns import nsmap
def extract_images_from_docx(docx_path, output_folder):
    """
    Extract the images embedded in a .docx file, in document order.

    Walks the top-level paragraphs and their runs via python-docx, and for
    every inline drawing that references an embedded image copies that image
    out of the .docx package into ``output_folder`` under a fresh UUID-based
    file name (original extension preserved).

    :param docx_path: path to the Word (.docx) document
    :param output_folder: directory that receives the extracted images;
        created if it does not exist
    :return: list of the generated image file names (not full paths), in the
        order the images are encountered in the document
    :raises FileNotFoundError: if the package has no
        ``word/_rels/document.xml.rels`` part
    :raises zipfile.BadZipFile: if ``docx_path`` is not a valid zip archive
    """
    # Local imports: only this function needs them, and the module-level
    # import block is left untouched.
    import posixpath
    import xml.etree.ElementTree as ET

    rels_member = 'word/_rels/document.xml.rels'
    image_data = []
    os.makedirs(output_folder, exist_ok=True)

    # Read straight from the archive instead of unpacking it to a temporary
    # directory: nothing is left behind on error, and no fixed-name
    # "temp_docx" folder inside output_folder can be clobbered.
    with zipfile.ZipFile(docx_path, 'r') as archive:
        members = set(archive.namelist())
        if rels_member not in members:
            raise FileNotFoundError(
                f"{rels_member} not found in {docx_path}")

        # Map relationship id -> zip member name. Parsing the XML (from
        # bytes) avoids depending on attribute order or on the platform's
        # default text encoding.
        rel_targets = {}
        for rel in ET.fromstring(archive.read(rels_member)):
            if rel.get('TargetMode') == 'External':
                continue  # linked, not embedded: nothing to copy
            rel_id = rel.get('Id')
            target = rel.get('Target')
            if not rel_id or not target:
                continue
            if target.startswith('/'):
                # Absolute targets are resolved from the package root.
                member = target.lstrip('/')
            else:
                # Relative targets are resolved against the "word/" folder
                # that holds document.xml.
                member = posixpath.normpath(posixpath.join('word', target))
            rel_targets[rel_id] = member

        doc = Document(docx_path)
        embed_attr = '{%s}embed' % nsmap['r']

        for paragraph in doc.paragraphs:
            for run in paragraph.runs:
                # Inspect the direct children of the run for drawings.
                for element in run._element:
                    if not element.tag.endswith('drawing'):
                        continue
                    # Only the first blip of a drawing is considered.
                    blip = element.find('.//a:blip', namespaces=nsmap)
                    if blip is None:
                        continue
                    member = rel_targets.get(blip.get(embed_attr))
                    if member is None or member not in members:
                        continue

                    # Output name is a random UUID plus the original
                    # extension.
                    ext = posixpath.splitext(member)[1]
                    img_name = f"{uuid.uuid4().hex}{ext}"
                    dest_path = os.path.join(output_folder, img_name)
                    with archive.open(member) as src, \
                            open(dest_path, 'wb') as dst:
                        shutil.copyfileobj(src, dst)
                    # Record the name only once the file is on disk.
                    image_data.append(img_name)

    return image_data
Loading…
Cancel
Save