parent
77234939e1
commit
87e02601c8
@ -1,72 +0,0 @@
|
|||||||
import os
|
|
||||||
import shutil
|
|
||||||
import uuid
|
|
||||||
import zipfile
|
|
||||||
|
|
||||||
from docx import Document
|
|
||||||
from docx.oxml.ns import nsmap
|
|
||||||
|
|
||||||
|
|
||||||
def extract_images_from_docx(docx_path, output_folder):
    """Extract the inline images of a Word document into *output_folder*.

    The .docx archive is read in place (nothing is unpacked to disk).  Only
    top-level body paragraphs are scanned, and within them only drawings that
    are direct children of a run.  Every image reference produces its own
    copy, so an image used twice in the document is written twice.

    :param docx_path: path to the .docx file
    :param output_folder: directory that receives the images; created if
        it does not exist
    :return: list of the generated file names (``<uuid hex><ext>``), in
        document order
    :raises KeyError: if the archive has no ``word/document.xml`` part
    :raises zipfile.BadZipFile: if *docx_path* is not a zip archive
    """
    # Local imports: the file-level import block does not provide these.
    import posixpath
    import xml.etree.ElementTree as ET

    w_ns = "http://schemas.openxmlformats.org/wordprocessingml/2006/main"
    a_ns = "http://schemas.openxmlformats.org/drawingml/2006/main"
    r_ns = ("http://schemas.openxmlformats.org/officeDocument/2006/"
            "relationships")
    rel_ns = "http://schemas.openxmlformats.org/package/2006/relationships"
    rels_member = "word/_rels/document.xml.rels"

    image_data = []
    os.makedirs(output_folder, exist_ok=True)

    with zipfile.ZipFile(docx_path, "r") as archive:
        members = set(archive.namelist())

        # Map relationship id -> archive member name.  Parsing the XML (rather
        # than scanning the text) keeps this correct regardless of the order
        # in which the Id/Type/Target attributes are written.
        targets = {}
        if rels_member in members:
            rels_root = ET.fromstring(archive.read(rels_member))
            for rel in rels_root.iter(f"{{{rel_ns}}}Relationship"):
                if rel.get("TargetMode") == "External":
                    continue  # linked, not embedded: nothing to copy
                rel_id = rel.get("Id")
                target = rel.get("Target")
                if not rel_id or not target:
                    continue
                if target.startswith("/"):
                    # Absolute targets are relative to the package root.
                    member = target.lstrip("/")
                else:
                    # Relative targets are resolved against the "word" folder.
                    member = posixpath.normpath(
                        posixpath.join("word", target)
                    )
                targets[rel_id] = member

        document_root = ET.fromstring(archive.read("word/document.xml"))
        body = document_root.find(f"{{{w_ns}}}body")
        paragraphs = [] if body is None else body.findall(f"{{{w_ns}}}p")

        for paragraph in paragraphs:
            for run in paragraph.findall(f"{{{w_ns}}}r"):
                for drawing in run.findall(f"{{{w_ns}}}drawing"):
                    blip = drawing.find(f".//{{{a_ns}}}blip")
                    if blip is None:
                        continue
                    member = targets.get(blip.get(f"{{{r_ns}}}embed"))
                    if member is None or member not in members:
                        continue  # dangling reference: skip

                    # A random name avoids collisions between documents.
                    ext = posixpath.splitext(member)[1]
                    img_name = f"{uuid.uuid4().hex}{ext}"
                    dest_path = os.path.join(output_folder, img_name)
                    with archive.open(member) as src, \
                            open(dest_path, "wb") as dst:
                        shutil.copyfileobj(src, dst)
                    image_data.append(img_name)

    return image_data
|
|
Binary file not shown.
Loading…
Reference in new issue