main
HuangHai 4 weeks ago
parent b8642cd9a9
commit 9157880400

@ -1,93 +1,12 @@
# Windows path of a sample .docx file; the same value is assigned again
# inside the ``__main__`` block before it is passed to the extractor.
word_path = "d:\\dsWork\\dsProject\\dsRag\\Test\\带图的WORD文档.docx"
from docx import Document
from docx.oxml import parse_xml
from docx.oxml.ns import nsmap
import os
import zipfile
import shutil
def extract_images_from_docx(docx_path, output_folder):
"""
从docx提取图片并记录位置
:param docx_path: Word文档路径
:param output_folder: 图片输出文件夹
:return: 包含图片路径和位置的列表
"""
# 创建临时解压目录
temp_dir = os.path.join(output_folder, "temp_docx")
os.makedirs(temp_dir, exist_ok=True)
# 解压docx文件
with zipfile.ZipFile(docx_path, 'r') as zip_ref:
zip_ref.extractall(temp_dir)
# 读取主文档关系
with open(os.path.join(temp_dir, 'word', '_rels', 'document.xml.rels'), 'r') as rels_file:
rels_content = rels_file.read()
# 加载主文档
doc = Document(docx_path)
image_data = []
img_counter = 1
# 遍历所有段落
for para_idx, paragraph in enumerate(doc.paragraphs):
for run_idx, run in enumerate(paragraph.runs):
# 检查运行中的图形
for element in run._element:
if element.tag.endswith('drawing'):
# 提取图片关系ID
blip = element.find('.//a:blip', namespaces=nsmap)
if blip is not None:
embed_id = blip.get('{%s}embed' % nsmap['r'])
# 从关系文件中获取图片文件名
rel_entry = f'<Relationship Id="{embed_id}"'
if rel_entry in rels_content:
start = rels_content.find(rel_entry)
target_start = rels_content.find('Target="', start) + 8
target_end = rels_content.find('"', target_start)
image_path = rels_content[target_start:target_end]
# 构建图片源路径
src_path = os.path.join(temp_dir, 'word', image_path.replace('..', '').lstrip('/'))
if os.path.exists(src_path):
# 创建输出文件名
ext = os.path.splitext(src_path)[1]
img_name = f"image_{img_counter}{ext}"
dest_path = os.path.join(output_folder, img_name)
# 复制图片
shutil.copy(src_path, dest_path)
# 记录位置信息
location = {
"paragraph_index": para_idx,
"run_index": run_idx,
"page_number": None, # docx不直接存储页码
"paragraph_text": paragraph.text[:50] + "..." # 截取部分文本
}
image_data.append({
"image_path": dest_path,
"location": location
})
img_counter += 1
# 清理临时目录
shutil.rmtree(temp_dir)
return image_data
from Util.WordImageUtil import extract_images_from_docx
# Usage example
if __name__ == "__main__":
    word_path = "d:\\dsWork\\dsProject\\dsRag\\Test\\带图的WORD文档.docx"
    # Directory that receives the extracted images.
    output_dir = "../static/Images"
    os.makedirs(output_dir, exist_ok=True)
    images = extract_images_from_docx(word_path, output_dir)
@ -95,5 +14,4 @@ if __name__ == "__main__":
# Report every extracted image. The run index and the text excerpt are only
# printed when the extractor supplied them, because the location dict may
# contain nothing but "paragraph_index".
for img in images:
    print(f"图片保存至: {img['image_path']}")
    loc = img['location']
    if 'run_index' in loc:
        print(f"位置信息: 段落 {loc['paragraph_index']}, 运行 {loc['run_index']}")
    else:
        print(f"位置信息: 段落 {loc['paragraph_index']}")
    if 'paragraph_text' in loc:
        print(f"上下文: {loc['paragraph_text']}\n")

Binary file not shown.

Before

Width:  |  Height:  |  Size: 47 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 120 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 47 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 120 KiB

@ -0,0 +1,80 @@
import os
import shutil
import uuid
import zipfile
from docx import Document
from docx.oxml.ns import nsmap
def extract_images_from_docx(docx_path, output_folder):
"""
从docx提取图片并记录位置
:param docx_path: Word文档路径
:param output_folder: 图片输出文件夹
:return: 包含图片路径和位置的列表
"""
# 创建临时解压目录
temp_dir = os.path.join(output_folder, "temp_docx")
os.makedirs(temp_dir, exist_ok=True)
# 解压docx文件
with zipfile.ZipFile(docx_path, 'r') as zip_ref:
zip_ref.extractall(temp_dir)
# 读取主文档关系
with open(os.path.join(temp_dir, 'word', '_rels', 'document.xml.rels'), 'r') as rels_file:
rels_content = rels_file.read()
# 加载主文档
doc = Document(docx_path)
image_data = []
img_counter = 1
# 遍历所有段落
for para_idx, paragraph in enumerate(doc.paragraphs):
for run_idx, run in enumerate(paragraph.runs):
# 检查运行中的图形
for element in run._element:
if element.tag.endswith('drawing'):
# 提取图片关系ID
blip = element.find('.//a:blip', namespaces=nsmap)
if blip is not None:
embed_id = blip.get('{%s}embed' % nsmap['r'])
# 从关系文件中获取图片文件名
rel_entry = f'<Relationship Id="{embed_id}"'
if rel_entry in rels_content:
start = rels_content.find(rel_entry)
target_start = rels_content.find('Target="', start) + 8
target_end = rels_content.find('"', target_start)
image_path = rels_content[target_start:target_end]
# 构建图片源路径
src_path = os.path.join(temp_dir, 'word', image_path.replace('..', '').lstrip('/'))
if os.path.exists(src_path):
# 创建输出文件名
ext = os.path.splitext(src_path)[1]
# 名称为uuid
img_name = f"{uuid.uuid4().hex}{ext}"
dest_path = os.path.join(output_folder, img_name)
# 复制图片
shutil.copy(src_path, dest_path)
# 记录位置信息
location = {
"paragraph_index": para_idx
}
image_data.append({
"image_path": dest_path,
"location": location
})
img_counter += 1
# 清理临时目录
shutil.rmtree(temp_dir)
return image_data

Binary file not shown.

Binary file not shown.

Before

Width:  |  Height:  |  Size: 47 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 120 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 120 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 47 KiB

Loading…
Cancel
Save