'commit'

4 weeks ago · 9157880400
parent b8642cd9a9
commit 9157880400
12 changed files with 84 additions and 86 deletions
--- a/dsRag/Test/TestReadWordImage.py
+++ b/dsRag/Test/TestReadWordImage.py
@ -1,93 +1,12 @@
-word_path = "d:\\dsWork\\dsProject\\dsRag\\Test\\带图的WORD文档.docx"
-
-from docx import Document
-from docx.oxml import parse_xml
-from docx.oxml.ns import nsmap
 import os
-import zipfile
-import shutil
-
-
-def extract_images_from_docx(docx_path, output_folder):
-    """
-    从docx提取图片并记录位置
-    :param docx_path: Word文档路径
-    :param output_folder: 图片输出文件夹
-    :return: 包含图片路径和位置的列表
-    """
-    # 创建临时解压目录
-    temp_dir = os.path.join(output_folder, "temp_docx")
-    os.makedirs(temp_dir, exist_ok=True)
-
-    # 解压docx文件
-    with zipfile.ZipFile(docx_path, 'r') as zip_ref:
-        zip_ref.extractall(temp_dir)
-
-    # 读取主文档关系
-    with open(os.path.join(temp_dir, 'word', '_rels', 'document.xml.rels'), 'r') as rels_file:
-        rels_content = rels_file.read()
-
-    # 加载主文档
-    doc = Document(docx_path)
-    image_data = []
-    img_counter = 1
-
-    # 遍历所有段落
-    for para_idx, paragraph in enumerate(doc.paragraphs):
-        for run_idx, run in enumerate(paragraph.runs):
-            # 检查运行中的图形
-            for element in run._element:
-                if element.tag.endswith('drawing'):
-                    # 提取图片关系ID
-                    blip = element.find('.//a:blip', namespaces=nsmap)
-                    if blip is not None:
-                        embed_id = blip.get('{%s}embed' % nsmap['r'])
-
-                        # 从关系文件中获取图片文件名
-                        rel_entry = f'<Relationship Id="{embed_id}"'
-                        if rel_entry in rels_content:
-                            start = rels_content.find(rel_entry)
-                            target_start = rels_content.find('Target="', start) + 8
-                            target_end = rels_content.find('"', target_start)
-                            image_path = rels_content[target_start:target_end]

-                            # 构建图片源路径
-                            src_path = os.path.join(temp_dir, 'word', image_path.replace('..', '').lstrip('/'))
-
-                            if os.path.exists(src_path):
-                                # 创建输出文件名
-                                ext = os.path.splitext(src_path)[1]
-                                img_name = f"image_{img_counter}{ext}"
-                                dest_path = os.path.join(output_folder, img_name)
-
-                                # 复制图片
-                                shutil.copy(src_path, dest_path)
-
-                                # 记录位置信息
-                                location = {
-                                    "paragraph_index": para_idx,
-                                    "run_index": run_idx,
-                                    "page_number": None,  # docx不直接存储页码
-                                    "paragraph_text": paragraph.text[:50] + "..."  # 截取部分文本
-                                }
-
-                                image_data.append({
-                                    "image_path": dest_path,
-                                    "location": location
-                                })
-
-                                img_counter += 1
-
-    # 清理临时目录
-    shutil.rmtree(temp_dir)
-    return image_data
+from Util.WordImageUtil import extract_images_from_docx


 # 使用示例
 if __name__ == "__main__":
-
-    output_dir = "extracted_images"  # 图片输出目录
-
+    word_path = "d:\\dsWork\\dsProject\\dsRag\\Test\\带图的WORD文档.docx"
+    output_dir = "../static/Images"  # 图片输出目录
    os.makedirs(output_dir, exist_ok=True)
    images = extract_images_from_docx(word_path, output_dir)

@ -95,5 +14,4 @@ if __name__ == "__main__":
    for img in images:
        print(f"图片保存至: {img['image_path']}")
        loc = img['location']
-        print(f"位置信息: 段落 {loc['paragraph_index']}, 运行 {loc['run_index']}")
-        print(f"上下文: {loc['paragraph_text']}\n")
+        print(f"位置信息: 段落 {loc['paragraph_index']}")
--- a/dsRag/Test/extracted_images/image1.png
+++ b/dsRag/Test/extracted_images/image1.png
--- a/dsRag/Test/extracted_images/image2.png
+++ b/dsRag/Test/extracted_images/image2.png
--- a/dsRag/Test/extracted_images/image_1.png
+++ b/dsRag/Test/extracted_images/image_1.png
--- a/dsRag/Test/extracted_images/image_2.png
+++ b/dsRag/Test/extracted_images/image_2.png
--- a/dsRag/Util/WordImageUtil.py
+++ b/dsRag/Util/WordImageUtil.py
@ -0,0 +1,80 @@
+import os
+import shutil
+import uuid
+import zipfile
+
+from docx import Document
+from docx.oxml.ns import nsmap
+
+
+def extract_images_from_docx(docx_path, output_folder):
+    """
+    Extract the images embedded in a .docx file and record where each occurs.
+
+    Every ``w:drawing`` found in a paragraph run is resolved through the
+    document part's relationship table, and the referenced image bytes are
+    written to ``output_folder`` under a random UUID file name that keeps the
+    original extension. The archive is never unpacked to disk, so no
+    temporary directory is created and nothing is left behind on failure.
+
+    :param docx_path: path of the Word document to read
+    :param output_folder: directory that receives the image files; it is
+        created when it does not exist yet
+    :return: list of dicts, in document order, each shaped like
+        ``{"image_path": <written file>, "location": {"paragraph_index": <int>}}``
+    """
+    # Callers may pass a folder that does not exist yet.
+    os.makedirs(output_folder, exist_ok=True)
+
+    doc = Document(docx_path)
+    # rId -> package part, for the internal relationships of the main document.
+    # External (linked) targets are not present in this mapping.
+    related_parts = doc.part.related_parts
+    embed_attr = '{%s}embed' % nsmap['r']
+    image_data = []
+
+    for para_idx, paragraph in enumerate(doc.paragraphs):
+        for run in paragraph.runs:
+            for element in run._element:
+                if not element.tag.endswith('drawing'):
+                    continue
+
+                blip = element.find('.//a:blip', namespaces=nsmap)
+                if blip is None:
+                    continue
+
+                # A blip without r:embed (e.g. a linked picture) or with an
+                # unknown rId yields None here and is skipped.
+                image_part = related_parts.get(blip.get(embed_attr))
+                if image_part is None:
+                    continue
+
+                # partname looks like "/word/media/image1.png"
+                ext = os.path.splitext(image_part.partname)[1]
+                # UUID names keep repeated runs from overwriting earlier output
+                img_name = f"{uuid.uuid4().hex}{ext}"
+                dest_path = os.path.join(output_folder, img_name)
+
+                with open(dest_path, 'wb') as image_file:
+                    image_file.write(image_part.blob)
+
+                image_data.append({
+                    "image_path": dest_path,
+                    "location": {
+                        "paragraph_index": para_idx
+                    }
+                })
+
+    return image_data
--- a/dsRag/Util/pycache/WordImageUtil.cpython-310.pyc
+++ b/dsRag/Util/pycache/WordImageUtil.cpython-310.pyc
--- a/dsRag/requirements.txt
+++ b/dsRag/requirements.txt
--- a/dsRag/static/Images/47d607f9-4172-4436-a5be-bfb830415911.jpg
+++ b/dsRag/static/Images/47d607f9-4172-4436-a5be-bfb830415911.jpg
--- a/dsRag/static/Images/7a520388-ac62-40b7-badb-239c78056eb5.jpg
+++ b/dsRag/static/Images/7a520388-ac62-40b7-badb-239c78056eb5.jpg
--- a/dsRag/static/Images/9c5dd94a-0e19-4df4-bf69-8fb993008df8.jpg
+++ b/dsRag/static/Images/9c5dd94a-0e19-4df4-bf69-8fb993008df8.jpg
--- a/dsRag/static/Images/deaabcca-29ba-4c79-9b7d-2bb54aa4270b.jpg
+++ b/dsRag/static/Images/deaabcca-29ba-4c79-9b7d-2bb54aa4270b.jpg