'commit'

4 weeks ago · 6ab546626d
parent feaa8eaaf8
commit 6ab546626d
7 changed files with 114 additions and 25 deletions
--- a/dsRag/Test/Test_MatchImage.py
+++ b/dsRag/Test/Test_MatchImage.py
@ -1,9 +1,78 @@
-import os
-
+import re
 import docx

-from Util.WordImageUtil import extract_images_from_docx
-
+import os
+import shutil
+import uuid
+import zipfile
+
+from docx import Document
+from docx.oxml.ns import nsmap
+
+
+def extract_images_from_docx(docx_path, output_folder):
+    """
+    Copy the images embedded in a .docx file into output_folder.
+
+    Only drawings inside the runs of doc.paragraphs are visited. Each image
+    is saved under a random uuid4 hex name plus its original extension.
+
+    :param docx_path: path of the Word document
+    :param output_folder: folder that receives the copied images
+    :return: list of saved image file names (not full paths), in visiting order
+    """
+    # Imported locally so this function does not rely on new file-level imports.
+    import xml.etree.ElementTree as ET
+
+    image_data = []
+    # The archive is unpacked into a scratch folder inside output_folder.
+    temp_dir = os.path.join(output_folder, "temp_docx")
+    os.makedirs(temp_dir, exist_ok=True)
+
+    # try/finally: the scratch folder is removed even when extraction fails.
+    try:
+        with zipfile.ZipFile(docx_path, 'r') as zip_ref:
+            zip_ref.extractall(temp_dir)
+
+        # Map relationship Id -> Target with an XML parser: it honours the
+        # encoding declared in the file (open() fell back to the locale default)
+        # and does not require "Id" to be the first attribute of the element.
+        rels_path = os.path.join(temp_dir, 'word', '_rels', 'document.xml.rels')
+        targets = {
+            rel.get('Id'): rel.get('Target')
+            for rel in ET.parse(rels_path).getroot().iter()
+            if rel.get('Id') and rel.get('Target')
+        }
+
+        doc = Document(docx_path)
+        for paragraph in doc.paragraphs:
+            for run in paragraph.runs:
+                for element in run._element:
+                    if not element.tag.endswith('drawing'):
+                        continue
+                    # The blip element holds the relationship id of the image.
+                    blip = element.find('.//a:blip', namespaces=nsmap)
+                    if blip is None:
+                        continue
+                    embed_id = blip.get('{%s}embed' % nsmap['r'])
+                    image_path = targets.get(embed_id)
+                    if image_path is None:
+                        continue
+
+                    # Locate the file under the extracted "word" folder.
+                    src_path = os.path.join(temp_dir, 'word', image_path.replace('..', '').lstrip('/'))
+                    if not os.path.exists(src_path):
+                        continue
+
+                    # Random file name; the source extension is kept.
+                    ext = os.path.splitext(src_path)[1]
+                    img_name = f"{uuid.uuid4().hex}{ext}"
+                    dest_path = os.path.join(output_folder, img_name)
+                    shutil.copy(src_path, dest_path)
+                    image_data.append(img_name)
+    finally:
+        shutil.rmtree(temp_dir)
+    return image_data

 def read_word_content(docx_path):
    res = ""
@ -73,25 +142,43 @@ def save_to_txt(content, file_path, mode='w'):
        print(f"保存文件{file_path}时出错: {str(e)}")
        return False

+
 if __name__ == "__main__":
    word_document_path = "d:\\dsWork\\dsProject\\dsRag\\static\\Test\\带图的WORD文档_MATH_3.docx"
-    output_dir="d:\\dsWork\\dsProject\\dsRag\\static\\Test\\"
-    extract_images_from_docx(word_document_path, output_dir)
+    output_dir = "d:\\dsWork\\dsProject\\dsRag\\static\\Test\\"
+
+    # Extract the embedded images; listImage holds the saved file names.
+    listImage = extract_images_from_docx(word_document_path, output_dir)
+    # Read the document text.
    res = read_word_content(word_document_path)
+    # Split the text into chunks.
    chunks = split_into_blocks(res)
+    # Number of chunk files written successfully.
+    saved_count = 0
+
+    # 【图片N】 placeholders are swapped for extracted image names.
+    pattern = re.compile(r'【图片\d+】')
+    # One iterator shared by every chunk: restarting at the first image for
+    # each chunk gave chunks 1 and 2 the same file name. Assumes placeholders
+    # occur in the order the images were extracted - TODO confirm.
+    remaining_images = iter(listImage)
+
+    def replacer(match):
+        """Return the next unused image name, or the placeholder if none remain."""
+        return next(remaining_images, match.group())
+
    for x in chunks:
-        print("===段落开始：===")
        firstLine = x[1].split("\n")[0].strip()
        content = x[1][len(firstLine):].strip()
-        print("firstLine=" + firstLine)
-        print("content=" + content)
-        print("===段落结束：===\n")
-
-        saved_count=0
-        for chunk_num, chunk in chunks:
-            chunk = chunk.strip()  # 确保去除空白字符
-            output_file = os.path.join(output_dir, f"{chunk_num}.txt")
-            if save_to_txt(chunk, output_file, mode='w'):
-                saved_count += 1
-
-        print(f"处理完成，共保存{saved_count}个文件到目录: {output_dir}")
+
+        # Swap each placeholder for an image name, in extraction order.
+        content = pattern.sub(replacer, content)
+        # Write the chunk to "<chunk number>.txt":
+        # its first line, a newline, then the remaining text.
+        output_file = os.path.join(output_dir, f"{x[0]}.txt")
+        full_content = f"{firstLine}\n{content}"
+        if save_to_txt(full_content, output_file, mode='w'):
+            saved_count += 1
+
+    # Report how many chunk files were saved.
+    print(f"处理完成，共保存{saved_count}个文件到目录: {output_dir}")
--- a/dsRag/Util/WordImageUtil.py
+++ b/dsRag/Util/WordImageUtil.py
@ -14,6 +14,8 @@ def extract_images_from_docx(docx_path, output_folder):
    :param output_folder: 图片输出文件夹
    :return: 包含图片路径和位置的列表
    """
+    # 创建一个List<String> 记录每个图片的名称和序号
+    image_data = []
    # 创建临时解压目录
    temp_dir = os.path.join(output_folder, "temp_docx")
    os.makedirs(temp_dir, exist_ok=True)
@ -30,8 +32,6 @@ def extract_images_from_docx(docx_path, output_folder):
    doc = Document(docx_path)
    img_counter = 1

-    idx = 0
-
    # 遍历所有段落
    for para_idx, paragraph in enumerate(doc.paragraphs):
        for run_idx, run in enumerate(paragraph.runs):
@ -58,8 +58,9 @@ def extract_images_from_docx(docx_path, output_folder):
                                # 创建输出文件名
                                ext = os.path.splitext(src_path)[1]
                                # 名称为uuid
-                                idx = idx + 1
-                                img_name = f"{idx}{ext}"
+                                fileName=uuid.uuid4().hex
+                                img_name = f"{fileName}{ext}"
+                                image_data.append(img_name)
                                dest_path = os.path.join(output_folder, img_name)
                                # 复制图片
                                shutil.copy(src_path, dest_path)
@ -68,3 +69,4 @@ def extract_images_from_docx(docx_path, output_folder):

    # 清理临时目录
    shutil.rmtree(temp_dir)
+    return image_data
--- a/dsRag/Util/pycache/WordImageUtil.cpython-310.pyc
+++ b/dsRag/Util/pycache/WordImageUtil.cpython-310.pyc
--- a/dsRag/static/Test/1.txt
+++ b/dsRag/static/Test/1.txt
@ -2,4 +2,4 @@
 在教学过程中，引导学生构建和理解模型，不仅能提升他们分析和解决问题的能力，还能激发他们发现问题和提出问题的意识。例如，在认识路程模型时，教师可通过生活化情境让学生理解速度的概念及其单位表示。
 模型思想是《义务教育数学课程标准》中强调的核心素养之一，它帮助学生建立从现实世界抽象出数学问题的能力，并通过数学语言进行描述和解释。
 因此，在“综合与实践”类教学内容中，应加强模型的应用训练，以培养学生应用数学知识解决实际问题的能力。
-【图片1】
+81572c98254043d4a475ab7381979a67.png
--- a/dsRag/static/Test/2.txt
+++ b/dsRag/static/Test/2.txt
@ -1,2 +1,2 @@
 我随便写点什么
-【图片2】
+81572c98254043d4a475ab7381979a67.png
--- a/dsRag/static/Test/81572c98254043d4a475ab7381979a67.png
+++ b/dsRag/static/Test/81572c98254043d4a475ab7381979a67.png
--- a/dsRag/static/Test/b536401321764b20b3c000c120ab3c5b.png
+++ b/dsRag/static/Test/b536401321764b20b3c000c120ab3c5b.png