'commit'

4 weeks ago · b8642cd9a9
parent d37e464a5d
commit b8642cd9a9
13 changed files with 98 additions and 46 deletions
--- a/dsRag/Test/TestReadWordImage.py
+++ b/dsRag/Test/TestReadWordImage.py
@@ -1,31 +1,99 @@
-import os
-import uuid
+word_path = "d:\\dsWork\\dsProject\\dsRag\\Test\\带图的WORD文档.docx"
+
 from docx import Document
+from docx.oxml import parse_xml
+from docx.oxml.ns import nsmap
+import os
+import zipfile
+import shutil

-def extract_images_from_word(word_path, output_dir):
+
+def extract_images_from_docx(docx_path, output_folder):
    """
-    从Word文档中提取图片并保存到指定目录
-    :param word_path: Word文档路径
-    :param output_dir: 图片输出目录
+    从docx提取图片并记录位置
+    :param docx_path: Word文档路径
+    :param output_folder: 图片输出文件夹
+    :return: 包含图片路径和位置的列表
    """
-    doc = Document(word_path)
-    
-    # 确保输出目录存在
-    os.makedirs(output_dir, exist_ok=True)
-    
-    # 获取文档中的所有图片
-    for rel in doc.part.rels.values():
-        if "image" in rel.target_ref:
-            img_data = rel.target_part.blob
-            
-            # 使用UUID命名图片
-            output_path = os.path.join(output_dir, f"{uuid.uuid4()}.jpg")
-            with open(output_path, "wb") as f:
-                f.write(img_data)
-            print(f"图片已保存到: {output_path}")
+    # 创建临时解压目录
+    temp_dir = os.path.join(output_folder, "temp_docx")
+    os.makedirs(temp_dir, exist_ok=True)
+
+    # 解压docx文件
+    with zipfile.ZipFile(docx_path, 'r') as zip_ref:
+        zip_ref.extractall(temp_dir)
+
+    # 读取主文档关系
+    with open(os.path.join(temp_dir, 'word', '_rels', 'document.xml.rels'), 'r') as rels_file:
+        rels_content = rels_file.read()
+
+    # 加载主文档
+    doc = Document(docx_path)
+    image_data = []
+    img_counter = 1
+
+    # 遍历所有段落
+    for para_idx, paragraph in enumerate(doc.paragraphs):
+        for run_idx, run in enumerate(paragraph.runs):
+            # 检查运行中的图形
+            for element in run._element:
+                if element.tag.endswith('drawing'):
+                    # 提取图片关系ID
+                    blip = element.find('.//a:blip', namespaces=nsmap)
+                    if blip is not None:
+                        embed_id = blip.get('{%s}embed' % nsmap['r'])
+
+                        # 从关系文件中获取图片文件名
+                        rel_entry = f'<Relationship Id="{embed_id}"'
+                        if rel_entry in rels_content:
+                            start = rels_content.find(rel_entry)
+                            target_start = rels_content.find('Target="', start) + 8
+                            target_end = rels_content.find('"', target_start)
+                            image_path = rels_content[target_start:target_end]
+
+                            # 构建图片源路径
+                            src_path = os.path.join(temp_dir, 'word', image_path.replace('..', '').lstrip('/'))

+                            if os.path.exists(src_path):
+                                # 创建输出文件名
+                                ext = os.path.splitext(src_path)[1]
+                                img_name = f"image_{img_counter}{ext}"
+                                dest_path = os.path.join(output_folder, img_name)
+
+                                # 复制图片
+                                shutil.copy(src_path, dest_path)
+
+                                # 记录位置信息
+                                location = {
+                                    "paragraph_index": para_idx,
+                                    "run_index": run_idx,
+                                    "page_number": None,  # docx不直接存储页码
+                                    "paragraph_text": paragraph.text[:50] + "..."  # 截取部分文本
+                                }
+
+                                image_data.append({
+                                    "image_path": dest_path,
+                                    "location": location
+                                })
+
+                                img_counter += 1
+
+    # 清理临时目录
+    shutil.rmtree(temp_dir)
+    return image_data
+
+
+# 使用示例
 if __name__ == "__main__":
-    word_path = "d:\\dsWork\\dsProject\\dsRag\\Test\\带图的WORD文档.docx"
-    output_dir = os.path.abspath(os.path.join(os.path.dirname(word_path), "..", "static", "Images"))
-    
-    extract_images_from_word(word_path, output_dir)
+
+    output_dir = "extracted_images"  # 图片输出目录
+
+    os.makedirs(output_dir, exist_ok=True)
+    images = extract_images_from_docx(word_path, output_dir)
+
+    # 打印结果
+    for img in images:
+        print(f"图片保存至: {img['image_path']}")
+        loc = img['location']
+        print(f"位置信息: 段落 {loc['paragraph_index']}, 运行 {loc['run_index']}")
+        print(f"上下文: {loc['paragraph_text']}\n")
--- a/dsRag/Test/expand_with_synonyms.py
+++ b/dsRag/Test/expand_with_synonyms.py
@@ -1,21 +0,0 @@
-from nltk.corpus import wordnet
-import jieba
-
-def expand_with_synonyms(query):
-    words = jieba.lcut(query)
-    expanded = []
-    for word in words:
-        synonyms = set()
-        for syn in wordnet.synsets(word, lang='cmn'):
-            for lemma in syn.lemma_names('cmn'):
-                synonyms.add(lemma)
-        if synonyms:
-            expanded.append(f"({'|'.join(synonyms)})")
-        else:
-            expanded.append(word)
-    return ' '.join(expanded)
-
-original_query = "微积分的基本定理是什么？"
-expanded_query = expand_with_synonyms(original_query)
-print(f"原始查询: {original_query}")
-print(f"扩展后查询: {expanded_query}")
--- a/dsRag/static/Images/0cd8322c-de5d-40a2-8d68-8c9e6d92d0a1.jpg
+++ b/dsRag/static/Images/0cd8322c-de5d-40a2-8d68-8c9e6d92d0a1.jpg
--- a/dsRag/Test/extracted_images/image2.png
+++ b/dsRag/Test/extracted_images/image2.png
--- a/dsRag/Test/extracted_images/image_1.png
+++ b/dsRag/Test/extracted_images/image_1.png
--- a/dsRag/Test/extracted_images/image_2.png
+++ b/dsRag/Test/extracted_images/image_2.png
--- a/dsRag/Test/question_1.txt
+++ b/dsRag/Test/question_1.txt
@@ -0,0 +1,4 @@
+问题1 教学建议与意义
+在教学过程中，引导学生构建和理解模型，不仅能提升他们分析和解决问题的能力，还能激发他们发现问题和提出问题的意识。例如，在认识路程模型时，教师可通过生活化情境让学生理解速度的概念及其单位表示。
+模型思想是《义务教育数学课程标准》中强调的核心素养之一，它帮助学生建立从现实世界抽象出数学问题的能力，并通过数学语言进行描述和解释。
+因此，在“综合与实践”类教学内容中，应加强模型的应用训练，以培养学生应用数学知识解决实际问题的能力。
--- a/dsRag/Test/question_2.txt
+++ b/dsRag/Test/question_2.txt
@@ -0,0 +1 @@
+问题2 我随便写点什么
--- a/dsRag/Test/带图的WORD文档.docx
+++ b/dsRag/Test/带图的WORD文档.docx
--- a/dsRag/static/Images/47d607f9-4172-4436-a5be-bfb830415911.jpg
+++ b/dsRag/static/Images/47d607f9-4172-4436-a5be-bfb830415911.jpg
--- a/dsRag/static/Images/7a520388-ac62-40b7-badb-239c78056eb5.jpg
+++ b/dsRag/static/Images/7a520388-ac62-40b7-badb-239c78056eb5.jpg
--- a/dsRag/static/Images/9c5dd94a-0e19-4df4-bf69-8fb993008df8.jpg
+++ b/dsRag/static/Images/9c5dd94a-0e19-4df4-bf69-8fb993008df8.jpg
--- a/dsRag/static/Images/deaabcca-29ba-4c79-9b7d-2bb54aa4270b.jpg
+++ b/dsRag/static/Images/deaabcca-29ba-4c79-9b7d-2bb54aa4270b.jpg