"""Extract the images embedded in a Word (.docx) document.

Reconstructed from the diff that added ``dsRag/Test/TestReadWordImage.py``.
The same diff also added the sample document
``dsRag/Test/带图的WORD文档.docx`` and one extracted image,
``dsRag/static/Images/0cd8322c-de5d-40a2-8d68-8c9e6d92d0a1.jpg``.
"""
import os
import uuid

from docx import Document


def extract_images_from_word(word_path, output_dir):
    """Save every image embedded in a Word document to ``output_dir``.

    Each image is written under a freshly generated UUID file name. The file
    extension is taken from the image part stored inside the document, so a
    PNG is saved as ``.png`` rather than being mislabelled; ``.jpg`` is used
    only when the part name carries no extension.

    :param word_path: Path of the Word document to read.
    :param output_dir: Directory that receives the extracted images; it is
        created if it does not exist.
    :return: List of the file paths that were written, in the order the
        document's relationships were visited.
    """
    doc = Document(word_path)

    # Make sure the output directory exists.
    os.makedirs(output_dir, exist_ok=True)

    saved_paths = []
    for rel in doc.part.rels.values():
        # Select by relationship type, not by file name: media files are not
        # guaranteed to be called "image*", and other targets (for example a
        # hyperlink URL) may contain the word "image".
        if not rel.reltype.endswith("/image"):
            continue

        # A linked (external) image has no part inside the package, and
        # accessing ``target_part`` on such a relationship raises ValueError.
        if rel.is_external:
            continue

        image_part = rel.target_part
        extension = os.path.splitext(str(image_part.partname))[1] or ".jpg"

        # Name the image with a UUID so repeated runs never overwrite files.
        output_path = os.path.join(output_dir, f"{uuid.uuid4()}{extension}")
        with open(output_path, "wb") as f:
            f.write(image_part.blob)
        print(f"图片已保存到: {output_path}")
        saved_paths.append(output_path)

    return saved_paths


if __name__ == "__main__":
    # Resolve paths relative to this script instead of a machine-specific
    # absolute path: the sample document lives next to the script and the
    # images go to the sibling ``static/Images`` directory.
    test_dir = os.path.dirname(os.path.abspath(__file__))
    word_path = os.path.join(test_dir, "带图的WORD文档.docx")
    output_dir = os.path.abspath(os.path.join(test_dir, "..", "static", "Images"))

    extract_images_from_word(word_path, output_dir)