diff --git a/dsRag/Test/TestReadWordContent.py b/dsRag/Test/TestReadWordContent.py index 21e415db..74fa6c51 100644 --- a/dsRag/Test/TestReadWordContent.py +++ b/dsRag/Test/TestReadWordContent.py @@ -2,11 +2,13 @@ import os import docx from docx.oxml.ns import nsmap + def read_word_content(docx_path): + idx = 0 """遍历Word文档的每个段落,输出文字或图片标识""" try: doc = docx.Document(docx_path) - + for paragraph in doc.paragraphs: has_image = False # 检查段落中是否有图片 @@ -18,15 +20,17 @@ def read_word_content(docx_path): break if has_image: break - + if has_image: - print("【图片】") + idx = idx + 1 + print("【图片" + str(idx) + "】") elif paragraph.text.strip(): print(paragraph.text.strip()) except Exception as e: print(f"处理Word文档时出错: {str(e)}") + if __name__ == "__main__": # 示例用法 # 请将 'your_document.docx' 替换为你的Word文档路径 @@ -34,4 +38,4 @@ if __name__ == "__main__": if os.path.exists(word_document_path): read_word_content(word_document_path) else: - print(f"文件不存在: {word_document_path}") \ No newline at end of file + print(f"文件不存在: {word_document_path}")