# dsProject/dsRagAnything/Tools/T1_Office_document_test.py

#!/usr/bin/env python3
"""
Office文档解析测试脚本 - RAG-Anything项目

本脚本演示如何使用MinerU解析各种Office文档格式，包括：
- DOC/DOCX (Word文档)
- PPT/PPTX (PowerPoint演示文稿)
- XLS/XLSX (Excel电子表格)

要求：
1. 系统已安装LibreOffice
2. 已安装RAG-Anything包

使用方法：
    python office_document_test.py --file 办公文档路径.docx
"""

import argparse
import sys
from pathlib import Path
from raganything import RAGAnything, RAGAnythingConfig


def check_libreoffice_installation():
    """
    Report whether LibreOffice is installed and usable.

    The availability probe is currently switched off: no external command is
    run and the answer is unconditionally positive.

    Returns:
        bool: Always True while the probe is disabled.
    """
    # NOTE: the disabled probe ran `libreoffice --version` and then
    # `soffice --version` (10 s timeout each), returned True on the first
    # command that succeeded, and otherwise printed per-platform install
    # hints and returned False.
    libreoffice_available = True
    return libreoffice_available


def _print_parse_report(content_list, md_content):
    """
    Print a console summary of one parse result.

    Args:
        content_list: Parsed content blocks. Only dict entries are inspected;
            their "type", "text", "img_path" and "table_body" keys are read.
        md_content (str): Markdown rendering of the parsed document.
    """
    from collections import Counter

    print("✅ 解析成功!")
    print(f"   📊 内容块数量: {len(content_list)}")
    print(f"   📝 Markdown长度: {len(md_content)} 字符")

    # Non-dict entries carry no "type" key, so they are left out of every
    # per-type section below.
    blocks = [item for item in content_list if isinstance(item, dict)]

    type_counts = Counter(block.get("type", "unknown") for block in blocks)
    if type_counts:
        print("   📋 内容类型分布:")
        for content_type, count in sorted(type_counts.items()):
            print(f"      • {content_type}: {count}")

    # The ellipsis is decided on the stripped text, i.e. on what is shown.
    stripped_md = md_content.strip()
    if stripped_md:
        print("\n📄 解析内容预览(前500字符):")
        print(f"   {stripped_md[:500]}{'...' if len(stripped_md) > 500 else ''}")

    text_blocks = [block for block in blocks if block.get("type") == "text"]
    if text_blocks:
        print("\n📝 文本块示例:")
        for i, block in enumerate(text_blocks[:3], 1):
            # `or ""` keeps a None "text" value from aborting the report.
            text = (block.get("text") or "").strip()
            if text:
                print(f"   {i}. {text[:200]}{'...' if len(text) > 200 else ''}")

    image_blocks = [block for block in blocks if block.get("type") == "image"]
    if image_blocks:
        print(f"\n🖼️  找到 {len(image_blocks)} 张图片:")
        for i, block in enumerate(image_blocks, 1):
            print(f"   {i}. 图片路径: {block.get('img_path', 'N/A')}")

    table_blocks = [block for block in blocks if block.get("type") == "table"]
    if table_blocks:
        print(f"\n📊 找到 {len(table_blocks)} 个表格:")
        for i, block in enumerate(table_blocks, 1):
            # A missing/None/empty body is reported as 0 rows.
            table_body = block.get("table_body") or ""
            row_count = len(table_body.splitlines())
            print(f"   {i}. 包含 {row_count} 行的表格")


def test_office_document_parsing(file_path: str, output_dir=None) -> bool:
    """
    Parse one Office document with RAGAnything and print a report.

    Args:
        file_path (str): Path of the Office document to parse.
        output_dir: Directory that receives the parser output (str or Path).
            Defaults to a ``test_output`` folder next to this script.

    Returns:
        bool: True when parsing succeeded. False when the file is missing,
        its extension is unsupported, MinerU is reported as not installed,
        or an exception was raised while parsing/reporting.
    """
    import os

    # Keep a LIBREOFFICE_PATH the caller already exported; otherwise fall
    # back to the default Windows install location.
    # NOTE(review): whether the parser reads this variable is not visible
    # from this file - confirm.
    os.environ.setdefault(
        "LIBREOFFICE_PATH", "C:\\Program Files\\LibreOffice\\program"
    )
    print(f"🧪 测试Office文档解析: {file_path}")

    # Validate the input before any heavy import or object construction.
    file_path = Path(file_path)
    if not file_path.exists():
        print(f"❌ 文件不存在: {file_path}")
        return False

    supported_extensions = {".doc", ".docx", ".ppt", ".pptx", ".xls", ".xlsx"}
    if file_path.suffix.lower() not in supported_extensions:
        print(f"❌ 不支持的文档格式: {file_path.suffix}")
        # Sorted so the message is stable from run to run.
        print(f"   支持的格式: {', '.join(sorted(supported_extensions))}")
        return False

    print(f"📄 文档格式: {file_path.suffix.upper()}")
    print(f"📏 文件大小: {file_path.stat().st_size / 1024:.1f} KB")

    # One output location for the whole run, resolved relative to this
    # script instead of a machine-specific absolute path.
    if output_dir is None:
        output_dir = Path(__file__).resolve().parent / "test_output"
    else:
        output_dir = Path(output_dir).absolute()

    # RAGAnything is used here only for its parsing entry point.
    from raganything.config import RAGAnythingConfig
    config = RAGAnythingConfig(working_dir="./temp_parsing_test")
    rag = RAGAnything(config=config)

    try:
        from raganything.mineru_parser import MineruParser
        if not MineruParser.check_installation():
            print("❌ MinerU未正确安装")
            return False

        output_dir.mkdir(parents=True, exist_ok=True)

        print("\n🔄 使用MinerU测试文档解析...")
        content_list, md_content = rag.parse_document(
            file_path=str(file_path),
            output_dir=str(output_dir),
            parse_method="auto",
            display_stats=True,
        )

        print(f"\n📂 输出目录内容({output_dir}):")
        for entry in sorted(output_dir.glob("*")):
            print(f"   - {entry.name}")

        # Look for a same-named PDF only after parsing has run, and report
        # its absence neutrally.
        # NOTE(review): whether the parser leaves a converted PDF in the
        # output directory is not visible from this file - confirm.
        pdf_path = output_dir / f"{file_path.stem}.pdf"
        print(f"PDF转换路径: {pdf_path}")
        if pdf_path.exists():
            print(f"✅ PDF已生成，大小: {pdf_path.stat().st_size}字节")
        else:
            print("ℹ️ 输出目录中未找到PDF文件")

        _print_parse_report(content_list, md_content)

        print("\n🎉 Office文档解析测试成功完成!")
        print(f"📁 输出文件保存到: {output_dir}")
        return True

    except Exception as e:
        print(f"\n❌ Office文档解析失败: {str(e)}")
        import traceback

        print(f"   完整错误: {traceback.format_exc()}")
        return False


# The function name matches pytest's ``test_*`` pattern and this file matches
# ``*_test.py``; without this flag pytest would collect the function and fail
# looking for a ``file_path`` fixture.
test_office_document_parsing.__test__ = False


def main(argv=None):
    """
    Command-line entry point: parse arguments and run the parsing test.

    Args:
        argv: Argument list to parse. None (the default) makes argparse read
            ``sys.argv[1:]``.

    Returns:
        int: 0 when the parsing test succeeded, 1 otherwise (LibreOffice
        check failed, parsing failed, user interrupt, or unexpected error).
    """
    parser = argparse.ArgumentParser(description="使用MinerU测试Office文档解析")
    parser.add_argument(
        "--file",
        # Default keeps the previously hard-coded sample document, so running
        # the script without arguments behaves as before.
        default=r"../Txt/小学数学教学中的若干问题_MATH_1.docx",
        help="要解析的Office文档路径 (默认: %(default)s)",
    )
    args = parser.parse_args(argv)
    file_path = args.file

    print("🔧 检查LibreOffice安装状态...")
    if not check_libreoffice_installation():
        return 1

    try:
        success = test_office_document_parsing(file_path)
        return 0 if success else 1
    except KeyboardInterrupt:
        print("\n⏹️ 测试被用户中断")
        return 1
    except Exception as e:
        # Top-level boundary: report and turn into a non-zero exit code.
        print(f"\n❌ 发生意外错误: {str(e)}")
        return 1


if __name__ == "__main__":
    sys.exit(main())
# NOTE: a diff-formatted duplicate of this script (web-scrape residue) followed here and was removed; it was not valid Python.