dsProject/dsRagAnything/Tools/T1_Office_document_test.py

#!/usr/bin/env python3
"""
Office Document Parsing Test Script for RAG-Anything

This script demonstrates how to parse various Office document formats
using MinerU, including DOC, DOCX, PPT, PPTX, XLS, and XLSX files.

Requirements:
- LibreOffice installed on the system
- RAG-Anything package

Usage:
    python T1_Office_document_test.py --file path/to/office/document.docx
"""

import argparse
import sys
from pathlib import Path
from raganything import RAGAnything


def check_libreoffice_installation():
    """Report whether a LibreOffice executable can be launched.

    Tries ``libreoffice --version`` and then ``soffice --version``. The first
    command that exits successfully within 10 seconds has its version banner
    printed and ends the search.

    Returns:
        True if one of the commands ran successfully; False otherwise, after
        printing per-platform installation hints.
    """
    import subprocess

    # Outcomes that mean "this candidate is unusable, try the next one".
    probe_failures = (
        subprocess.CalledProcessError,
        FileNotFoundError,
        subprocess.TimeoutExpired,
    )

    for executable in ("libreoffice", "soffice"):
        try:
            completed = subprocess.run(
                [executable, "--version"],
                capture_output=True,
                check=True,
                timeout=10,
            )
        except probe_failures:
            continue
        else:
            version_banner = completed.stdout.decode().strip()
            print(f"✅ LibreOffice found: {version_banner}")
            return True

    install_hints = (
        "❌ LibreOffice not found. Please install LibreOffice:",
        "  - Windows: Download from https://www.libreoffice.org/download/download/",
        "  - macOS: brew install --cask libreoffice",
        "  - Ubuntu/Debian: sudo apt-get install libreoffice",
        "  - CentOS/RHEL: sudo yum install libreoffice",
    )
    for hint in install_hints:
        print(hint)
    return False


def _clip_preview(text: str, limit: int) -> str:
    """Return *text* stripped and cut to *limit* characters, adding '...' if cut."""
    stripped = text.strip()
    # Decide on the ellipsis from the stripped text, i.e. the text actually shown.
    return stripped[:limit] + ("..." if len(stripped) > limit else "")


def _items_of_type(content_list, wanted_type: str) -> list:
    """Return the dict entries of *content_list* whose "type" equals *wanted_type*."""
    return [
        item
        for item in content_list
        if isinstance(item, dict) and item.get("type") == wanted_type
    ]


def test_office_document_parsing(file_path: str) -> bool:
    """Parse one Office document with RAGAnything and print a summary.

    Validates that the file exists and has a supported Office extension, then
    calls ``RAGAnything.parse_document`` and prints block counts, a markdown
    preview, sample text blocks, image paths and table row counts.

    Args:
        file_path: Path to a .doc/.docx/.ppt/.pptx/.xls/.xlsx file.

    Returns:
        True if parsing and reporting completed; False if the file is missing,
        has an unsupported extension, or parsing raised an exception (the
        traceback is printed in that case).
    """

    print(f"🧪 Testing Office document parsing: {file_path}")

    # Check if file exists and is a supported Office format
    file_path = Path(file_path)
    if not file_path.exists():
        print(f"❌ File does not exist: {file_path}")
        return False

    supported_extensions = {".doc", ".docx", ".ppt", ".pptx", ".xls", ".xlsx"}
    if file_path.suffix.lower() not in supported_extensions:
        print(f"❌ Unsupported file format: {file_path.suffix}")
        # Sorted so the message is stable across runs (set order is arbitrary).
        print(f"   Supported formats: {', '.join(sorted(supported_extensions))}")
        return False

    print(f"📄 File format: {file_path.suffix.upper()}")
    print(f"📏 File size: {file_path.stat().st_size / 1024:.1f} KB")

    # Initialize RAGAnything (only for parsing functionality)
    rag = RAGAnything()

    try:
        # Test document parsing with MinerU
        print("\n🔄 Testing document parsing with MinerU...")
        content_list, md_content = rag.parse_document(
            file_path=str(file_path),
            output_dir="./test_output",
            parse_method="auto",
            display_stats=True,
        )

        print("✅ Parsing successful!")
        print(f"   📊 Content blocks: {len(content_list)}")
        print(f"   📝 Markdown length: {len(md_content)} characters")

        # Analyze content types
        content_types = {}
        for item in content_list:
            if isinstance(item, dict):
                content_type = item.get("type", "unknown")
                content_types[content_type] = content_types.get(content_type, 0) + 1

        if content_types:
            print("   📋 Content distribution:")
            for content_type, count in sorted(content_types.items()):
                print(f"      • {content_type}: {count}")

        # Display some parsed content preview
        if md_content.strip():
            print("\n📄 Parsed content preview (first 500 characters):")
            print(f"   {_clip_preview(md_content, 500)}")

        # Display some structured content examples
        text_items = _items_of_type(content_list, "text")
        if text_items:
            print("\n📝 Sample text blocks:")
            for i, item in enumerate(text_items[:3], 1):
                # `or ""` guards against an explicit None stored under "text".
                text_content = item.get("text") or ""
                if text_content.strip():
                    print(f"   {i}. {_clip_preview(text_content, 200)}")

        # Check for images
        image_items = _items_of_type(content_list, "image")
        if image_items:
            print(f"\n🖼️  Found {len(image_items)} image(s):")
            for i, item in enumerate(image_items, 1):
                print(f"   {i}. Image path: {item.get('img_path', 'N/A')}")

        # Check for tables
        table_items = _items_of_type(content_list, "table")
        if table_items:
            print(f"\n📊 Found {len(table_items)} table(s):")
            for i, item in enumerate(table_items, 1):
                table_body = item.get("table_body") or ""
                # Count non-blank lines so an empty body reports 0 rows, not 1.
                # NOTE(review): assumes one table row per line; if table_body
                # is HTML this undercounts — confirm the parser's format.
                row_count = sum(1 for line in table_body.splitlines() if line.strip())
                print(f"   {i}. Table with {row_count} rows")

        print("\n🎉 Office document parsing test completed successfully!")
        print("📁 Output files saved to: ./test_output")
        return True

    except Exception as e:
        print(f"\n❌ Office document parsing failed: {str(e)}")
        import traceback

        print(f"   Full error: {traceback.format_exc()}")
        return False


# The name matches pytest's default ``test_*`` pattern, but this is a script
# helper that needs a real ``file_path`` argument; opt out of collection so
# pytest does not fail looking for a ``file_path`` fixture.
test_office_document_parsing.__test__ = False


def main(argv=None):
    """Command-line entry point for the Office document parsing test.

    Options:
        --file PATH          Document to parse. Defaults to the sample document
                             that was previously hard-coded, so running with no
                             arguments behaves as before.
        --check-libreoffice  Only check whether LibreOffice can be launched,
                             then exit without parsing anything.

    Args:
        argv: Argument list to parse; when None, argparse reads sys.argv[1:].

    Returns:
        Process exit status: 0 on success, 1 on failure or interruption.
    """
    default_document = r"D:\dsWork\dsProject\dsRagAnything\Txt\小学数学教学中的若干问题_MATH_1.docx"

    parser = argparse.ArgumentParser(
        description="Test Office document parsing with MinerU"
    )
    parser.add_argument(
        "--file",
        default=default_document,
        help="Path to the Office document to test",
    )
    parser.add_argument(
        "--check-libreoffice",
        action="store_true",
        help="Only check the LibreOffice installation and exit",
    )
    args = parser.parse_args(argv)

    if args.check_libreoffice:
        return 0 if check_libreoffice_installation() else 1

    # Run the parsing test
    try:
        success = test_office_document_parsing(args.file)
        return 0 if success else 1
    except KeyboardInterrupt:
        print("\n⏹️ Test interrupted by user")
        return 1
    except Exception as e:
        # Top-level boundary: report anything unexpected as a failing exit code.
        print(f"\n❌ Unexpected error: {str(e)}")
        return 1


if __name__ == "__main__":
    # Use main()'s return value as the process exit status.
    raise SystemExit(main())
'commit' 3 weeks ago			`#!/usr/bin/env python3`
			`"""`
'commit' 3 weeks ago			`Office Document Parsing Test Script for RAG-Anything`
'commit' 3 weeks ago
'commit' 3 weeks ago			`This script demonstrates how to parse various Office document formats`
			`using MinerU, including DOC, DOCX, PPT, PPTX, XLS, and XLSX files.`
'commit' 3 weeks ago
'commit' 3 weeks ago			`Requirements:`
			`- LibreOffice installed on the system`
			`- RAG-Anything package`
'commit' 3 weeks ago
'commit' 3 weeks ago			`Usage:`
			`python office_document_test.py --file path/to/office/document.docx`
'commit' 3 weeks ago			`"""`

			`import argparse`
			`import sys`
			`from pathlib import Path`
'commit' 3 weeks ago			`from raganything import RAGAnything`
'commit' 3 weeks ago

			`def check_libreoffice_installation():`
'commit' 3 weeks ago			`"""Check if LibreOffice is installed and available"""`
'commit' 3 weeks ago			`import subprocess`

			`for cmd in ["libreoffice", "soffice"]:`
			`try:`
			`result = subprocess.run(`
'commit' 3 weeks ago			`[cmd, "--version"], capture_output=True, check=True, timeout=10`
'commit' 3 weeks ago			`)`
'commit' 3 weeks ago			`print(f"✅ LibreOffice found: {result.stdout.decode().strip()}")`
'commit' 3 weeks ago			`return True`
			`except (`
			`subprocess.CalledProcessError,`
			`FileNotFoundError,`
			`subprocess.TimeoutExpired,`
			`):`
			`continue`

'commit' 3 weeks ago			`print("❌ LibreOffice not found. Please install LibreOffice:")`
			`print(" - Windows: Download from https://www.libreoffice.org/download/download/")`
'commit' 3 weeks ago			`print(" - macOS: brew install --cask libreoffice")`
			`print(" - Ubuntu/Debian: sudo apt-get install libreoffice")`
			`print(" - CentOS/RHEL: sudo yum install libreoffice")`
			`return False`


			`def test_office_document_parsing(file_path: str):`
'commit' 3 weeks ago			`"""Test Office document parsing with MinerU"""`

			`print(f"🧪 Testing Office document parsing: {file_path}")`

			`# Check if file exists and is a supported Office format`
'commit' 3 weeks ago			`file_path = Path(file_path)`
			`if not file_path.exists():`
'commit' 3 weeks ago			`print(f"❌ File does not exist: {file_path}")`
'commit' 3 weeks ago			`return False`

			`supported_extensions = {".doc", ".docx", ".ppt", ".pptx", ".xls", ".xlsx"}`
			`if file_path.suffix.lower() not in supported_extensions:`
'commit' 3 weeks ago			`print(f"❌ Unsupported file format: {file_path.suffix}")`
			`print(f" Supported formats: {', '.join(supported_extensions)}")`
'commit' 3 weeks ago			`return False`

'commit' 3 weeks ago			`print(f"📄 File format: {file_path.suffix.upper()}")`
			`print(f"📏 File size: {file_path.stat().st_size / 1024:.1f} KB")`
'commit' 3 weeks ago
'commit' 3 weeks ago			`# Initialize RAGAnything (only for parsing functionality)`
			`rag = RAGAnything()`
'commit' 3 weeks ago
			`try:`
'commit' 3 weeks ago			`# Test document parsing with MinerU`
			`print("\n🔄 Testing document parsing with MinerU...")`
'commit' 3 weeks ago			`content_list, md_content = rag.parse_document(`
			`file_path=str(file_path),`
'commit' 3 weeks ago			`output_dir="./test_output",`
'commit' 3 weeks ago			`parse_method="auto",`
			`display_stats=True,`
			`)`
'commit' 3 weeks ago
			`print("✅ Parsing successful!")`
			`print(f" 📊 Content blocks: {len(content_list)}")`
			`print(f" 📝 Markdown length: {len(md_content)} characters")`

			`# Analyze content types`
'commit' 3 weeks ago			`content_types = {}`
			`for item in content_list:`
			`if isinstance(item, dict):`
			`content_type = item.get("type", "unknown")`
			`content_types[content_type] = content_types.get(content_type, 0) + 1`

			`if content_types:`
'commit' 3 weeks ago			`print(" 📋 Content distribution:")`
'commit' 3 weeks ago			`for content_type, count in sorted(content_types.items()):`
			`print(f" • {content_type}: {count}")`

'commit' 3 weeks ago			`# Display some parsed content preview`
'commit' 3 weeks ago			`if md_content.strip():`
'commit' 3 weeks ago			`print("\n📄 Parsed content preview (first 500 characters):")`
'commit' 3 weeks ago			`preview = md_content.strip()[:500]`
			`print(f" {preview}{'...' if len(md_content) > 500 else ''}")`

'commit' 3 weeks ago			`# Display some structured content examples`
'commit' 3 weeks ago			`text_items = [`
			`item`
			`for item in content_list`
			`if isinstance(item, dict) and item.get("type") == "text"`
			`]`
			`if text_items:`
'commit' 3 weeks ago			`print("\n📝 Sample text blocks:")`
'commit' 3 weeks ago			`for i, item in enumerate(text_items[:3], 1):`
			`text_content = item.get("text", "")`
			`if text_content.strip():`
			`preview = text_content.strip()[:200]`
			`print(`
			`f" {i}. {preview}{'...' if len(text_content) > 200 else ''}"`
			`)`

'commit' 3 weeks ago			`# Check for images`
'commit' 3 weeks ago			`image_items = [`
			`item`
			`for item in content_list`
			`if isinstance(item, dict) and item.get("type") == "image"`
			`]`
			`if image_items:`
'commit' 3 weeks ago			`print(f"\n🖼️ Found {len(image_items)} image(s):")`
'commit' 3 weeks ago			`for i, item in enumerate(image_items, 1):`
'commit' 3 weeks ago			`print(f" {i}. Image path: {item.get('img_path', 'N/A')}")`
'commit' 3 weeks ago
'commit' 3 weeks ago			`# Check for tables`
'commit' 3 weeks ago			`table_items = [`
			`item`
			`for item in content_list`
			`if isinstance(item, dict) and item.get("type") == "table"`
			`]`
			`if table_items:`
'commit' 3 weeks ago			`print(f"\n📊 Found {len(table_items)} table(s):")`
'commit' 3 weeks ago			`for i, item in enumerate(table_items, 1):`
			`table_body = item.get("table_body", "")`
			`row_count = len(table_body.split("\n"))`
'commit' 3 weeks ago			`print(f" {i}. Table with {row_count} rows")`
'commit' 3 weeks ago
'commit' 3 weeks ago			`print("\n🎉 Office document parsing test completed successfully!")`
			`print("📁 Output files saved to: ./test_output")`
'commit' 3 weeks ago			`return True`

			`except Exception as e:`
'commit' 3 weeks ago			`print(f"\n❌ Office document parsing failed: {str(e)}")`
'commit' 3 weeks ago			`import traceback`

'commit' 3 weeks ago			`print(f" Full error: {traceback.format_exc()}")`
'commit' 3 weeks ago			`return False`


			`def main():`
'commit' 3 weeks ago			`file=r"D:\dsWork\dsProject\dsRagAnything\Txt\小学数学教学中的若干问题_MATH_1.docx"`
'commit' 3 weeks ago
'commit' 3 weeks ago			`# Run the parsing test`
'commit' 3 weeks ago			`try:`
'commit' 3 weeks ago			`success = test_office_document_parsing(file)`
'commit' 3 weeks ago			`return 0 if success else 1`
			`except KeyboardInterrupt:`
'commit' 3 weeks ago			`print("\n⏹️ Test interrupted by user")`
'commit' 3 weeks ago			`return 1`
			`except Exception as e:`
'commit' 3 weeks ago			`print(f"\n❌ Unexpected error: {str(e)}")`
'commit' 3 weeks ago			`return 1`


			`if __name__ == "__main__":`
'commit' 3 weeks ago			`sys.exit(main())`