#!/usr/bin/env python3
"""
Office Document Parsing Test Script for RAG-Anything

This script demonstrates how to parse various Office document formats using
MinerU, including DOC, DOCX, PPT, PPTX, XLS, and XLSX files.

Requirements:
- LibreOffice installed on the system
- RAG-Anything package

Usage:
    python office_document_test.py --file path/to/office/document.docx
    python office_document_test.py --check-libreoffice
"""

import argparse
import subprocess
import sys
import traceback
from collections import Counter
from pathlib import Path

from raganything import RAGAnything

# NOTE(review): the status-prefix glyphs in the messages below look like UTF-8
# emoji that were decoded with a legacy code page. They are kept byte-for-byte
# here; confirm the intended characters before repairing them.

# File suffixes (lower-case, with leading dot) accepted by the parsing test.
SUPPORTED_EXTENSIONS = frozenset({".doc", ".docx", ".ppt", ".pptx", ".xls", ".xlsx"})


def check_libreoffice_installation() -> bool:
    """Check if LibreOffice is installed and available.

    Tries ``libreoffice --version`` and then ``soffice --version``; the first
    command that exits successfully within 10 seconds wins.

    Returns:
        True if one of the commands ran successfully (its version banner is
        printed), False otherwise (installation hints are printed).
    """
    for cmd in ("libreoffice", "soffice"):
        try:
            result = subprocess.run(
                [cmd, "--version"], capture_output=True, check=True, timeout=10
            )
        except (
            subprocess.CalledProcessError,
            OSError,  # includes FileNotFoundError and non-executable binaries
            subprocess.TimeoutExpired,
        ):
            continue

        # Never let an odd byte in the version banner abort the check.
        version = result.stdout.decode(errors="replace").strip()
        print(f"โœ… LibreOffice found: {version}")
        return True

    print("โŒ LibreOffice not found. Please install LibreOffice:")
    print(" - Windows: Download from https://www.libreoffice.org/download/download/")
    print(" - macOS: brew install --cask libreoffice")
    print(" - Ubuntu/Debian: sudo apt-get install libreoffice")
    print(" - CentOS/RHEL: sudo yum install libreoffice")
    return False


def _items_of_type(content_list, content_type: str) -> list:
    """Return the dict entries of *content_list* whose "type" equals *content_type*."""
    return [
        item
        for item in content_list
        if isinstance(item, dict) and item.get("type") == content_type
    ]


def test_office_document_parsing(file_path: str) -> bool:
    """Test Office document parsing with MinerU.

    Validates that *file_path* exists and has a supported Office suffix, runs
    ``RAGAnything.parse_document`` on it, and prints a summary of the parsed
    content (block counts per type, a markdown preview, sample text blocks,
    image paths and table sizes).

    Args:
        file_path: Path to the Office document to parse.

    Returns:
        True if the document was parsed and summarised, False if the file is
        missing, has an unsupported suffix, or parsing raised an exception.
    """
    print(f"๐Ÿงช Testing Office document parsing: {file_path}")

    # Check if file exists and is a supported Office format
    file_path = Path(file_path)
    if not file_path.exists():
        print(f"โŒ File does not exist: {file_path}")
        return False

    if file_path.suffix.lower() not in SUPPORTED_EXTENSIONS:
        print(f"โŒ Unsupported file format: {file_path.suffix}")
        # Sorted so the message is stable across runs (set order is not).
        print(f" Supported formats: {', '.join(sorted(SUPPORTED_EXTENSIONS))}")
        return False

    print(f"๐Ÿ“„ File format: {file_path.suffix.upper()}")
    print(f"๐Ÿ“ File size: {file_path.stat().st_size / 1024:.1f} KB")

    # Initialize RAGAnything (only for parsing functionality)
    rag = RAGAnything(working_dir="./temp_parsing_test")

    try:
        # Test document parsing with MinerU
        print("\n๐Ÿ”„ Testing document parsing with MinerU...")
        content_list, md_content = rag.parse_document(
            file_path=str(file_path),
            output_dir="./test_output",
            parse_method="auto",
            display_stats=True,
        )

        print("โœ… Parsing successful!")
        print(f" ๐Ÿ“Š Content blocks: {len(content_list)}")
        print(f" ๐Ÿ“ Markdown length: {len(md_content)} characters")

        # Analyze content types
        content_types = Counter(
            item.get("type", "unknown")
            for item in content_list
            if isinstance(item, dict)
        )
        if content_types:
            print(" ๐Ÿ“‹ Content distribution:")
            for content_type, count in sorted(content_types.items()):
                print(f" โ€ข {content_type}: {count}")

        # Display some parsed content preview
        if md_content.strip():
            print("\n๐Ÿ“„ Parsed content preview (first 500 characters):")
            preview = md_content.strip()[:500]
            print(f" {preview}{'...' if len(md_content) > 500 else ''}")

        # Display some structured content examples
        text_items = _items_of_type(content_list, "text")
        if text_items:
            print("\n๐Ÿ“ Sample text blocks:")
            for i, item in enumerate(text_items[:3], 1):
                text_content = item.get("text", "")
                if text_content.strip():
                    preview = text_content.strip()[:200]
                    print(
                        f" {i}. {preview}{'...' if len(text_content) > 200 else ''}"
                    )

        # Check for images
        image_items = _items_of_type(content_list, "image")
        if image_items:
            print(f"\n๐Ÿ–ผ๏ธ Found {len(image_items)} image(s):")
            for i, item in enumerate(image_items, 1):
                print(f" {i}. Image path: {item.get('img_path', 'N/A')}")

        # Check for tables
        table_items = _items_of_type(content_list, "table")
        if table_items:
            print(f"\n๐Ÿ“Š Found {len(table_items)} table(s):")
            for i, item in enumerate(table_items, 1):
                table_body = item.get("table_body", "")
                # An empty or missing body has zero rows; splitlines() also
                # avoids counting a phantom row after a trailing newline.
                row_count = len(str(table_body).splitlines()) if table_body else 0
                print(f" {i}. Table with {row_count} rows")

        print("\n๐ŸŽ‰ Office document parsing test completed successfully!")
        print("๐Ÿ“ Output files saved to: ./test_output")
        return True

    except Exception as e:
        # Top-level boundary of the test: report the failure with the full
        # traceback instead of crashing the script.
        print(f"\nโŒ Office document parsing failed: {str(e)}")
        print(f" Full error: {traceback.format_exc()}")
        return False


def main() -> int:
    """Main function.

    Parses the command line, verifies the LibreOffice installation and, unless
    ``--check-libreoffice`` was given, runs the parsing test on ``--file``.

    Returns:
        Process exit code: 0 on success, 1 on any failure or interruption.
    """
    parser = argparse.ArgumentParser(
        description="Test Office document parsing with MinerU"
    )
    # Not marked required=True so that --check-libreoffice can be used alone;
    # the requirement is enforced below for the parsing mode.
    parser.add_argument("--file", help="Path to the Office document to test")
    parser.add_argument(
        "--check-libreoffice",
        action="store_true",
        help="Only check LibreOffice installation",
    )

    args = parser.parse_args()

    if not args.check_libreoffice and not args.file:
        # Same usage message / exit status (2) argparse gives for a missing
        # required argument.
        parser.error("--file is required unless --check-libreoffice is given")

    # Check LibreOffice installation
    print("๐Ÿ”ง Checking LibreOffice installation...")
    if not check_libreoffice_installation():
        return 1

    if args.check_libreoffice:
        print("โœ… LibreOffice installation check passed!")
        return 0

    # Run the parsing test
    try:
        success = test_office_document_parsing(args.file)
        return 0 if success else 1
    except KeyboardInterrupt:
        print("\nโน๏ธ Test interrupted by user")
        return 1
    except Exception as e:
        print(f"\nโŒ Unexpected error: {str(e)}")
        return 1


if __name__ == "__main__":
    sys.exit(main())