#!/usr/bin/env python3
# Provenance: reconstructed from commit c1dd5684a1c14ee9a6c66832e9c1e90cccae320d
# (HuangHai, Fri, 4 Jul 2025 20:06:24 +0800, subject 'commit'), which added
# this script as dsRagAnything/Tools/T1_Office_document_test.py, created an
# empty dsRagAnything/Tools/__init__.py, and added a "next library to test"
# heading above the https://github.com/HKUDS/VideoRAG link in
# dsRagAnything/Doc/文档.txt.
"""
Office Document Parsing Test Script for RAG-Anything

This script demonstrates how to parse various Office document formats
using MinerU, including DOC, DOCX, PPT, PPTX, XLS, and XLSX files.

Requirements:
- LibreOffice installed on the system
- RAG-Anything package

Usage:
    python T1_Office_document_test.py --file path/to/office/document.docx
    python T1_Office_document_test.py --check-libreoffice
"""

import argparse
import re
import subprocess
import sys
import traceback
from collections import Counter
from pathlib import Path

# Import lazily-tolerant: the LibreOffice check must stay usable even when the
# RAG-Anything package is not installed yet. The parsing test reports the
# missing package explicitly instead of the whole script dying at import time.
try:
    from raganything import RAGAnything
except ImportError as _import_error:
    RAGAnything = None
    _RAGANYTHING_IMPORT_ERROR = _import_error
else:
    _RAGANYTHING_IMPORT_ERROR = None

# Office formats this script accepts (compared against the lower-cased suffix).
SUPPORTED_EXTENSIONS = frozenset(
    {".doc", ".docx", ".ppt", ".pptx", ".xls", ".xlsx"}
)

# Matches an opening <tr> tag; used to count rows of HTML table bodies.
_HTML_ROW_PATTERN = re.compile(r"<tr[\s>]", re.IGNORECASE)


def check_libreoffice_installation():
    """Check if LibreOffice is installed and available.

    Tries the ``libreoffice`` and ``soffice`` executables in turn by running
    ``<cmd> --version`` with a 10 second timeout.

    Returns:
        True as soon as one of the commands succeeds (its version string is
        printed); False after printing installation hints when none does.
    """
    for cmd in ("libreoffice", "soffice"):
        try:
            result = subprocess.run(
                [cmd, "--version"], capture_output=True, check=True, timeout=10
            )
        except (
            subprocess.CalledProcessError,
            # OSError covers FileNotFoundError (not installed) as well as
            # PermissionError (present on PATH but not executable).
            OSError,
            subprocess.TimeoutExpired,
        ):
            continue

        # errors="replace": the version banner is informational only, so a
        # non-UTF-8 byte must not turn a successful check into a crash.
        version = result.stdout.decode(errors="replace").strip()
        print(f"✅ LibreOffice found: {version}")
        return True

    print("❌ LibreOffice not found. Please install LibreOffice:")
    print(" - Windows: Download from https://www.libreoffice.org/download/download/")
    print(" - macOS: brew install --cask libreoffice")
    print(" - Ubuntu/Debian: sudo apt-get install libreoffice")
    print(" - CentOS/RHEL: sudo yum install libreoffice")
    return False


def _preview(text, limit):
    """Return *text* stripped and cut to *limit* chars, with '...' if cut."""
    stripped = text.strip()
    # Compare the stripped length: the ellipsis must reflect what was actually
    # cut from the preview, not surrounding whitespace that was never shown.
    suffix = "..." if len(stripped) > limit else ""
    return f"{stripped[:limit]}{suffix}"


def _blocks_of_type(blocks, block_type):
    """Return the dict entries of *blocks* whose "type" equals *block_type*."""
    return [block for block in blocks if block.get("type") == block_type]


def _count_table_rows(table_body):
    """Count the rows of a table body given as HTML or newline-separated text.

    An empty or missing body has 0 rows. Bodies containing ``<tr>`` tags are
    counted by tag; anything else is counted by line.
    """
    if not table_body:
        return 0
    html_rows = _HTML_ROW_PATTERN.findall(table_body)
    if html_rows:
        return len(html_rows)
    return len(table_body.splitlines())


def _report_parse_results(content_list, md_content):
    """Print a summary of the parsed content blocks and markdown text."""
    print("✅ Parsing successful!")
    print(f" 📊 Content blocks: {len(content_list)}")
    print(f" 📝 Markdown length: {len(md_content)} characters")

    # Only dict entries carry a "type"; anything else is ignored below.
    blocks = [item for item in content_list if isinstance(item, dict)]

    # Analyze content types
    type_counts = Counter(block.get("type", "unknown") for block in blocks)
    if type_counts:
        print(" 📋 Content distribution:")
        for content_type, count in sorted(type_counts.items()):
            print(f" • {content_type}: {count}")

    # Display some parsed content preview
    if md_content.strip():
        print("\n📄 Parsed content preview (first 500 characters):")
        print(f" {_preview(md_content, 500)}")

    # Display some structured content examples
    text_items = _blocks_of_type(blocks, "text")
    if text_items:
        print("\n📝 Sample text blocks:")
        for i, item in enumerate(text_items[:3], 1):
            text_content = item.get("text") or ""
            if text_content.strip():
                print(f" {i}. {_preview(text_content, 200)}")

    # Check for images
    image_items = _blocks_of_type(blocks, "image")
    if image_items:
        print(f"\n🖼️ Found {len(image_items)} image(s):")
        for i, item in enumerate(image_items, 1):
            print(f" {i}. Image path: {item.get('img_path', 'N/A')}")

    # Check for tables
    table_items = _blocks_of_type(blocks, "table")
    if table_items:
        print(f"\n📊 Found {len(table_items)} table(s):")
        for i, item in enumerate(table_items, 1):
            row_count = _count_table_rows(item.get("table_body"))
            print(f" {i}. Table with {row_count} rows")


def test_office_document_parsing(file_path: str):
    """Test Office document parsing with MinerU.

    Args:
        file_path: Path to the Office document to parse.

    Returns:
        True when the document was parsed and summarized; False when the file
        is missing, has an unsupported extension, the RAG-Anything package is
        unavailable, or parsing raised an exception (the traceback is printed).
    """
    print(f"🧪 Testing Office document parsing: {file_path}")

    # Check if file exists and is a supported Office format
    document = Path(file_path)
    if not document.exists():
        print(f"❌ File does not exist: {document}")
        return False

    if document.suffix.lower() not in SUPPORTED_EXTENSIONS:
        print(f"❌ Unsupported file format: {document.suffix}")
        # Sorted so the message is stable between runs (set order is not).
        print(f" Supported formats: {', '.join(sorted(SUPPORTED_EXTENSIONS))}")
        return False

    print(f"📄 File format: {document.suffix.upper()}")
    print(f"📏 File size: {document.stat().st_size / 1024:.1f} KB")

    if RAGAnything is None:
        print(
            "❌ RAG-Anything package is not available: "
            f"{_RAGANYTHING_IMPORT_ERROR}"
        )
        return False

    # Initialize RAGAnything (only for parsing functionality)
    rag = RAGAnything(working_dir="./temp_parsing_test")

    try:
        # Test document parsing with MinerU
        print("\n🔄 Testing document parsing with MinerU...")
        content_list, md_content = rag.parse_document(
            file_path=str(document),
            output_dir="./test_output",
            parse_method="auto",
            display_stats=True,
        )

        _report_parse_results(content_list, md_content)

        print("\n🎉 Office document parsing test completed successfully!")
        print("📁 Output files saved to: ./test_output")
        return True

    except Exception as e:
        # Top-level boundary of the test run: report everything, never raise.
        print(f"\n❌ Office document parsing failed: {str(e)}")
        print(f" Full error: {traceback.format_exc()}")
        return False


# The name matches pytest's ``test_*`` pattern, but this is a CLI helper that
# needs a real file path; opt out of collection so pytest does not try to
# resolve ``file_path`` as a fixture.
test_office_document_parsing.__test__ = False


def main(argv=None):
    """Command-line entry point.

    Args:
        argv: Argument list to parse; defaults to ``sys.argv[1:]`` when None.

    Returns:
        Process exit code: 0 on success, 1 on any failure. Invalid command
        lines exit through argparse with status 2.
    """
    parser = argparse.ArgumentParser(
        description="Test Office document parsing with MinerU"
    )
    # Not ``required=True``: --check-libreoffice must work without a file.
    parser.add_argument("--file", help="Path to the Office document to test")
    parser.add_argument(
        "--check-libreoffice",
        action="store_true",
        help="Only check LibreOffice installation",
    )

    args = parser.parse_args(argv)
    if not args.check_libreoffice and not args.file:
        parser.error("--file is required unless --check-libreoffice is given")

    # Check LibreOffice installation
    print("🔧 Checking LibreOffice installation...")
    if not check_libreoffice_installation():
        return 1

    if args.check_libreoffice:
        print("✅ LibreOffice installation check passed!")
        return 0

    # Run the parsing test
    try:
        success = test_office_document_parsing(args.file)
        return 0 if success else 1
    except KeyboardInterrupt:
        print("\n⏹️ Test interrupted by user")
        return 1
    except Exception as e:
        print(f"\n❌ Unexpected error: {str(e)}")
        return 1


if __name__ == "__main__":
    sys.exit(main())