diff --git a/dsRagAnything/Doc/文档.txt b/dsRagAnything/Doc/文档.txt index 8773c66c..98fb5cc0 100644 --- a/dsRagAnything/Doc/文档.txt +++ b/dsRagAnything/Doc/文档.txt @@ -8,4 +8,33 @@ conda create -n raganything python=3.10 conda activate raganything # 下一步需要测试的库 -https://github.com/HKUDS/VideoRAG \ No newline at end of file +https://github.com/HKUDS/VideoRAG + +# 添加到PATH +C:\Program Files\LibreOffice\program + +# Office document parsing test (MinerU only) +python examples/office_document_test.py --file path/to/document.docx + +# Check LibreOffice installation +python examples/office_document_test.py --check-libreoffice --file dummy + + +# End-to-end processing +python examples/raganything_example.py path/to/document.pdf --api-key YOUR_API_KEY + +# Direct modal processing +python examples/modalprocessors_example.py --api-key YOUR_API_KEY + +# Image format parsing test (MinerU only) +python examples/image_format_test.py --file path/to/image.bmp + +# Text format parsing test (MinerU only) +python examples/text_format_test.py --file path/to/document.md + + +# Check PIL/Pillow installation +python examples/image_format_test.py --check-pillow --file dummy + +# Check ReportLab installation +python examples/text_format_test.py --check-reportlab --file dummy \ No newline at end of file diff --git a/dsRagAnything/Tools/T1_Office_document_test.py b/dsRagAnything/Tools/T1_Office_document_test.py index b46daab0..51b4871a 100644 --- a/dsRagAnything/Tools/T1_Office_document_test.py +++ b/dsRagAnything/Tools/T1_Office_document_test.py @@ -1,16 +1,18 @@ #!/usr/bin/env python3 """ -Office Document Parsing Test Script for RAG-Anything +Office文档解析测试脚本 - RAG-Anything项目 -This script demonstrates how to parse various Office document formats -using MinerU, including DOC, DOCX, PPT, PPTX, XLS, and XLSX files. +本脚本演示如何使用MinerU解析各种Office文档格式,包括: +- DOC/DOCX (Word文档) +- PPT/PPTX (PowerPoint演示文稿) +- XLS/XLSX (Excel电子表格) -Requirements: -- LibreOffice installed on the system -- RAG-Anything package +要求: +1. 系统已安装LibreOffice +2. 已安装RAG-Anything包 -Usage: - python office_document_test.py --file path/to/office/document.docx +使用方法: + python office_document_test.py --file 办公文档路径.docx """ import argparse @@ -20,15 +22,24 @@ from raganything import RAGAnything def check_libreoffice_installation(): - """Check if LibreOffice is installed and available""" + """ + 检查LibreOffice是否已安装并可用 + + 返回: + bool: 如果LibreOffice可用返回True,否则返回False + """ import subprocess + # 尝试不同的LibreOffice命令名称 for cmd in ["libreoffice", "soffice"]: try: result = subprocess.run( - [cmd, "--version"], capture_output=True, check=True, timeout=10 + [cmd, "--version"], + capture_output=True, + check=True, + timeout=10 ) - print(f"✅ LibreOffice found: {result.stdout.decode().strip()}") + print(f"✅ 找到LibreOffice: {result.stdout.decode().strip()}") return True except ( subprocess.CalledProcessError, @@ -37,8 +48,9 @@ def check_libreoffice_installation(): ): continue - print("❌ LibreOffice not found. Please install LibreOffice:") - print(" - Windows: Download from https://www.libreoffice.org/download/download/") + # 如果未找到LibreOffice,显示安装指南 + print("❌ 未找到LibreOffice. 请安装LibreOffice:") + print(" - Windows: 从 https://www.libreoffice.org/download/download/ 下载") print(" - macOS: brew install --cask libreoffice") print(" - Ubuntu/Debian: sudo apt-get install libreoffice") print(" - CentOS/RHEL: sudo yum install libreoffice") @@ -46,31 +58,42 @@ def check_libreoffice_installation(): def test_office_document_parsing(file_path: str): - """Test Office document parsing with MinerU""" - - print(f"🧪 Testing Office document parsing: {file_path}") - - # Check if file exists and is a supported Office format + """ + 测试Office文档解析功能 + + 参数: + file_path (str): 要测试的Office文档路径 + + 返回: + bool: 解析成功返回True,否则返回False + """ + print(f"🧪 测试Office文档解析: {file_path}") + + # 检查文件是否存在且是支持的Office格式 file_path = Path(file_path) if not file_path.exists(): - print(f"❌ File does not exist: {file_path}") + print(f"❌ 文件不存在: {file_path}") return False + # 支持的文档扩展名列表 supported_extensions = {".doc", ".docx", ".ppt", ".pptx", ".xls", ".xlsx"} if file_path.suffix.lower() not in supported_extensions: - print(f"❌ Unsupported file format: {file_path.suffix}") - print(f" Supported formats: {', '.join(supported_extensions)}") + print(f"❌ 不支持的文档格式: {file_path.suffix}") + print(f" 支持的格式: {', '.join(supported_extensions)}") return False - print(f"📄 File format: {file_path.suffix.upper()}") - print(f"📏 File size: {file_path.stat().st_size / 1024:.1f} KB") + # 显示文档基本信息 + print(f"📄 文档格式: {file_path.suffix.upper()}") + print(f"📏 文件大小: {file_path.stat().st_size / 1024:.1f} KB") - # Initialize RAGAnything (only for parsing functionality) - rag = RAGAnything(working_dir="./temp_parsing_test") + # 初始化RAGAnything(仅用于解析功能) + from raganything.config import RAGAnythingConfig + config = RAGAnythingConfig(working_dir="./temp_parsing_test") + rag = RAGAnything(config=config) try: - # Test document parsing with MinerU - print("\n🔄 Testing document parsing with MinerU...") + # 使用MinerU测试文档解析 + print("\n🔄 使用MinerU测试文档解析...") content_list, md_content = rag.parse_document( file_path=str(file_path), output_dir="./test_output", @@ -78,11 +101,11 @@ def test_office_document_parsing(file_path: str): display_stats=True, ) - print("✅ Parsing successful!") - print(f" 📊 Content blocks: {len(content_list)}") - print(f" 📝 Markdown length: {len(md_content)} characters") + print("✅ 解析成功!") + print(f" 📊 内容块数量: {len(content_list)}") + print(f" 📝 Markdown长度: {len(md_content)} 字符") - # Analyze content types + # 分析内容类型分布 content_types = {} for item in content_list: if isinstance(item, dict): @@ -90,24 +113,24 @@ def test_office_document_parsing(file_path: str): content_types[content_type] = content_types.get(content_type, 0) + 1 if content_types: - print(" 📋 Content distribution:") + print(" 📋 内容类型分布:") for content_type, count in sorted(content_types.items()): print(f" • {content_type}: {count}") - # Display some parsed content preview + # 显示解析内容预览 if md_content.strip(): - print("\n📄 Parsed content preview (first 500 characters):") + print("\n📄 解析内容预览(前500字符):") preview = md_content.strip()[:500] print(f" {preview}{'...' if len(md_content) > 500 else ''}") - # Display some structured content examples + # 显示文本块示例 text_items = [ item for item in content_list if isinstance(item, dict) and item.get("type") == "text" ] if text_items: - print("\n📝 Sample text blocks:") + print("\n📝 文本块示例:") for i, item in enumerate(text_items[:3], 1): text_content = item.get("text", "") if text_content.strip(): @@ -116,76 +139,65 @@ def test_office_document_parsing(file_path: str): f" {i}. {preview}{'...' if len(text_content) > 200 else ''}" ) - # Check for images + # 检查图片内容 image_items = [ item for item in content_list if isinstance(item, dict) and item.get("type") == "image" ] if image_items: - print(f"\n🖼️ Found {len(image_items)} image(s):") + print(f"\n🖼️ 找到 {len(image_items)} 张图片:") for i, item in enumerate(image_items, 1): - print(f" {i}. Image path: {item.get('img_path', 'N/A')}") + print(f" {i}. 图片路径: {item.get('img_path', 'N/A')}") - # Check for tables + # 检查表格内容 table_items = [ item for item in content_list if isinstance(item, dict) and item.get("type") == "table" ] if table_items: - print(f"\n📊 Found {len(table_items)} table(s):") + print(f"\n📊 找到 {len(table_items)} 个表格:") for i, item in enumerate(table_items, 1): table_body = item.get("table_body", "") row_count = len(table_body.split("\n")) - print(f" {i}. Table with {row_count} rows") + print(f" {i}. 包含 {row_count} 行的表格") - print("\n🎉 Office document parsing test completed successfully!") - print("📁 Output files saved to: ./test_output") + print("\n🎉 Office文档解析测试成功完成!") + print("📁 输出文件保存到: ./test_output") return True except Exception as e: - print(f"\n❌ Office document parsing failed: {str(e)}") + print(f"\n❌ Office文档解析失败: {str(e)}") import traceback - print(f" Full error: {traceback.format_exc()}") + print(f" 完整错误: {traceback.format_exc()}") return False def main(): - """Main function""" - parser = argparse.ArgumentParser( - description="Test Office document parsing with MinerU" - ) - parser.add_argument( - "--file", required=True, help="Path to the Office document to test" - ) - parser.add_argument( - "--check-libreoffice", - action="store_true", - help="Only check LibreOffice installation", - ) - - args = parser.parse_args() - - # Check LibreOffice installation - print("🔧 Checking LibreOffice installation...") + """ + 主函数 + + 处理命令行参数并执行测试 + """ + # 固定文档路径 + file_path = r"../Txt/小学数学教学中的若干问题_MATH_1.docx" + + # 检查LibreOffice安装 + print("🔧 检查LibreOffice安装状态...") if not check_libreoffice_installation(): return 1 - if args.check_libreoffice: - print("✅ LibreOffice installation check passed!") - return 0 - - # Run the parsing test + # 运行解析测试 try: - success = test_office_document_parsing(args.file) + success = test_office_document_parsing(file_path) return 0 if success else 1 except KeyboardInterrupt: - print("\n⏹️ Test interrupted by user") + print("\n⏹️ 测试被用户中断") return 1 except Exception as e: - print(f"\n❌ Unexpected error: {str(e)}") + print(f"\n❌ 发生意外错误: {str(e)}") return 1 diff --git a/dsRagAnything/Txt/小学数学教学中的若干问题_MATH_1.docx b/dsRagAnything/Txt/小学数学教学中的若干问题_MATH_1.docx new file mode 100644 index 00000000..f52918d8 Binary files /dev/null and b/dsRagAnything/Txt/小学数学教学中的若干问题_MATH_1.docx differ diff --git a/dsRagAnything/raganything/__pycache__/__init__.cpython-310.pyc b/dsRagAnything/raganything/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 00000000..e855b08b Binary files /dev/null and b/dsRagAnything/raganything/__pycache__/__init__.cpython-310.pyc differ diff --git a/dsRagAnything/raganything/__pycache__/batch.cpython-310.pyc b/dsRagAnything/raganything/__pycache__/batch.cpython-310.pyc new file mode 100644 index 00000000..e2f6e5ca Binary files /dev/null and b/dsRagAnything/raganything/__pycache__/batch.cpython-310.pyc differ diff --git a/dsRagAnything/raganything/__pycache__/config.cpython-310.pyc b/dsRagAnything/raganything/__pycache__/config.cpython-310.pyc new file mode 100644 index 00000000..2b765878 Binary files /dev/null and b/dsRagAnything/raganything/__pycache__/config.cpython-310.pyc differ diff --git a/dsRagAnything/raganything/__pycache__/mineru_parser.cpython-310.pyc b/dsRagAnything/raganything/__pycache__/mineru_parser.cpython-310.pyc new file mode 100644 index 00000000..bf4814c9 Binary files /dev/null and b/dsRagAnything/raganything/__pycache__/mineru_parser.cpython-310.pyc differ diff --git a/dsRagAnything/raganything/__pycache__/modalprocessors.cpython-310.pyc b/dsRagAnything/raganything/__pycache__/modalprocessors.cpython-310.pyc new file mode 100644 index 00000000..9fa6c465 Binary files /dev/null and b/dsRagAnything/raganything/__pycache__/modalprocessors.cpython-310.pyc differ diff --git a/dsRagAnything/raganything/__pycache__/processor.cpython-310.pyc b/dsRagAnything/raganything/__pycache__/processor.cpython-310.pyc new file mode 100644 index 00000000..1ac9e4dd Binary files /dev/null and b/dsRagAnything/raganything/__pycache__/processor.cpython-310.pyc differ diff --git a/dsRagAnything/raganything/__pycache__/prompt.cpython-310.pyc b/dsRagAnything/raganything/__pycache__/prompt.cpython-310.pyc new file mode 100644 index 00000000..9cc81158 Binary files /dev/null and b/dsRagAnything/raganything/__pycache__/prompt.cpython-310.pyc differ diff --git a/dsRagAnything/raganything/__pycache__/query.cpython-310.pyc b/dsRagAnything/raganything/__pycache__/query.cpython-310.pyc new file mode 100644 index 00000000..d7953517 Binary files /dev/null and b/dsRagAnything/raganything/__pycache__/query.cpython-310.pyc differ diff --git a/dsRagAnything/raganything/__pycache__/raganything.cpython-310.pyc b/dsRagAnything/raganything/__pycache__/raganything.cpython-310.pyc new file mode 100644 index 00000000..2a2fc74f Binary files /dev/null and b/dsRagAnything/raganything/__pycache__/raganything.cpython-310.pyc differ diff --git a/dsRagAnything/raganything/__pycache__/utils.cpython-310.pyc b/dsRagAnything/raganything/__pycache__/utils.cpython-310.pyc new file mode 100644 index 00000000..fef6ae70 Binary files /dev/null and b/dsRagAnything/raganything/__pycache__/utils.cpython-310.pyc differ