'commit'

3 weeks ago · 3a9fbad0ff
parent c1dd5684a1
commit 3a9fbad0ff
13 changed files with 113 additions and 72 deletions
--- a/dsRagAnything/Doc/文档.txt
+++ b/dsRagAnything/Doc/文档.txt
@@ -8,4 +8,33 @@ conda create -n raganything python=3.10
 conda activate raganything
 # 下一步需要测试的库
-https://github.com/HKUDS/VideoRAG
+https://github.com/HKUDS/VideoRAG
 # 添加到PATH
 C:\Program Files\LibreOffice\program
 # Office document parsing test (MinerU only)
 python examples/office_document_test.py --file path/to/document.docx
 # Check LibreOffice installation
 python examples/office_document_test.py --check-libreoffice --file dummy
 # End-to-end processing
 python examples/raganything_example.py path/to/document.pdf --api-key YOUR_API_KEY
 # Direct modal processing
 python examples/modalprocessors_example.py --api-key YOUR_API_KEY
 # Image format parsing test (MinerU only)
 python examples/image_format_test.py --file path/to/image.bmp
 # Text format parsing test (MinerU only)
 python examples/text_format_test.py --file path/to/document.md
 # Check PIL/Pillow installation
 python examples/image_format_test.py --check-pillow --file dummy
 # Check ReportLab installation
 python examples/text_format_test.py --check-reportlab --file dummy
--- a/dsRagAnything/Tools/T1_Office_document_test.py
+++ b/dsRagAnything/Tools/T1_Office_document_test.py
@@ -1,16 +1,18 @@
 #!/usr/bin/env python3
 """
-Office Document Parsing Test Script for RAG-Anything
+Office文档解析测试脚本 - RAG-Anything项目
-This script demonstrates how to parse various Office document formats
+本脚本演示如何使用MinerU解析各种Office文档格式，包括：
-using MinerU, including DOC, DOCX, PPT, PPTX, XLS, and XLSX files.
+- DOC/DOCX (Word文档)
 - PPT/PPTX (PowerPoint演示文稿)
 - XLS/XLSX (Excel电子表格)
-Requirements:
+要求：
- LibreOffice installed on the system
+1. 系统已安装LibreOffice
- RAG-Anything package
+2. 已安装RAG-Anything包
-Usage:
+使用方法：
-    python office_document_test.py --file path/to/office/document.docx
+    python office_document_test.py --file 办公文档路径.docx
 """
 import argparse
@@ -20,15 +22,24 @@ from raganything import RAGAnything
 def check_libreoffice_installation():
-    """Check if LibreOffice is installed and available"""
+    """
    检查LibreOffice是否已安装并可用
    返回:
        bool: 如果LibreOffice可用返回True，否则返回False
    """
    import subprocess
    # 尝试不同的LibreOffice命令名称
    for cmd in ["libreoffice", "soffice"]:
        try:
            result = subprocess.run(
-                [cmd, "--version"], capture_output=True, check=True, timeout=10
+                [cmd, "--version"], 
                capture_output=True, 
                check=True, 
                timeout=10
            )
-            print(f"✅ LibreOffice found: {result.stdout.decode().strip()}")
+            print(f"✅ 找到LibreOffice: {result.stdout.decode().strip()}")
            return True
        except (
            subprocess.CalledProcessError,
@@ -37,8 +48,9 @@ def check_libreoffice_installation():
        ):
            continue
-    print("❌ LibreOffice not found. Please install LibreOffice:")
+    # 如果未找到LibreOffice，显示安装指南
-    print("  - Windows: Download from https://www.libreoffice.org/download/download/")
+    print("❌ 未找到LibreOffice. 请安装LibreOffice:")
    print("  - Windows: 从 https://www.libreoffice.org/download/download/ 下载")
    print("  - macOS: brew install --cask libreoffice")
    print("  - Ubuntu/Debian: sudo apt-get install libreoffice")
    print("  - CentOS/RHEL: sudo yum install libreoffice")
@@ -46,31 +58,42 @@ def check_libreoffice_installation():
 def test_office_document_parsing(file_path: str):
-    """Test Office document parsing with MinerU"""
+    """
-
+    测试Office文档解析功能
-    print(f"🧪 Testing Office document parsing: {file_path}")
+    
-
+    参数:
-    # Check if file exists and is a supported Office format
+        file_path (str): 要测试的Office文档路径
    返回:
        bool: 解析成功返回True，否则返回False
    """
    print(f"🧪 测试Office文档解析: {file_path}")
    # 检查文件是否存在且是支持的Office格式
    file_path = Path(file_path)
    if not file_path.exists():
-        print(f"❌ File does not exist: {file_path}")
+        print(f"❌ 文件不存在: {file_path}")
        return False
    # 支持的文档扩展名列表
    supported_extensions = {".doc", ".docx", ".ppt", ".pptx", ".xls", ".xlsx"}
    if file_path.suffix.lower() not in supported_extensions:
-        print(f"❌ Unsupported file format: {file_path.suffix}")
+        print(f"❌ 不支持的文档格式: {file_path.suffix}")
-        print(f"   Supported formats: {', '.join(supported_extensions)}")
+        print(f"   支持的格式: {', '.join(supported_extensions)}")
        return False
-    print(f"📄 File format: {file_path.suffix.upper()}")
+    # 显示文档基本信息
-    print(f"📏 File size: {file_path.stat().st_size / 1024:.1f} KB")
+    print(f"📄 文档格式: {file_path.suffix.upper()}")
    print(f"📏 文件大小: {file_path.stat().st_size / 1024:.1f} KB")
-    # Initialize RAGAnything (only for parsing functionality)
+    # 初始化RAGAnything(仅用于解析功能)
-    rag = RAGAnything(working_dir="./temp_parsing_test")
+    from raganything.config import RAGAnythingConfig
    config = RAGAnythingConfig(working_dir="./temp_parsing_test")
    rag = RAGAnything(config=config)
    try:
-        # Test document parsing with MinerU
+        # 使用MinerU测试文档解析
-        print("\n🔄 Testing document parsing with MinerU...")
+        print("\n🔄 使用MinerU测试文档解析...")
        content_list, md_content = rag.parse_document(
            file_path=str(file_path),
            output_dir="./test_output",
@@ -78,11 +101,11 @@ def test_office_document_parsing(file_path: str):
            display_stats=True,
        )
-        print("✅ Parsing successful!")
+        print("✅ 解析成功!")
-        print(f"   📊 Content blocks: {len(content_list)}")
+        print(f"   📊 内容块数量: {len(content_list)}")
-        print(f"   📝 Markdown length: {len(md_content)} characters")
+        print(f"   📝 Markdown长度: {len(md_content)} 字符")
-        # Analyze content types
+        # 分析内容类型分布
        content_types = {}
        for item in content_list:
            if isinstance(item, dict):
@@ -90,24 +113,24 @@ def test_office_document_parsing(file_path: str):
                content_types[content_type] = content_types.get(content_type, 0) + 1
        if content_types:
-            print("   📋 Content distribution:")
+            print("   📋 内容类型分布:")
            for content_type, count in sorted(content_types.items()):
                print(f"      • {content_type}: {count}")
-        # Display some parsed content preview
+        # 显示解析内容预览
        if md_content.strip():
-            print("\n📄 Parsed content preview (first 500 characters):")
+            print("\n📄 解析内容预览(前500字符):")
            preview = md_content.strip()[:500]
            print(f"   {preview}{'...' if len(md_content) > 500 else ''}")
-        # Display some structured content examples
+        # 显示文本块示例
        text_items = [
            item
            for item in content_list
            if isinstance(item, dict) and item.get("type") == "text"
        ]
        if text_items:
-            print("\n📝 Sample text blocks:")
+            print("\n📝 文本块示例:")
            for i, item in enumerate(text_items[:3], 1):
                text_content = item.get("text", "")
                if text_content.strip():
@@ -116,76 +139,65 @@ def test_office_document_parsing(file_path: str):
                        f"   {i}. {preview}{'...' if len(text_content) > 200 else ''}"
                    )
-        # Check for images
+        # 检查图片内容
        image_items = [
            item
            for item in content_list
            if isinstance(item, dict) and item.get("type") == "image"
        ]
        if image_items:
-            print(f"\n🖼️  Found {len(image_items)} image(s):")
+            print(f"\n🖼️  找到 {len(image_items)} 张图片:")
            for i, item in enumerate(image_items, 1):
-                print(f"   {i}. Image path: {item.get('img_path', 'N/A')}")
+                print(f"   {i}. 图片路径: {item.get('img_path', 'N/A')}")
-        # Check for tables
+        # 检查表格内容
        table_items = [
            item
            for item in content_list
            if isinstance(item, dict) and item.get("type") == "table"
        ]
        if table_items:
-            print(f"\n📊 Found {len(table_items)} table(s):")
+            print(f"\n📊 找到 {len(table_items)} 个表格:")
            for i, item in enumerate(table_items, 1):
                table_body = item.get("table_body", "")
                row_count = len(table_body.split("\n"))
-                print(f"   {i}. Table with {row_count} rows")
+                print(f"   {i}. 包含 {row_count} 行的表格")
-        print("\n🎉 Office document parsing test completed successfully!")
+        print("\n🎉 Office文档解析测试成功完成!")
-        print("📁 Output files saved to: ./test_output")
+        print("📁 输出文件保存到: ./test_output")
        return True
    except Exception as e:
-        print(f"\n❌ Office document parsing failed: {str(e)}")
+        print(f"\n❌ Office文档解析失败: {str(e)}")
        import traceback
-        print(f"   Full error: {traceback.format_exc()}")
+        print(f"   完整错误: {traceback.format_exc()}")
        return False
 def main():
-    """Main function"""
+    """
-    parser = argparse.ArgumentParser(
+    主函数
-        description="Test Office document parsing with MinerU"
+    
-    )
+    处理命令行参数并执行测试
-    parser.add_argument(
+    """
-        "--file", required=True, help="Path to the Office document to test"
+    # 固定文档路径
-    )
+    file_path = r"../Txt/小学数学教学中的若干问题_MATH_1.docx"
-    parser.add_argument(
+    
-        "--check-libreoffice",
+    # 检查LibreOffice安装
-        action="store_true",
+    print("🔧 检查LibreOffice安装状态...")
        help="Only check LibreOffice installation",
    )
    args = parser.parse_args()
    # Check LibreOffice installation
    print("🔧 Checking LibreOffice installation...")
    if not check_libreoffice_installation():
        return 1
-    if args.check_libreoffice:
+    # 运行解析测试
        print("✅ LibreOffice installation check passed!")
        return 0
    # Run the parsing test
    try:
-        success = test_office_document_parsing(args.file)
+        success = test_office_document_parsing(file_path)
        return 0 if success else 1
    except KeyboardInterrupt:
-        print("\n⏹️ Test interrupted by user")
+        print("\n⏹️ 测试被用户中断")
        return 1
    except Exception as e:
-        print(f"\n❌ Unexpected error: {str(e)}")
+        print(f"\n❌ 发生意外错误: {str(e)}")
        return 1
--- a/dsRagAnything/Txt/小学数学教学中的若干问题_MATH_1.docx
+++ b/dsRagAnything/Txt/小学数学教学中的若干问题_MATH_1.docx
--- a/dsRagAnything/raganything/__pycache__/__init__.cpython-310.pyc
+++ b/dsRagAnything/raganything/__pycache__/__init__.cpython-310.pyc
--- a/dsRagAnything/raganything/__pycache__/batch.cpython-310.pyc
+++ b/dsRagAnything/raganything/__pycache__/batch.cpython-310.pyc
--- a/dsRagAnything/raganything/__pycache__/config.cpython-310.pyc
+++ b/dsRagAnything/raganything/__pycache__/config.cpython-310.pyc
--- a/dsRagAnything/raganything/__pycache__/mineru_parser.cpython-310.pyc
+++ b/dsRagAnything/raganything/__pycache__/mineru_parser.cpython-310.pyc
--- a/dsRagAnything/raganything/__pycache__/modalprocessors.cpython-310.pyc
+++ b/dsRagAnything/raganything/__pycache__/modalprocessors.cpython-310.pyc
--- a/dsRagAnything/raganything/__pycache__/processor.cpython-310.pyc
+++ b/dsRagAnything/raganything/__pycache__/processor.cpython-310.pyc
--- a/dsRagAnything/raganything/__pycache__/prompt.cpython-310.pyc
+++ b/dsRagAnything/raganything/__pycache__/prompt.cpython-310.pyc
--- a/dsRagAnything/raganything/__pycache__/query.cpython-310.pyc
+++ b/dsRagAnything/raganything/__pycache__/query.cpython-310.pyc
--- a/dsRagAnything/raganything/__pycache__/raganything.cpython-310.pyc
+++ b/dsRagAnything/raganything/__pycache__/raganything.cpython-310.pyc
--- a/dsRagAnything/raganything/__pycache__/utils.cpython-310.pyc
+++ b/dsRagAnything/raganything/__pycache__/utils.cpython-310.pyc