'commit'

3 weeks ago · 9f5f1480dc
parent 682cad6910
commit 9f5f1480dc
4 changed files with 183 additions and 253 deletions
--- a/dsRagAnything/Doc/文档.txt
+++ b/dsRagAnything/Doc/文档.txt
@@ -38,3 +38,6 @@ python examples/image_format_test.py --check-pillow --file dummy

 # Check ReportLab installation
 python examples/text_format_test.py --check-reportlab --file dummy
+
+# MinerU
+https://github.com/opendatalab/MinerU/blob/master/README_zh-CN.md
--- a/dsRagAnything/Tools/T1_Office_document_test.py
+++ b/dsRagAnything/Tools/T1_Office_document_test.py
@@ -1,47 +1,34 @@
 #!/usr/bin/env python3
 """
-Office文档解析测试脚本 - RAG-Anything项目
+Office Document Parsing Test Script for RAG-Anything

-本脚本演示如何使用MinerU解析各种Office文档格式，包括：
- DOC/DOCX (Word文档)
- PPT/PPTX (PowerPoint演示文稿)
- XLS/XLSX (Excel电子表格)
+This script demonstrates how to parse various Office document formats
+using MinerU, including DOC, DOCX, PPT, PPTX, XLS, and XLSX files.

-要求：
-1. 系统已安装LibreOffice
-2. 已安装RAG-Anything包
+Requirements:
+- LibreOffice installed on the system
+- RAG-Anything package

-使用方法：
-    python office_document_test.py --file 办公文档路径.docx
+Usage:
+    python office_document_test.py --file path/to/office/document.docx
 """

 import argparse
 import sys
 from pathlib import Path
-from raganything import RAGAnything, RAGAnythingConfig
+from raganything import RAGAnything


 def check_libreoffice_installation():
-    """
-    检查LibreOffice是否已安装并可用
-    
-    返回:
-        bool: 如果LibreOffice可用返回True，否则返回False
-    """
-
-    """
+    """Check if LibreOffice is installed and available"""
    import subprocess

-    # 尝试不同的LibreOffice命令名称
    for cmd in ["libreoffice", "soffice"]:
        try:
            result = subprocess.run(
-                [cmd, "--version"], 
-                capture_output=True, 
-                check=True, 
-                timeout=10
+                [cmd, "--version"], capture_output=True, check=True, timeout=10
            )
-            print(f"✅ 找到LibreOffice: {result.stdout.decode().strip()}")
+            print(f"✅ LibreOffice found: {result.stdout.decode().strip()}")
            return True
        except (
            subprocess.CalledProcessError,
@@ -50,97 +37,52 @@ def check_libreoffice_installation():
        ):
            continue

-    # 如果未找到LibreOffice，显示安装指南
-    print("❌ 未找到LibreOffice. 请安装LibreOffice:")
-    print("  - Windows: 从 https://www.libreoffice.org/download/download/ 下载")
+    print("❌ LibreOffice not found. Please install LibreOffice:")
+    print("  - Windows: Download from https://www.libreoffice.org/download/download/")
    print("  - macOS: brew install --cask libreoffice")
    print("  - Ubuntu/Debian: sudo apt-get install libreoffice")
    print("  - CentOS/RHEL: sudo yum install libreoffice")
    return False
-    """
-    return True


 def test_office_document_parsing(file_path: str):
-    """
-    测试Office文档解析功能
+    """Test Office document parsing with MinerU"""

-    参数:
-        file_path (str): 要测试的Office文档路径
+    print(f"🧪 Testing Office document parsing: {file_path}")

-    返回:
-        bool: 解析成功返回True，否则返回False
-    """
-    # 在test_office_document_parsing函数中添加
-    import os
-    os.environ["LIBREOFFICE_PATH"] = "C:\\Program Files\\LibreOffice\\program"
-    print(f"🧪 测试Office文档解析: {file_path}")
-
-    # 检查文件是否存在且是支持的Office格式
+    # Check if file exists and is a supported Office format
    file_path = Path(file_path)
    if not file_path.exists():
-        print(f"❌ 文件不存在: {file_path}")
+        print(f"❌ File does not exist: {file_path}")
        return False

-    # 支持的文档扩展名列表
    supported_extensions = {".doc", ".docx", ".ppt", ".pptx", ".xls", ".xlsx"}
    if file_path.suffix.lower() not in supported_extensions:
-        print(f"❌ 不支持的文档格式: {file_path.suffix}")
-        print(f"   支持的格式: {', '.join(supported_extensions)}")
+        print(f"❌ Unsupported file format: {file_path.suffix}")
+        print(f"   Supported formats: {', '.join(supported_extensions)}")
        return False

-    # 显示文档基本信息
-    print(f"📄 文档格式: {file_path.suffix.upper()}")
-    print(f"📏 文件大小: {file_path.stat().st_size / 1024:.1f} KB")
+    print(f"📄 File format: {file_path.suffix.upper()}")
+    print(f"📏 File size: {file_path.stat().st_size / 1024:.1f} KB")

-    # 初始化RAGAnything(仅用于解析功能)
-    from raganything.config import RAGAnythingConfig
-    config = RAGAnythingConfig(working_dir="./temp_parsing_test")
-    rag = RAGAnything(config=config)
+    # Initialize RAGAnything (only for parsing functionality)
+    rag = RAGAnything()

    try:
-        # 添加MinerU安装检查
-        from raganything.mineru_parser import MineruParser
-        if not MineruParser.check_installation():
-            print("❌ MinerU未正确安装")
-            return False
-            
-        # 确保output_dir已定义
-        output_dir = "./test_output"
-        Path(output_dir).mkdir(exist_ok=True)
-        
-        # 添加PDF转换检查
-        pdf_path = Path(output_dir) / f"{file_path.stem}.pdf"
-        print(f"PDF转换路径: {pdf_path}")
-        if pdf_path.exists():
-            print(f"✅ PDF已生成，大小: {pdf_path.stat().st_size}字节")
-        else:
-            print("❌ PDF转换失败")
-            
-        # 使用MinerU测试文档解析
-        print("\n🔄 使用MinerU测试文档解析...")
-        
-        # 使用绝对路径确保输出目录位置明确
-        output_dir = Path("d:/dsWork/dsProject/dsRagAnything/Tools/test_output").absolute()
-        output_dir.mkdir(exist_ok=True)
-        
+        # Test document parsing with MinerU
+        print("\n🔄 Testing document parsing with MinerU...")
        content_list, md_content = rag.parse_document(
            file_path=str(file_path),
-            output_dir=str(output_dir),
+            output_dir="./test_output",
            parse_method="auto",
            display_stats=True,
        )

-        # 检查输出目录内容
-        print(f"\n📂 输出目录内容({output_dir}):")
-        for f in output_dir.glob("*"):
-            print(f"   - {f.name}")
-        
-        print("✅ 解析成功!")
-        print(f"   📊 内容块数量: {len(content_list)}")
-        print(f"   📝 Markdown长度: {len(md_content)} 字符")
+        print("✅ Parsing successful!")
+        print(f"   📊 Content blocks: {len(content_list)}")
+        print(f"   📝 Markdown length: {len(md_content)} characters")

-        # 分析内容类型分布
+        # Analyze content types
        content_types = {}
        for item in content_list:
            if isinstance(item, dict):
@@ -148,24 +90,24 @@ def test_office_document_parsing(file_path: str):
                content_types[content_type] = content_types.get(content_type, 0) + 1

        if content_types:
-            print("   📋 内容类型分布:")
+            print("   📋 Content distribution:")
            for content_type, count in sorted(content_types.items()):
                print(f"      • {content_type}: {count}")

-        # 显示解析内容预览
+        # Display some parsed content preview
        if md_content.strip():
-            print("\n📄 解析内容预览(前500字符):")
+            print("\n📄 Parsed content preview (first 500 characters):")
            preview = md_content.strip()[:500]
            print(f"   {preview}{'...' if len(md_content) > 500 else ''}")

-        # 显示文本块示例
+        # Display some structured content examples
        text_items = [
            item
            for item in content_list
            if isinstance(item, dict) and item.get("type") == "text"
        ]
        if text_items:
-            print("\n📝 文本块示例:")
+            print("\n📝 Sample text blocks:")
            for i, item in enumerate(text_items[:3], 1):
                text_content = item.get("text", "")
                if text_content.strip():
@@ -174,73 +116,56 @@ def test_office_document_parsing(file_path: str):
                        f"   {i}. {preview}{'...' if len(text_content) > 200 else ''}"
                    )

-        # 检查图片内容
+        # Check for images
        image_items = [
            item
            for item in content_list
            if isinstance(item, dict) and item.get("type") == "image"
        ]
        if image_items:
-            print(f"\n🖼️  找到 {len(image_items)} 张图片:")
+            print(f"\n🖼️  Found {len(image_items)} image(s):")
            for i, item in enumerate(image_items, 1):
-                print(f"   {i}. 图片路径: {item.get('img_path', 'N/A')}")
+                print(f"   {i}. Image path: {item.get('img_path', 'N/A')}")

-        # 检查表格内容
+        # Check for tables
        table_items = [
            item
            for item in content_list
            if isinstance(item, dict) and item.get("type") == "table"
        ]
        if table_items:
-            print(f"\n📊 找到 {len(table_items)} 个表格:")
+            print(f"\n📊 Found {len(table_items)} table(s):")
            for i, item in enumerate(table_items, 1):
                table_body = item.get("table_body", "")
                row_count = len(table_body.split("\n"))
-                print(f"   {i}. 包含 {row_count} 行的表格")
+                print(f"   {i}. Table with {row_count} rows")

-        print("\n🎉 Office文档解析测试成功完成!")
-        print("📁 输出文件保存到: ./test_output")
+        print("\n🎉 Office document parsing test completed successfully!")
+        print("📁 Output files saved to: ./test_output")
        return True

-        # 在test_office_document_parsing函数中添加
-        pdf_path = Path(output_dir) / f"{file_path.stem}.pdf"
-        print(f"PDF文件内容预览(前100字符): {pdf_path.read_text()[:100]}")
-
    except Exception as e:
-        print(f"\n❌ Office文档解析失败: {str(e)}")
+        print(f"\n❌ Office document parsing failed: {str(e)}")
        import traceback

-        print(f"   完整错误: {traceback.format_exc()}")
+        print(f"   Full error: {traceback.format_exc()}")
        return False


 def main():
-    """
-    主函数
-    
-    处理命令行参数并执行测试
-    """
-    # 固定文档路径
-    file_path = r"../Txt/小学数学教学中的若干问题_MATH_1.docx"
+    file=r"D:\dsWork\dsProject\dsRagAnything\Txt\小学数学教学中的若干问题_MATH_1.docx"

-    # 检查LibreOffice安装
-    print("🔧 检查LibreOffice安装状态...")
-    if not check_libreoffice_installation():
-        return 1
-
-    # 运行解析测试
+    # Run the parsing test
    try:
-        success = test_office_document_parsing(file_path)
+        success = test_office_document_parsing(file)
        return 0 if success else 1
    except KeyboardInterrupt:
-        print("\n⏹️ 测试被用户中断")
+        print("\n⏹️ Test interrupted by user")
        return 1
    except Exception as e:
-        print(f"\n❌ 发生意外错误: {str(e)}")
+        print(f"\n❌ Unexpected error: {str(e)}")
        return 1


 if __name__ == "__main__":
    sys.exit(main())
-
-
--- a/dsRagAnything/examples/office_document_test.py
+++ b/dsRagAnything/examples/office_document_test.py
@@ -19,31 +19,6 @@ from pathlib import Path
 from raganything import RAGAnything


-def check_libreoffice_installation():
-    """Check if LibreOffice is installed and available"""
-    import subprocess
-
-    for cmd in ["libreoffice", "soffice"]:
-        try:
-            result = subprocess.run(
-                [cmd, "--version"], capture_output=True, check=True, timeout=10
-            )
-            print(f"✅ LibreOffice found: {result.stdout.decode().strip()}")
-            return True
-        except (
-            subprocess.CalledProcessError,
-            FileNotFoundError,
-            subprocess.TimeoutExpired,
-        ):
-            continue
-
-    print("❌ LibreOffice not found. Please install LibreOffice:")
-    print("  - Windows: Download from https://www.libreoffice.org/download/download/")
-    print("  - macOS: brew install --cask libreoffice")
-    print("  - Ubuntu/Debian: sudo apt-get install libreoffice")
-    print("  - CentOS/RHEL: sudo yum install libreoffice")
-    return False
-

 def test_office_document_parsing(file_path: str):
    """Test Office document parsing with MinerU"""
@@ -168,10 +143,7 @@ def main():

    args = parser.parse_args()

-    # Check LibreOffice installation
-    print("🔧 Checking LibreOffice installation...")
-    if not check_libreoffice_installation():
-        return 1
+

    if args.check_libreoffice:
        print("✅ LibreOffice installation check passed!")
--- a/dsRagAnything/raganything/mineru_parser.py
+++ b/dsRagAnything/raganything/mineru_parser.py
@@ -77,6 +77,34 @@ class MineruParser:
            device: Inference device
            source: Model source
        """
+        # 【黄海】 MinerU需要下载模型，可以从国内的源下载：
+        # https://github.com/opendatalab/MinerU/blob/master/README_zh-CN.md#本地部署
+        # mineru-models-download
+        # 居然在下载 OCR/paddleocr_torch/， 果然是个好东西！
+        """
+        (raganything) PS D:\dsWork\dsProject\dsRagAnything> mineru-models-download
+        Please select the model download source:  (huggingface, modelscope) [huggingface]: modelscope
+        Please select the model type to download:  (pipeline, vlm, all) [all]: all
+        Downloading all model from modelscope...
+        Downloading model: models/Layout/YOLO/doclayout_yolo_docstructbench_imgsz1280_2501.pt
+        Download failed: Missing dependencies for SOCKS support.
+        (raganything) PS D:\dsWork\dsProject\dsRagAnything> mineru-models-download
+        Please select the model download source:  (huggingface, modelscope) [huggingface]: modelscope
+        Please select the model type to download:  (pipeline, vlm, all) [all]: all
+        Downloading all model from modelscope...
+        Downloading model: models/Layout/YOLO/doclayout_yolo_docstructbench_imgsz1280_2501.pt
+        Downloading Model from https://www.modelscope.cn to directory: C:\Users\Administrator\.cache\modelscope\hub\models\OpenDataLab\PDF-Extract-Kit-1.0
+        2025-07-04 21:46:23,860 - modelscope - INFO - Got 1 files, start to download ...
+        Downloading [models/Layout/YOLO/doclayout_yolo_docstructbench_imgsz1280_2501.pt]: 100%|█| 37.9M/37.9M [00:02<00:00, 15.
+        Processing 1 items: 100%|███████████████████████████████████████████████████████████| 1.00/1.00 [00:02<00:00, 2.64s/it]
+        2025-07-04 21:46:26,507 - modelscope - INFO - Download model 'OpenDataLab/PDF-Extract-Kit-1.0' successfully.
+        2025-07-04 21:46:26,507 - modelscope - INFO - Creating symbolic link [C:\Users\Administrator\.cache\modelscope\hub\models\OpenDataLab\PDF-Extract-Kit-1.0].
+        Downloading model: models/MFD/YOLO/yolo_v8_ft.pt
+        Downloading Model from https://www.modelscope.cn to directory: C:\Users\Administrator\.cache\modelscope\hub\models\OpenDataLab\PDF-Extract-Kit-1.0
+        2025-07-04 21:46:29,616 - modelscope - INFO - Got 1 files, start to download ...
+        Processing 1 items:   0%|                                                                  | 0.00/1.00 [00:00<?, ?it/s]
+        Downloading [models/MFD/YOLO/yolo_v8_ft.pt]:  31%|██████████▌                       | 104M/334M [00:06<00:13, 17.7MB/s]
+        """
        cmd = [
            "mineru",
            "-p",
@@ -437,64 +465,64 @@ class MineruParser:
            if doc_path.suffix.lower() not in supported_office_formats:
                raise ValueError(f"Unsupported office format: {doc_path.suffix}")

-            # Check if LibreOffice is available
-            libreoffice_available = False
-            working_libreoffice_cmd = None
-            try:
-                result = subprocess.run(
-                    ["libreoffice", "--version"],
-                    capture_output=True,
-                    check=True,
-                    timeout=10,
-                    encoding="utf-8",
-                    errors="ignore",
-                )
-                libreoffice_available = True
-                working_libreoffice_cmd = "libreoffice"
-                print(f"LibreOffice detected: {result.stdout.strip()}")
-            except (
-                subprocess.CalledProcessError,
-                FileNotFoundError,
-                subprocess.TimeoutExpired,
-            ):
-                pass
-
-            # Try alternative commands for LibreOffice
-            if not libreoffice_available:
-                for cmd in ["soffice", "libreoffice"]:
-                    try:
-                        result = subprocess.run(
-                            [cmd, "--version"],
-                            capture_output=True,
-                            check=True,
-                            timeout=10,
-                            encoding="utf-8",
-                            errors="ignore",
-                        )
-                        libreoffice_available = True
-                        working_libreoffice_cmd = cmd
-                        print(
-                            f"LibreOffice detected with command '{cmd}': {result.stdout.strip()}"
-                        )
-                        break
-                    except (
-                        subprocess.CalledProcessError,
-                        FileNotFoundError,
-                        subprocess.TimeoutExpired,
-                    ):
-                        continue
-
-            if not libreoffice_available:
-                raise RuntimeError(
-                    "LibreOffice is required for Office document conversion but was not found.\n"
-                    "Please install LibreOffice:\n"
-                    "- Windows: Download from https://www.libreoffice.org/download/download/\n"
-                    "- macOS: brew install --cask libreoffice\n"
-                    "- Ubuntu/Debian: sudo apt-get install libreoffice\n"
-                    "- CentOS/RHEL: sudo yum install libreoffice\n"
-                    "Alternatively, convert the document to PDF manually.\n"
-                    "MinerU 2.0 no longer includes built-in Office document conversion."
-                )
+            # # Check if LibreOffice is available
+            # libreoffice_available = False
+            working_libreoffice_cmd = 'soffice'
+            # try:
+            #     result = subprocess.run(
+            #         ["libreoffice", "--version"],
+            #         capture_output=True,
+            #         check=True,
+            #         timeout=10,
+            #         encoding="utf-8",
+            #         errors="ignore",
+            #     )
+            #     libreoffice_available = True
+            #     working_libreoffice_cmd = "libreoffice"
+            #     print(f"LibreOffice detected: {result.stdout.strip()}")
+            # except (
+            #     subprocess.CalledProcessError,
+            #     FileNotFoundError,
+            #     subprocess.TimeoutExpired,
+            # ):
+            #     pass
+            #
+            # # Try alternative commands for LibreOffice
+            # if not libreoffice_available:
+            #     for cmd in ["soffice", "libreoffice"]:
+            #         try:
+            #             result = subprocess.run(
+            #                 [cmd, "--version"],
+            #                 capture_output=True,
+            #                 check=True,
+            #                 timeout=10,
+            #                 encoding="utf-8",
+            #                 errors="ignore",
+            #             )
+            #             libreoffice_available = True
+            #             working_libreoffice_cmd = cmd
+            #             print(
+            #                 f"LibreOffice detected with command '{cmd}': {result.stdout.strip()}"
+            #             )
+            #             break
+            #         except (
+            #             subprocess.CalledProcessError,
+            #             FileNotFoundError,
+            #             subprocess.TimeoutExpired,
+            #         ):
+            #             continue
+            #
+            # if not libreoffice_available:
+            #     raise RuntimeError(
+            #         "LibreOffice is required for Office document conversion but was not found.\n"
+            #         "Please install LibreOffice:\n"
+            #         "- Windows: Download from https://www.libreoffice.org/download/download/\n"
+            #         "- macOS: brew install --cask libreoffice\n"
+            #         "- Ubuntu/Debian: sudo apt-get install libreoffice\n"
+            #         "- CentOS/RHEL: sudo yum install libreoffice\n"
+            #         "Alternatively, convert the document to PDF manually.\n"
+            #         "MinerU 2.0 no longer includes built-in Office document conversion."
+            #     )

            # Create temporary directory for PDF conversion
            with tempfile.TemporaryDirectory() as temp_dir:
@@ -535,6 +563,7 @@ class MineruParser:
                        if result.returncode == 0:
                            conversion_successful = True
                            print(f"Successfully converted {doc_path.name} to PDF")
+                            print(convert_cmd)
                            break
                        else:
                            print(
@@ -572,6 +601,7 @@ class MineruParser:
                    )

                # Parse the converted PDF
+                # TODO
                return MineruParser.parse_pdf(
                    pdf_path=pdf_path, output_dir=output_dir, **kwargs
                )