diff --git a/dsRagAnything/Doc/文档.txt b/dsRagAnything/Doc/文档.txt index 98fb5cc0..866a995d 100644 --- a/dsRagAnything/Doc/文档.txt +++ b/dsRagAnything/Doc/文档.txt @@ -37,4 +37,7 @@ python examples/text_format_test.py --file path/to/document.md python examples/image_format_test.py --check-pillow --file dummy # Check ReportLab installation -python examples/text_format_test.py --check-reportlab --file dummy \ No newline at end of file +python examples/text_format_test.py --check-reportlab --file dummy + +# MinerU +https://github.com/opendatalab/MinerU/blob/master/README_zh-CN.md diff --git a/dsRagAnything/Tools/T1_Office_document_test.py b/dsRagAnything/Tools/T1_Office_document_test.py index 50476ba5..614da24b 100644 --- a/dsRagAnything/Tools/T1_Office_document_test.py +++ b/dsRagAnything/Tools/T1_Office_document_test.py @@ -1,47 +1,34 @@ #!/usr/bin/env python3 """ -Office文档解析测试脚本 - RAG-Anything项目 +Office Document Parsing Test Script for RAG-Anything -本脚本演示如何使用MinerU解析各种Office文档格式,包括: -- DOC/DOCX (Word文档) -- PPT/PPTX (PowerPoint演示文稿) -- XLS/XLSX (Excel电子表格) +This script demonstrates how to parse various Office document formats +using MinerU, including DOC, DOCX, PPT, PPTX, XLS, and XLSX files. -要求: -1. 系统已安装LibreOffice -2. 已安装RAG-Anything包 +Requirements: +- LibreOffice installed on the system +- RAG-Anything package -使用方法: - python office_document_test.py --file 办公文档路径.docx +Usage: + python office_document_test.py --file path/to/office/document.docx """ import argparse import sys from pathlib import Path -from raganything import RAGAnything, RAGAnythingConfig +from raganything import RAGAnything def check_libreoffice_installation(): - """ - 检查LibreOffice是否已安装并可用 - - 返回: - bool: 如果LibreOffice可用返回True,否则返回False - """ - - """ + """Check if LibreOffice is installed and available""" import subprocess - # 尝试不同的LibreOffice命令名称 for cmd in ["libreoffice", "soffice"]: try: result = subprocess.run( - [cmd, "--version"], - capture_output=True, - check=True, - timeout=10 + [cmd, "--version"], capture_output=True, check=True, timeout=10 ) - print(f"✅ 找到LibreOffice: {result.stdout.decode().strip()}") + print(f"✅ LibreOffice found: {result.stdout.decode().strip()}") return True except ( subprocess.CalledProcessError, @@ -50,97 +37,52 @@ def check_libreoffice_installation(): ): continue - # 如果未找到LibreOffice,显示安装指南 - print("❌ 未找到LibreOffice. 请安装LibreOffice:") - print(" - Windows: 从 https://www.libreoffice.org/download/download/ 下载") + print("❌ LibreOffice not found. Please install LibreOffice:") + print(" - Windows: Download from https://www.libreoffice.org/download/download/") print(" - macOS: brew install --cask libreoffice") print(" - Ubuntu/Debian: sudo apt-get install libreoffice") print(" - CentOS/RHEL: sudo yum install libreoffice") return False - """ - return True def test_office_document_parsing(file_path: str): - """ - 测试Office文档解析功能 - - 参数: - file_path (str): 要测试的Office文档路径 - - 返回: - bool: 解析成功返回True,否则返回False - """ - # 在test_office_document_parsing函数中添加 - import os - os.environ["LIBREOFFICE_PATH"] = "C:\\Program Files\\LibreOffice\\program" - print(f"🧪 测试Office文档解析: {file_path}") - - # 检查文件是否存在且是支持的Office格式 + """Test Office document parsing with MinerU""" + + print(f"🧪 Testing Office document parsing: {file_path}") + + # Check if file exists and is a supported Office format file_path = Path(file_path) if not file_path.exists(): - print(f"❌ 文件不存在: {file_path}") + print(f"❌ File does not exist: {file_path}") return False - # 支持的文档扩展名列表 supported_extensions = {".doc", ".docx", ".ppt", ".pptx", ".xls", ".xlsx"} if file_path.suffix.lower() not in supported_extensions: - print(f"❌ 不支持的文档格式: {file_path.suffix}") - print(f" 支持的格式: {', '.join(supported_extensions)}") + print(f"❌ Unsupported file format: {file_path.suffix}") + print(f" Supported formats: {', '.join(supported_extensions)}") return False - # 显示文档基本信息 - print(f"📄 文档格式: {file_path.suffix.upper()}") - print(f"📏 文件大小: {file_path.stat().st_size / 1024:.1f} KB") + print(f"📄 File format: {file_path.suffix.upper()}") + print(f"📏 File size: {file_path.stat().st_size / 1024:.1f} KB") - # 初始化RAGAnything(仅用于解析功能) - from raganything.config import RAGAnythingConfig - config = RAGAnythingConfig(working_dir="./temp_parsing_test") - rag = RAGAnything(config=config) + # Initialize RAGAnything (only for parsing functionality) + rag = RAGAnything() try: - # 添加MinerU安装检查 - from raganything.mineru_parser import MineruParser - if not MineruParser.check_installation(): - print("❌ MinerU未正确安装") - return False - - # 确保output_dir已定义 - output_dir = "./test_output" - Path(output_dir).mkdir(exist_ok=True) - - # 添加PDF转换检查 - pdf_path = Path(output_dir) / f"{file_path.stem}.pdf" - print(f"PDF转换路径: {pdf_path}") - if pdf_path.exists(): - print(f"✅ PDF已生成,大小: {pdf_path.stat().st_size}字节") - else: - print("❌ PDF转换失败") - - # 使用MinerU测试文档解析 - print("\n🔄 使用MinerU测试文档解析...") - - # 使用绝对路径确保输出目录位置明确 - output_dir = Path("d:/dsWork/dsProject/dsRagAnything/Tools/test_output").absolute() - output_dir.mkdir(exist_ok=True) - + # Test document parsing with MinerU + print("\n🔄 Testing document parsing with MinerU...") content_list, md_content = rag.parse_document( file_path=str(file_path), - output_dir=str(output_dir), + output_dir="./test_output", parse_method="auto", display_stats=True, ) - - # 检查输出目录内容 - print(f"\n📂 输出目录内容({output_dir}):") - for f in output_dir.glob("*"): - print(f" - {f.name}") - - print("✅ 解析成功!") - print(f" 📊 内容块数量: {len(content_list)}") - print(f" 📝 Markdown长度: {len(md_content)} 字符") - - # 分析内容类型分布 + + print("✅ Parsing successful!") + print(f" 📊 Content blocks: {len(content_list)}") + print(f" 📝 Markdown length: {len(md_content)} characters") + + # Analyze content types content_types = {} for item in content_list: if isinstance(item, dict): @@ -148,24 +90,24 @@ def test_office_document_parsing(file_path: str): content_types[content_type] = content_types.get(content_type, 0) + 1 if content_types: - print(" 📋 内容类型分布:") + print(" 📋 Content distribution:") for content_type, count in sorted(content_types.items()): print(f" • {content_type}: {count}") - # 显示解析内容预览 + # Display some parsed content preview if md_content.strip(): - print("\n📄 解析内容预览(前500字符):") + print("\n📄 Parsed content preview (first 500 characters):") preview = md_content.strip()[:500] print(f" {preview}{'...' if len(md_content) > 500 else ''}") - # 显示文本块示例 + # Display some structured content examples text_items = [ item for item in content_list if isinstance(item, dict) and item.get("type") == "text" ] if text_items: - print("\n📝 文本块示例:") + print("\n📝 Sample text blocks:") for i, item in enumerate(text_items[:3], 1): text_content = item.get("text", "") if text_content.strip(): @@ -174,73 +116,56 @@ def test_office_document_parsing(file_path: str): f" {i}. {preview}{'...' if len(text_content) > 200 else ''}" ) - # 检查图片内容 + # Check for images image_items = [ item for item in content_list if isinstance(item, dict) and item.get("type") == "image" ] if image_items: - print(f"\n🖼️ 找到 {len(image_items)} 张图片:") + print(f"\n🖼️ Found {len(image_items)} image(s):") for i, item in enumerate(image_items, 1): - print(f" {i}. 图片路径: {item.get('img_path', 'N/A')}") + print(f" {i}. Image path: {item.get('img_path', 'N/A')}") - # 检查表格内容 + # Check for tables table_items = [ item for item in content_list if isinstance(item, dict) and item.get("type") == "table" ] if table_items: - print(f"\n📊 找到 {len(table_items)} 个表格:") + print(f"\n📊 Found {len(table_items)} table(s):") for i, item in enumerate(table_items, 1): table_body = item.get("table_body", "") row_count = len(table_body.split("\n")) - print(f" {i}. 包含 {row_count} 行的表格") + print(f" {i}. Table with {row_count} rows") - print("\n🎉 Office文档解析测试成功完成!") - print("📁 输出文件保存到: ./test_output") + print("\n🎉 Office document parsing test completed successfully!") + print("📁 Output files saved to: ./test_output") return True - # 在test_office_document_parsing函数中添加 - pdf_path = Path(output_dir) / f"{file_path.stem}.pdf" - print(f"PDF文件内容预览(前100字符): {pdf_path.read_text()[:100]}") - except Exception as e: - print(f"\n❌ Office文档解析失败: {str(e)}") + print(f"\n❌ Office document parsing failed: {str(e)}") import traceback - print(f" 完整错误: {traceback.format_exc()}") + print(f" Full error: {traceback.format_exc()}") return False def main(): - """ - 主函数 - - 处理命令行参数并执行测试 - """ - # 固定文档路径 - file_path = r"../Txt/小学数学教学中的若干问题_MATH_1.docx" - - # 检查LibreOffice安装 - print("🔧 检查LibreOffice安装状态...") - if not check_libreoffice_installation(): - return 1 + file=r"D:\dsWork\dsProject\dsRagAnything\Txt\小学数学教学中的若干问题_MATH_1.docx" - # 运行解析测试 + # Run the parsing test try: - success = test_office_document_parsing(file_path) + success = test_office_document_parsing(file) return 0 if success else 1 except KeyboardInterrupt: - print("\n⏹️ 测试被用户中断") + print("\n⏹️ Test interrupted by user") return 1 except Exception as e: - print(f"\n❌ 发生意外错误: {str(e)}") + print(f"\n❌ Unexpected error: {str(e)}") return 1 if __name__ == "__main__": - sys.exit(main()) - - + sys.exit(main()) \ No newline at end of file diff --git a/dsRagAnything/examples/office_document_test.py b/dsRagAnything/examples/office_document_test.py index b46daab0..d330f72c 100644 --- a/dsRagAnything/examples/office_document_test.py +++ b/dsRagAnything/examples/office_document_test.py @@ -19,31 +19,6 @@ from pathlib import Path from raganything import RAGAnything -def check_libreoffice_installation(): - """Check if LibreOffice is installed and available""" - import subprocess - - for cmd in ["libreoffice", "soffice"]: - try: - result = subprocess.run( - [cmd, "--version"], capture_output=True, check=True, timeout=10 - ) - print(f"✅ LibreOffice found: {result.stdout.decode().strip()}") - return True - except ( - subprocess.CalledProcessError, - FileNotFoundError, - subprocess.TimeoutExpired, - ): - continue - - print("❌ LibreOffice not found. Please install LibreOffice:") - print(" - Windows: Download from https://www.libreoffice.org/download/download/") - print(" - macOS: brew install --cask libreoffice") - print(" - Ubuntu/Debian: sudo apt-get install libreoffice") - print(" - CentOS/RHEL: sudo yum install libreoffice") - return False - def test_office_document_parsing(file_path: str): """Test Office document parsing with MinerU""" @@ -168,10 +143,7 @@ def main(): args = parser.parse_args() - # Check LibreOffice installation - print("🔧 Checking LibreOffice installation...") - if not check_libreoffice_installation(): - return 1 + if args.check_libreoffice: print("✅ LibreOffice installation check passed!") diff --git a/dsRagAnything/raganything/mineru_parser.py b/dsRagAnything/raganything/mineru_parser.py index d1c60f39..d3d186a0 100644 --- a/dsRagAnything/raganything/mineru_parser.py +++ b/dsRagAnything/raganything/mineru_parser.py @@ -49,17 +49,17 @@ class MineruParser: @staticmethod def _run_mineru_command( - input_path: Union[str, Path], - output_dir: Union[str, Path], - method: str = "auto", - lang: Optional[str] = None, - backend: str = "pipeline", - start_page: Optional[int] = None, - end_page: Optional[int] = None, - formula: bool = True, - table: bool = True, - device: Optional[str] = None, - source: str = "huggingface", + input_path: Union[str, Path], + output_dir: Union[str, Path], + method: str = "auto", + lang: Optional[str] = None, + backend: str = "pipeline", + start_page: Optional[int] = None, + end_page: Optional[int] = None, + formula: bool = True, + table: bool = True, + device: Optional[str] = None, + source: str = "huggingface", ) -> None: """ Run mineru command line tool @@ -77,6 +77,34 @@ class MineruParser: device: Inference device source: Model source """ + # 【黄海】 MinerU需要下载模型,可以从国内的源下载: + # https://github.com/opendatalab/MinerU/blob/master/README_zh-CN.md#本地部署 + # mineru-models-download + # 居然在下载 OCR/paddleocr_torch/, 果然是个好东西! + """ + (raganything) PS D:\dsWork\dsProject\dsRagAnything> mineru-models-download + Please select the model download source: (huggingface, modelscope) [huggingface]: modelscope + Please select the model type to download: (pipeline, vlm, all) [all]: all + Downloading all model from modelscope... + Downloading model: models/Layout/YOLO/doclayout_yolo_docstructbench_imgsz1280_2501.pt + Download failed: Missing dependencies for SOCKS support. + (raganything) PS D:\dsWork\dsProject\dsRagAnything> mineru-models-download + Please select the model download source: (huggingface, modelscope) [huggingface]: modelscope + Please select the model type to download: (pipeline, vlm, all) [all]: all + Downloading all model from modelscope... + Downloading model: models/Layout/YOLO/doclayout_yolo_docstructbench_imgsz1280_2501.pt + Downloading Model from https://www.modelscope.cn to directory: C:\Users\Administrator\.cache\modelscope\hub\models\OpenDataLab\PDF-Extract-Kit-1.0 + 2025-07-04 21:46:23,860 - modelscope - INFO - Got 1 files, start to download ... + Downloading [models/Layout/YOLO/doclayout_yolo_docstructbench_imgsz1280_2501.pt]: 100%|█| 37.9M/37.9M [00:02<00:00, 15. + Processing 1 items: 100%|███████████████████████████████████████████████████████████| 1.00/1.00 [00:02<00:00, 2.64s/it] + 2025-07-04 21:46:26,507 - modelscope - INFO - Download model 'OpenDataLab/PDF-Extract-Kit-1.0' successfully. + 2025-07-04 21:46:26,507 - modelscope - INFO - Creating symbolic link [C:\Users\Administrator\.cache\modelscope\hub\models\OpenDataLab\PDF-Extract-Kit-1.0]. + Downloading model: models/MFD/YOLO/yolo_v8_ft.pt + Downloading Model from https://www.modelscope.cn to directory: C:\Users\Administrator\.cache\modelscope\hub\models\OpenDataLab\PDF-Extract-Kit-1.0 + 2025-07-04 21:46:29,616 - modelscope - INFO - Got 1 files, start to download ... + Processing 1 items: 0%| | 0.00/1.00 [00:00 Tuple[List[Dict[str, Any]], str]: """ Read the output files generated by mineru @@ -197,11 +225,11 @@ class MineruParser: @staticmethod def parse_pdf( - pdf_path: Union[str, Path], - output_dir: Optional[str] = None, - method: str = "auto", - lang: Optional[str] = None, - **kwargs, + pdf_path: Union[str, Path], + output_dir: Optional[str] = None, + method: str = "auto", + lang: Optional[str] = None, + **kwargs, ) -> Tuple[List[Dict[str, Any]], str]: """ Parse PDF document using MinerU 2.0 @@ -254,10 +282,10 @@ class MineruParser: @staticmethod def parse_image( - image_path: Union[str, Path], - output_dir: Optional[str] = None, - lang: Optional[str] = None, - **kwargs, + image_path: Union[str, Path], + output_dir: Optional[str] = None, + lang: Optional[str] = None, + **kwargs, ) -> Tuple[List[Dict[str, Any]], str]: """ Parse image document using MinerU 2.0 @@ -402,7 +430,7 @@ class MineruParser: @staticmethod def parse_office_doc( - doc_path: Union[str, Path], output_dir: Optional[str] = None, **kwargs + doc_path: Union[str, Path], output_dir: Optional[str] = None, **kwargs ) -> Tuple[List[Dict[str, Any]], str]: """ Parse office document by first converting to PDF, then parsing with MinerU 2.0 @@ -437,64 +465,64 @@ class MineruParser: if doc_path.suffix.lower() not in supported_office_formats: raise ValueError(f"Unsupported office format: {doc_path.suffix}") - # Check if LibreOffice is available - libreoffice_available = False - working_libreoffice_cmd = None - try: - result = subprocess.run( - ["libreoffice", "--version"], - capture_output=True, - check=True, - timeout=10, - encoding="utf-8", - errors="ignore", - ) - libreoffice_available = True - working_libreoffice_cmd = "libreoffice" - print(f"LibreOffice detected: {result.stdout.strip()}") - except ( - subprocess.CalledProcessError, - FileNotFoundError, - subprocess.TimeoutExpired, - ): - pass - - # Try alternative commands for LibreOffice - if not libreoffice_available: - for cmd in ["soffice", "libreoffice"]: - try: - result = subprocess.run( - [cmd, "--version"], - capture_output=True, - check=True, - timeout=10, - encoding="utf-8", - errors="ignore", - ) - libreoffice_available = True - working_libreoffice_cmd = cmd - print( - f"LibreOffice detected with command '{cmd}': {result.stdout.strip()}" - ) - break - except ( - subprocess.CalledProcessError, - FileNotFoundError, - subprocess.TimeoutExpired, - ): - continue - - if not libreoffice_available: - raise RuntimeError( - "LibreOffice is required for Office document conversion but was not found.\n" - "Please install LibreOffice:\n" - "- Windows: Download from https://www.libreoffice.org/download/download/\n" - "- macOS: brew install --cask libreoffice\n" - "- Ubuntu/Debian: sudo apt-get install libreoffice\n" - "- CentOS/RHEL: sudo yum install libreoffice\n" - "Alternatively, convert the document to PDF manually.\n" - "MinerU 2.0 no longer includes built-in Office document conversion." - ) + # # Check if LibreOffice is available + # libreoffice_available = False + working_libreoffice_cmd = 'soffice' + # try: + # result = subprocess.run( + # ["libreoffice", "--version"], + # capture_output=True, + # check=True, + # timeout=10, + # encoding="utf-8", + # errors="ignore", + # ) + # libreoffice_available = True + # working_libreoffice_cmd = "libreoffice" + # print(f"LibreOffice detected: {result.stdout.strip()}") + # except ( + # subprocess.CalledProcessError, + # FileNotFoundError, + # subprocess.TimeoutExpired, + # ): + # pass + # + # # Try alternative commands for LibreOffice + # if not libreoffice_available: + # for cmd in ["soffice", "libreoffice"]: + # try: + # result = subprocess.run( + # [cmd, "--version"], + # capture_output=True, + # check=True, + # timeout=10, + # encoding="utf-8", + # errors="ignore", + # ) + # libreoffice_available = True + # working_libreoffice_cmd = cmd + # print( + # f"LibreOffice detected with command '{cmd}': {result.stdout.strip()}" + # ) + # break + # except ( + # subprocess.CalledProcessError, + # FileNotFoundError, + # subprocess.TimeoutExpired, + # ): + # continue + # + # if not libreoffice_available: + # raise RuntimeError( + # "LibreOffice is required for Office document conversion but was not found.\n" + # "Please install LibreOffice:\n" + # "- Windows: Download from https://www.libreoffice.org/download/download/\n" + # "- macOS: brew install --cask libreoffice\n" + # "- Ubuntu/Debian: sudo apt-get install libreoffice\n" + # "- CentOS/RHEL: sudo yum install libreoffice\n" + # "Alternatively, convert the document to PDF manually.\n" + # "MinerU 2.0 no longer includes built-in Office document conversion." + # ) # Create temporary directory for PDF conversion with tempfile.TemporaryDirectory() as temp_dir: @@ -535,6 +563,7 @@ class MineruParser: if result.returncode == 0: conversion_successful = True print(f"Successfully converted {doc_path.name} to PDF") + print(convert_cmd) break else: print( @@ -572,6 +601,7 @@ class MineruParser: ) # Parse the converted PDF + # TODO return MineruParser.parse_pdf( pdf_path=pdf_path, output_dir=output_dir, **kwargs ) @@ -582,7 +612,7 @@ class MineruParser: @staticmethod def parse_text_file( - text_path: Union[str, Path], output_dir: Optional[str] = None, **kwargs + text_path: Union[str, Path], output_dir: Optional[str] = None, **kwargs ) -> Tuple[List[Dict[str, Any]], str]: """ Parse text file by first converting to PDF, then parsing with MinerU 2.0 @@ -752,9 +782,9 @@ class MineruParser: # Handle tables if ( - "|" in line - and line.strip().startswith("|") - and line.strip().endswith("|") + "|" in line + and line.strip().startswith("|") + and line.strip().endswith("|") ): if not in_table: in_table = True @@ -766,15 +796,15 @@ class MineruParser: # End of table in_table = False if ( - len(table_lines) >= 2 + len(table_lines) >= 2 ): # Need at least header and separator try: # Parse table table_data = [] for table_line in table_lines: if ( - "---" in table_line - or "===" in table_line + "---" in table_line + or "===" in table_line ): continue # Skip separator line cells = [ @@ -1112,11 +1142,11 @@ class MineruParser: @staticmethod def parse_document( - file_path: Union[str, Path], - method: str = "auto", - output_dir: Optional[str] = None, - lang: Optional[str] = None, - **kwargs, + file_path: Union[str, Path], + method: str = "auto", + output_dir: Optional[str] = None, + lang: Optional[str] = None, + **kwargs, ) -> Tuple[List[Dict[str, Any]], str]: """ Parse document using MinerU 2.0 based on file extension