main
HuangHai 3 weeks ago
parent 682cad6910
commit 9f5f1480dc

@ -37,4 +37,7 @@ python examples/text_format_test.py --file path/to/document.md
python examples/image_format_test.py --check-pillow --file dummy
# Check ReportLab installation
python examples/text_format_test.py --check-reportlab --file dummy
python examples/text_format_test.py --check-reportlab --file dummy
# MinerU
https://github.com/opendatalab/MinerU/blob/master/README_zh-CN.md

@ -1,47 +1,34 @@
#!/usr/bin/env python3
"""
Office文档解析测试脚本 - RAG-Anything项目
Office Document Parsing Test Script for RAG-Anything
本脚本演示如何使用MinerU解析各种Office文档格式包括
- DOC/DOCX (Word文档)
- PPT/PPTX (PowerPoint演示文稿)
- XLS/XLSX (Excel电子表格)
This script demonstrates how to parse various Office document formats
using MinerU, including DOC, DOCX, PPT, PPTX, XLS, and XLSX files.
要求
1. 系统已安装LibreOffice
2. 已安装RAG-Anything包
Requirements:
- LibreOffice installed on the system
- RAG-Anything package
使用方法
python office_document_test.py --file 办公文档路径.docx
Usage:
python office_document_test.py --file path/to/office/document.docx
"""
import argparse
import sys
from pathlib import Path
from raganything import RAGAnything, RAGAnythingConfig
from raganything import RAGAnything
def check_libreoffice_installation():
"""
检查LibreOffice是否已安装并可用
返回:
bool: 如果LibreOffice可用返回True否则返回False
"""
"""
"""Check if LibreOffice is installed and available"""
import subprocess
# 尝试不同的LibreOffice命令名称
for cmd in ["libreoffice", "soffice"]:
try:
result = subprocess.run(
[cmd, "--version"],
capture_output=True,
check=True,
timeout=10
[cmd, "--version"], capture_output=True, check=True, timeout=10
)
print(f"找到LibreOffice: {result.stdout.decode().strip()}")
print(f"✅ LibreOffice found: {result.stdout.decode().strip()}")
return True
except (
subprocess.CalledProcessError,
@ -50,97 +37,52 @@ def check_libreoffice_installation():
):
continue
# 如果未找到LibreOffice显示安装指南
print("❌ 未找到LibreOffice. 请安装LibreOffice:")
print(" - Windows: 从 https://www.libreoffice.org/download/download/ 下载")
print("❌ LibreOffice not found. Please install LibreOffice:")
print(" - Windows: Download from https://www.libreoffice.org/download/download/")
print(" - macOS: brew install --cask libreoffice")
print(" - Ubuntu/Debian: sudo apt-get install libreoffice")
print(" - CentOS/RHEL: sudo yum install libreoffice")
return False
"""
return True
def test_office_document_parsing(file_path: str):
"""
测试Office文档解析功能
参数:
file_path (str): 要测试的Office文档路径
返回:
bool: 解析成功返回True否则返回False
"""
# 在test_office_document_parsing函数中添加
import os
os.environ["LIBREOFFICE_PATH"] = "C:\\Program Files\\LibreOffice\\program"
print(f"🧪 测试Office文档解析: {file_path}")
# 检查文件是否存在且是支持的Office格式
"""Test Office document parsing with MinerU"""
print(f"🧪 Testing Office document parsing: {file_path}")
# Check if file exists and is a supported Office format
file_path = Path(file_path)
if not file_path.exists():
print(f"文件不存在: {file_path}")
print(f"❌ File does not exist: {file_path}")
return False
# 支持的文档扩展名列表
supported_extensions = {".doc", ".docx", ".ppt", ".pptx", ".xls", ".xlsx"}
if file_path.suffix.lower() not in supported_extensions:
print(f"不支持的文档格式: {file_path.suffix}")
print(f" 支持的格式: {', '.join(supported_extensions)}")
print(f"❌ Unsupported file format: {file_path.suffix}")
print(f" Supported formats: {', '.join(supported_extensions)}")
return False
# 显示文档基本信息
print(f"📄 文档格式: {file_path.suffix.upper()}")
print(f"📏 文件大小: {file_path.stat().st_size / 1024:.1f} KB")
print(f"📄 File format: {file_path.suffix.upper()}")
print(f"📏 File size: {file_path.stat().st_size / 1024:.1f} KB")
# 初始化RAGAnything(仅用于解析功能)
from raganything.config import RAGAnythingConfig
config = RAGAnythingConfig(working_dir="./temp_parsing_test")
rag = RAGAnything(config=config)
# Initialize RAGAnything (only for parsing functionality)
rag = RAGAnything()
try:
# 添加MinerU安装检查
from raganything.mineru_parser import MineruParser
if not MineruParser.check_installation():
print("❌ MinerU未正确安装")
return False
# 确保output_dir已定义
output_dir = "./test_output"
Path(output_dir).mkdir(exist_ok=True)
# 添加PDF转换检查
pdf_path = Path(output_dir) / f"{file_path.stem}.pdf"
print(f"PDF转换路径: {pdf_path}")
if pdf_path.exists():
print(f"✅ PDF已生成大小: {pdf_path.stat().st_size}字节")
else:
print("❌ PDF转换失败")
# 使用MinerU测试文档解析
print("\n🔄 使用MinerU测试文档解析...")
# 使用绝对路径确保输出目录位置明确
output_dir = Path("d:/dsWork/dsProject/dsRagAnything/Tools/test_output").absolute()
output_dir.mkdir(exist_ok=True)
# Test document parsing with MinerU
print("\n🔄 Testing document parsing with MinerU...")
content_list, md_content = rag.parse_document(
file_path=str(file_path),
output_dir=str(output_dir),
output_dir="./test_output",
parse_method="auto",
display_stats=True,
)
# 检查输出目录内容
print(f"\n📂 输出目录内容({output_dir}):")
for f in output_dir.glob("*"):
print(f" - {f.name}")
print("✅ 解析成功!")
print(f" 📊 内容块数量: {len(content_list)}")
print(f" 📝 Markdown长度: {len(md_content)} 字符")
# 分析内容类型分布
print("✅ Parsing successful!")
print(f" 📊 Content blocks: {len(content_list)}")
print(f" 📝 Markdown length: {len(md_content)} characters")
# Analyze content types
content_types = {}
for item in content_list:
if isinstance(item, dict):
@ -148,24 +90,24 @@ def test_office_document_parsing(file_path: str):
content_types[content_type] = content_types.get(content_type, 0) + 1
if content_types:
print(" 📋 内容类型分布:")
print(" 📋 Content distribution:")
for content_type, count in sorted(content_types.items()):
print(f"{content_type}: {count}")
# 显示解析内容预览
# Display some parsed content preview
if md_content.strip():
print("\n📄 解析内容预览(前500字符):")
print("\n📄 Parsed content preview (first 500 characters):")
preview = md_content.strip()[:500]
print(f" {preview}{'...' if len(md_content) > 500 else ''}")
# 显示文本块示例
# Display some structured content examples
text_items = [
item
for item in content_list
if isinstance(item, dict) and item.get("type") == "text"
]
if text_items:
print("\n📝 文本块示例:")
print("\n📝 Sample text blocks:")
for i, item in enumerate(text_items[:3], 1):
text_content = item.get("text", "")
if text_content.strip():
@ -174,73 +116,56 @@ def test_office_document_parsing(file_path: str):
f" {i}. {preview}{'...' if len(text_content) > 200 else ''}"
)
# 检查图片内容
# Check for images
image_items = [
item
for item in content_list
if isinstance(item, dict) and item.get("type") == "image"
]
if image_items:
print(f"\n🖼️ 找到 {len(image_items)} 张图片:")
print(f"\n🖼️ Found {len(image_items)} image(s):")
for i, item in enumerate(image_items, 1):
print(f" {i}. 图片路径: {item.get('img_path', 'N/A')}")
print(f" {i}. Image path: {item.get('img_path', 'N/A')}")
# 检查表格内容
# Check for tables
table_items = [
item
for item in content_list
if isinstance(item, dict) and item.get("type") == "table"
]
if table_items:
print(f"\n📊 找到 {len(table_items)} 个表格:")
print(f"\n📊 Found {len(table_items)} table(s):")
for i, item in enumerate(table_items, 1):
table_body = item.get("table_body", "")
row_count = len(table_body.split("\n"))
print(f" {i}. 包含 {row_count} 行的表格")
print(f" {i}. Table with {row_count} rows")
print("\n🎉 Office文档解析测试成功完成!")
print("📁 输出文件保存到: ./test_output")
print("\n🎉 Office document parsing test completed successfully!")
print("📁 Output files saved to: ./test_output")
return True
# 在test_office_document_parsing函数中添加
pdf_path = Path(output_dir) / f"{file_path.stem}.pdf"
print(f"PDF文件内容预览(前100字符): {pdf_path.read_text()[:100]}")
except Exception as e:
print(f"\n❌ Office文档解析失败: {str(e)}")
print(f"\n❌ Office document parsing failed: {str(e)}")
import traceback
print(f" 完整错误: {traceback.format_exc()}")
print(f" Full error: {traceback.format_exc()}")
return False
def main():
"""
主函数
处理命令行参数并执行测试
"""
# 固定文档路径
file_path = r"../Txt/小学数学教学中的若干问题_MATH_1.docx"
# 检查LibreOffice安装
print("🔧 检查LibreOffice安装状态...")
if not check_libreoffice_installation():
return 1
file=r"D:\dsWork\dsProject\dsRagAnything\Txt\小学数学教学中的若干问题_MATH_1.docx"
# 运行解析测试
# Run the parsing test
try:
success = test_office_document_parsing(file_path)
success = test_office_document_parsing(file)
return 0 if success else 1
except KeyboardInterrupt:
print("\n⏹️ 测试被用户中断")
print("\n⏹️ Test interrupted by user")
return 1
except Exception as e:
print(f"\n发生意外错误: {str(e)}")
print(f"\n❌ Unexpected error: {str(e)}")
return 1
if __name__ == "__main__":
sys.exit(main())
sys.exit(main())

@ -19,31 +19,6 @@ from pathlib import Path
from raganything import RAGAnything
def check_libreoffice_installation():
    """Check whether a LibreOffice executable can be run from PATH.

    Runs ``<cmd> --version`` for ``libreoffice`` and then ``soffice``. The
    first command that exits successfully has its version banner printed.
    If neither works, per-platform installation hints are printed instead.

    Returns:
        bool: True if one of the commands ran successfully, False otherwise.
    """
    import subprocess

    for cmd in ("libreoffice", "soffice"):
        try:
            result = subprocess.run(
                [cmd, "--version"],
                capture_output=True,
                check=True,
                timeout=10,
                # Decode leniently: the banner is only echoed back, and
                # output in a non-UTF-8 console code page must not abort
                # the detection with a UnicodeDecodeError.
                encoding="utf-8",
                errors="ignore",
            )
        except (
            subprocess.CalledProcessError,
            subprocess.TimeoutExpired,
            # OSError covers FileNotFoundError as well as e.g. a
            # non-executable file that happens to carry the command name.
            OSError,
        ):
            continue
        print(f"✅ LibreOffice found: {result.stdout.strip()}")
        return True

    print("❌ LibreOffice not found. Please install LibreOffice:")
    print(" - Windows: Download from https://www.libreoffice.org/download/download/")
    print(" - macOS: brew install --cask libreoffice")
    print(" - Ubuntu/Debian: sudo apt-get install libreoffice")
    print(" - CentOS/RHEL: sudo yum install libreoffice")
    return False
def test_office_document_parsing(file_path: str):
"""Test Office document parsing with MinerU"""
@ -168,10 +143,7 @@ def main():
args = parser.parse_args()
# Check LibreOffice installation
print("🔧 Checking LibreOffice installation...")
if not check_libreoffice_installation():
return 1
if args.check_libreoffice:
print("✅ LibreOffice installation check passed!")

@ -49,17 +49,17 @@ class MineruParser:
@staticmethod
def _run_mineru_command(
input_path: Union[str, Path],
output_dir: Union[str, Path],
method: str = "auto",
lang: Optional[str] = None,
backend: str = "pipeline",
start_page: Optional[int] = None,
end_page: Optional[int] = None,
formula: bool = True,
table: bool = True,
device: Optional[str] = None,
source: str = "huggingface",
input_path: Union[str, Path],
output_dir: Union[str, Path],
method: str = "auto",
lang: Optional[str] = None,
backend: str = "pipeline",
start_page: Optional[int] = None,
end_page: Optional[int] = None,
formula: bool = True,
table: bool = True,
device: Optional[str] = None,
source: str = "huggingface",
) -> None:
"""
Run mineru command line tool
@ -77,6 +77,34 @@ class MineruParser:
device: Inference device
source: Model source
"""
# [HuangHai] MinerU has to download its models; they can be fetched from a
# source hosted in China.
# https://github.com/opendatalab/MinerU/blob/master/README_zh-CN.md#本地部署
# Command: mineru-models-download
# It even downloads OCR/paddleocr_torch/ -- a nice tool indeed!
#
# Sample session. Kept as comments rather than a bare string literal: inside a
# non-raw string the "\U" in "C:\Users" is an invalid escape sequence and
# prevents the module from compiling.
#
#   (raganything) PS D:\dsWork\dsProject\dsRagAnything> mineru-models-download
#   Please select the model download source: (huggingface, modelscope) [huggingface]: modelscope
#   Please select the model type to download: (pipeline, vlm, all) [all]: all
#   Downloading all model from modelscope...
#   Downloading model: models/Layout/YOLO/doclayout_yolo_docstructbench_imgsz1280_2501.pt
#   Download failed: Missing dependencies for SOCKS support.
#   (raganything) PS D:\dsWork\dsProject\dsRagAnything> mineru-models-download
#   Please select the model download source: (huggingface, modelscope) [huggingface]: modelscope
#   Please select the model type to download: (pipeline, vlm, all) [all]: all
#   Downloading all model from modelscope...
#   Downloading model: models/Layout/YOLO/doclayout_yolo_docstructbench_imgsz1280_2501.pt
#   Downloading Model from https://www.modelscope.cn to directory: C:\Users\Administrator\.cache\modelscope\hub\models\OpenDataLab\PDF-Extract-Kit-1.0
#   2025-07-04 21:46:23,860 - modelscope - INFO - Got 1 files, start to download ...
#   Downloading [models/Layout/YOLO/doclayout_yolo_docstructbench_imgsz1280_2501.pt]: 100%|| 37.9M/37.9M [00:02<00:00, 15.
#   Processing 1 items: 100%|| 1.00/1.00 [00:02<00:00, 2.64s/it]
#   2025-07-04 21:46:26,507 - modelscope - INFO - Download model 'OpenDataLab/PDF-Extract-Kit-1.0' successfully.
#   2025-07-04 21:46:26,507 - modelscope - INFO - Creating symbolic link [C:\Users\Administrator\.cache\modelscope\hub\models\OpenDataLab\PDF-Extract-Kit-1.0].
#   Downloading model: models/MFD/YOLO/yolo_v8_ft.pt
#   Downloading Model from https://www.modelscope.cn to directory: C:\Users\Administrator\.cache\modelscope\hub\models\OpenDataLab\PDF-Extract-Kit-1.0
#   2025-07-04 21:46:29,616 - modelscope - INFO - Got 1 files, start to download ...
#   Processing 1 items: 0%| | 0.00/1.00 [00:00<?, ?it/s]
#   Downloading [models/MFD/YOLO/yolo_v8_ft.pt]: 31%| | 104M/334M [00:06<00:13, 17.7MB/s]
cmd = [
"mineru",
"-p",
@ -129,7 +157,7 @@ class MineruParser:
@staticmethod
def _read_output_files(
output_dir: Path, file_stem: str
output_dir: Path, file_stem: str
) -> Tuple[List[Dict[str, Any]], str]:
"""
Read the output files generated by mineru
@ -197,11 +225,11 @@ class MineruParser:
@staticmethod
def parse_pdf(
pdf_path: Union[str, Path],
output_dir: Optional[str] = None,
method: str = "auto",
lang: Optional[str] = None,
**kwargs,
pdf_path: Union[str, Path],
output_dir: Optional[str] = None,
method: str = "auto",
lang: Optional[str] = None,
**kwargs,
) -> Tuple[List[Dict[str, Any]], str]:
"""
Parse PDF document using MinerU 2.0
@ -254,10 +282,10 @@ class MineruParser:
@staticmethod
def parse_image(
image_path: Union[str, Path],
output_dir: Optional[str] = None,
lang: Optional[str] = None,
**kwargs,
image_path: Union[str, Path],
output_dir: Optional[str] = None,
lang: Optional[str] = None,
**kwargs,
) -> Tuple[List[Dict[str, Any]], str]:
"""
Parse image document using MinerU 2.0
@ -402,7 +430,7 @@ class MineruParser:
@staticmethod
def parse_office_doc(
doc_path: Union[str, Path], output_dir: Optional[str] = None, **kwargs
doc_path: Union[str, Path], output_dir: Optional[str] = None, **kwargs
) -> Tuple[List[Dict[str, Any]], str]:
"""
Parse office document by first converting to PDF, then parsing with MinerU 2.0
@ -437,64 +465,64 @@ class MineruParser:
if doc_path.suffix.lower() not in supported_office_formats:
raise ValueError(f"Unsupported office format: {doc_path.suffix}")
# Check if LibreOffice is available
libreoffice_available = False
working_libreoffice_cmd = None
try:
result = subprocess.run(
["libreoffice", "--version"],
capture_output=True,
check=True,
timeout=10,
encoding="utf-8",
errors="ignore",
)
libreoffice_available = True
working_libreoffice_cmd = "libreoffice"
print(f"LibreOffice detected: {result.stdout.strip()}")
except (
subprocess.CalledProcessError,
FileNotFoundError,
subprocess.TimeoutExpired,
):
pass
# Try alternative commands for LibreOffice
if not libreoffice_available:
for cmd in ["soffice", "libreoffice"]:
try:
result = subprocess.run(
[cmd, "--version"],
capture_output=True,
check=True,
timeout=10,
encoding="utf-8",
errors="ignore",
)
libreoffice_available = True
working_libreoffice_cmd = cmd
print(
f"LibreOffice detected with command '{cmd}': {result.stdout.strip()}"
)
break
except (
subprocess.CalledProcessError,
FileNotFoundError,
subprocess.TimeoutExpired,
):
continue
if not libreoffice_available:
raise RuntimeError(
"LibreOffice is required for Office document conversion but was not found.\n"
"Please install LibreOffice:\n"
"- Windows: Download from https://www.libreoffice.org/download/download/\n"
"- macOS: brew install --cask libreoffice\n"
"- Ubuntu/Debian: sudo apt-get install libreoffice\n"
"- CentOS/RHEL: sudo yum install libreoffice\n"
"Alternatively, convert the document to PDF manually.\n"
"MinerU 2.0 no longer includes built-in Office document conversion."
)
# # Check if LibreOffice is available
# libreoffice_available = False
working_libreoffice_cmd = 'soffice'
# try:
# result = subprocess.run(
# ["libreoffice", "--version"],
# capture_output=True,
# check=True,
# timeout=10,
# encoding="utf-8",
# errors="ignore",
# )
# libreoffice_available = True
# working_libreoffice_cmd = "libreoffice"
# print(f"LibreOffice detected: {result.stdout.strip()}")
# except (
# subprocess.CalledProcessError,
# FileNotFoundError,
# subprocess.TimeoutExpired,
# ):
# pass
#
# # Try alternative commands for LibreOffice
# if not libreoffice_available:
# for cmd in ["soffice", "libreoffice"]:
# try:
# result = subprocess.run(
# [cmd, "--version"],
# capture_output=True,
# check=True,
# timeout=10,
# encoding="utf-8",
# errors="ignore",
# )
# libreoffice_available = True
# working_libreoffice_cmd = cmd
# print(
# f"LibreOffice detected with command '{cmd}': {result.stdout.strip()}"
# )
# break
# except (
# subprocess.CalledProcessError,
# FileNotFoundError,
# subprocess.TimeoutExpired,
# ):
# continue
#
# if not libreoffice_available:
# raise RuntimeError(
# "LibreOffice is required for Office document conversion but was not found.\n"
# "Please install LibreOffice:\n"
# "- Windows: Download from https://www.libreoffice.org/download/download/\n"
# "- macOS: brew install --cask libreoffice\n"
# "- Ubuntu/Debian: sudo apt-get install libreoffice\n"
# "- CentOS/RHEL: sudo yum install libreoffice\n"
# "Alternatively, convert the document to PDF manually.\n"
# "MinerU 2.0 no longer includes built-in Office document conversion."
# )
# Create temporary directory for PDF conversion
with tempfile.TemporaryDirectory() as temp_dir:
@ -535,6 +563,7 @@ class MineruParser:
if result.returncode == 0:
conversion_successful = True
print(f"Successfully converted {doc_path.name} to PDF")
print(convert_cmd)
break
else:
print(
@ -572,6 +601,7 @@ class MineruParser:
)
# Parse the converted PDF
# TODO
return MineruParser.parse_pdf(
pdf_path=pdf_path, output_dir=output_dir, **kwargs
)
@ -582,7 +612,7 @@ class MineruParser:
@staticmethod
def parse_text_file(
text_path: Union[str, Path], output_dir: Optional[str] = None, **kwargs
text_path: Union[str, Path], output_dir: Optional[str] = None, **kwargs
) -> Tuple[List[Dict[str, Any]], str]:
"""
Parse text file by first converting to PDF, then parsing with MinerU 2.0
@ -752,9 +782,9 @@ class MineruParser:
# Handle tables
if (
"|" in line
and line.strip().startswith("|")
and line.strip().endswith("|")
"|" in line
and line.strip().startswith("|")
and line.strip().endswith("|")
):
if not in_table:
in_table = True
@ -766,15 +796,15 @@ class MineruParser:
# End of table
in_table = False
if (
len(table_lines) >= 2
len(table_lines) >= 2
): # Need at least header and separator
try:
# Parse table
table_data = []
for table_line in table_lines:
if (
"---" in table_line
or "===" in table_line
"---" in table_line
or "===" in table_line
):
continue # Skip separator line
cells = [
@ -1112,11 +1142,11 @@ class MineruParser:
@staticmethod
def parse_document(
file_path: Union[str, Path],
method: str = "auto",
output_dir: Optional[str] = None,
lang: Optional[str] = None,
**kwargs,
file_path: Union[str, Path],
method: str = "auto",
output_dir: Optional[str] = None,
lang: Optional[str] = None,
**kwargs,
) -> Tuple[List[Dict[str, Any]], str]:
"""
Parse document using MinerU 2.0 based on file extension

Loading…
Cancel
Save