|
|
|
@ -1,47 +1,34 @@
|
|
|
|
|
#!/usr/bin/env python3
|
|
|
|
|
"""
|
|
|
|
|
Office文档解析测试脚本 - RAG-Anything项目
|
|
|
|
|
Office Document Parsing Test Script for RAG-Anything
|
|
|
|
|
|
|
|
|
|
本脚本演示如何使用MinerU解析各种Office文档格式,包括:
|
|
|
|
|
- DOC/DOCX (Word文档)
|
|
|
|
|
- PPT/PPTX (PowerPoint演示文稿)
|
|
|
|
|
- XLS/XLSX (Excel电子表格)
|
|
|
|
|
This script demonstrates how to parse various Office document formats
|
|
|
|
|
using MinerU, including DOC, DOCX, PPT, PPTX, XLS, and XLSX files.
|
|
|
|
|
|
|
|
|
|
要求:
|
|
|
|
|
1. 系统已安装LibreOffice
|
|
|
|
|
2. 已安装RAG-Anything包
|
|
|
|
|
Requirements:
|
|
|
|
|
- LibreOffice installed on the system
|
|
|
|
|
- RAG-Anything package
|
|
|
|
|
|
|
|
|
|
使用方法:
|
|
|
|
|
python office_document_test.py --file 办公文档路径.docx
|
|
|
|
|
Usage:
|
|
|
|
|
python office_document_test.py --file path/to/office/document.docx
|
|
|
|
|
"""
|
|
|
|
|
|
|
|
|
|
import argparse
|
|
|
|
|
import sys
|
|
|
|
|
from pathlib import Path
|
|
|
|
|
from raganything import RAGAnything, RAGAnythingConfig
|
|
|
|
|
from raganything import RAGAnything
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def check_libreoffice_installation():
    """Check whether a LibreOffice executable can be launched by name.

    Runs ``libreoffice --version`` and then ``soffice --version``. The first
    command that exits with status 0 within 10 seconds counts as a working
    installation. When neither command works, installation hints are printed.

    Returns:
        bool: True if one of the commands ran successfully, False otherwise.
    """
    import subprocess

    for cmd in ("libreoffice", "soffice"):
        try:
            result = subprocess.run(
                [cmd, "--version"], capture_output=True, check=True, timeout=10
            )
        except (
            subprocess.CalledProcessError,
            subprocess.TimeoutExpired,
            OSError,
        ):
            # Non-zero exit status, a hang past the timeout, or a binary that
            # is missing / cannot be started: fall through to the next name.
            continue

        # The banner's encoding is platform dependent; never let a decoding
        # problem turn a successful probe into a crash.
        version = result.stdout.decode(errors="replace").strip()
        print(f"✅ LibreOffice found: {version}")
        return True

    print("❌ LibreOffice not found. Please install LibreOffice:")
    print(" - Windows: Download from https://www.libreoffice.org/download/download/")
    print(" - macOS: brew install --cask libreoffice")
    print(" - Ubuntu/Debian: sudo apt-get install libreoffice")
    print(" - CentOS/RHEL: sudo yum install libreoffice")
    return False
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _items_of_type(content_list, wanted_type):
    """Return the dict entries of ``content_list`` whose "type" is ``wanted_type``."""
    return [
        item
        for item in content_list
        if isinstance(item, dict) and item.get("type") == wanted_type
    ]


def _print_parse_summary(content_list, md_content):
    """Print type counts, a Markdown preview and text/image/table samples."""
    # Count content blocks per "type"; non-dict entries are ignored.
    content_types = {}
    for item in content_list:
        if isinstance(item, dict):
            content_type = item.get("type", "unknown")
            content_types[content_type] = content_types.get(content_type, 0) + 1

    if content_types:
        print(" 📋 Content distribution:")
        # Sort on the string form so an unexpected non-string type value
        # cannot break the ordering.
        for content_type, count in sorted(
            content_types.items(), key=lambda pair: str(pair[0])
        ):
            print(f" • {content_type}: {count}")

    # The ellipsis is decided on the stripped text, i.e. on what is shown.
    stripped_md = md_content.strip()
    if stripped_md:
        print("\n📄 Parsed content preview (first 500 characters):")
        print(f" {stripped_md[:500]}{'...' if len(stripped_md) > 500 else ''}")

    text_items = _items_of_type(content_list, "text")
    if text_items:
        print("\n📝 Sample text blocks:")
        for i, item in enumerate(text_items[:3], 1):
            # A missing or None "text" value is treated as empty.
            text_content = (item.get("text") or "").strip()
            if text_content:
                preview = text_content[:200]
                print(
                    f" {i}. {preview}{'...' if len(text_content) > 200 else ''}"
                )

    image_items = _items_of_type(content_list, "image")
    if image_items:
        print(f"\n🖼️ Found {len(image_items)} image(s):")
        for i, item in enumerate(image_items, 1):
            print(f" {i}. Image path: {item.get('img_path', 'N/A')}")

    table_items = _items_of_type(content_list, "table")
    if table_items:
        print(f"\n📊 Found {len(table_items)} table(s):")
        for i, item in enumerate(table_items, 1):
            table_body = item.get("table_body") or ""
            # An empty body has zero rows (splitting "" would report one).
            row_count = len(str(table_body).splitlines())
            print(f" {i}. Table with {row_count} rows")


def test_office_document_parsing(file_path: str):
    """Parse one Office document through RAGAnything and print a summary.

    The file must exist and have one of the supported extensions
    (.doc, .docx, .ppt, .pptx, .xls, .xlsx). Parser output is written to
    ``./test_output``.

    Args:
        file_path: Path to the Office document to parse.

    Returns:
        bool: True when parsing completed; False when the file is missing,
        has an unsupported extension, or parsing raised an exception.
    """
    print(f"🧪 Testing Office document parsing: {file_path}")

    # Check if file exists and is a supported Office format
    file_path = Path(file_path)
    if not file_path.exists():
        print(f"❌ File does not exist: {file_path}")
        return False

    supported_extensions = {".doc", ".docx", ".ppt", ".pptx", ".xls", ".xlsx"}
    if file_path.suffix.lower() not in supported_extensions:
        print(f"❌ Unsupported file format: {file_path.suffix}")
        # Sorted so the message is the same on every run.
        print(f" Supported formats: {', '.join(sorted(supported_extensions))}")
        return False

    print(f"📄 File format: {file_path.suffix.upper()}")
    print(f"📏 File size: {file_path.stat().st_size / 1024:.1f} KB")

    # Initialize RAGAnything (only for parsing functionality)
    rag = RAGAnything()

    try:
        # Test document parsing with MinerU
        print("\n🔄 Testing document parsing with MinerU...")
        content_list, md_content = rag.parse_document(
            file_path=str(file_path),
            output_dir="./test_output",
            parse_method="auto",
            display_stats=True,
        )

        print("✅ Parsing successful!")
        print(f" 📊 Content blocks: {len(content_list)}")
        print(f" 📝 Markdown length: {len(md_content)} characters")

        _print_parse_summary(content_list, md_content)

        print("\n🎉 Office document parsing test completed successfully!")
        print("📁 Output files saved to: ./test_output")
        return True

    except Exception as e:
        # Top-level boundary of the test: report the failure with a full
        # traceback and signal it through the return value.
        print(f"\n❌ Office document parsing failed: {str(e)}")
        import traceback

        print(f" Full error: {traceback.format_exc()}")
        return False
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def main(argv=None):
    """Run the Office document parsing test from the command line.

    Args:
        argv: Optional list of command-line arguments (without the program
            name). ``None`` lets argparse read ``sys.argv[1:]``.

    Returns:
        int: 0 when the parsing test succeeded, 1 when it failed, was
        interrupted by the user, or raised an unexpected error.
    """
    # Previously hard-coded input, kept as the default so that running the
    # script without arguments behaves as before.
    # NOTE(review): this path is machine specific; pass --file elsewhere.
    default_file = (
        r"D:\dsWork\dsProject\dsRagAnything\Txt\小学数学教学中的若干问题_MATH_1.docx"
    )

    parser = argparse.ArgumentParser(
        description="Test Office document parsing with MinerU"
    )
    parser.add_argument(
        "--file",
        default=default_file,
        help="Path to the Office document to parse",
    )
    args = parser.parse_args(argv)

    # Run the parsing test
    try:
        success = test_office_document_parsing(args.file)
        return 0 if success else 1
    except KeyboardInterrupt:
        print("\n⏹️ Test interrupted by user")
        return 1
    except Exception as e:
        print(f"\n❌ Unexpected error: {str(e)}")
        return 1
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Script entry point: main()'s integer result becomes the process exit status.
if __name__ == "__main__":
    exit_status = main()
    sys.exit(exit_status)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|