You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

247 lines
7.9 KiB

3 weeks ago
#!/usr/bin/env python3
"""
Office document parsing test script - RAG-Anything project

This script demonstrates how to use MinerU to parse various Office document
formats, including:
- DOC/DOCX (Word documents)
- PPT/PPTX (PowerPoint presentations)
- XLS/XLSX (Excel spreadsheets)

Requirements:
1. LibreOffice is installed on the system
2. The RAG-Anything package is installed

Usage:
python office_document_test.py --file path/to/office_document.docx
"""
import argparse
import sys
from pathlib import Path
3 weeks ago
from raganything import RAGAnything, RAGAnythingConfig
3 weeks ago
def check_libreoffice_installation():
    """
    Check whether LibreOffice is installed and usable.

    The launcher names ``libreoffice`` and ``soffice`` are each run with
    ``--version``; the first one that exits successfully counts as a working
    installation. If neither can be started through PATH, the directory named
    by the ``LIBREOFFICE_PATH`` environment variable and the default Windows
    install directory are searched for a ``soffice`` executable. When nothing
    is found, per-platform installation hints are printed.

    Returns:
        bool: True if LibreOffice was found, otherwise False.
    """
    import os
    import subprocess

    # Try the different command names LibreOffice may be installed under.
    for cmd in ("libreoffice", "soffice"):
        try:
            result = subprocess.run(
                [cmd, "--version"],
                capture_output=True,
                check=True,
                timeout=10,
            )
        except (
            subprocess.CalledProcessError,
            subprocess.TimeoutExpired,
            OSError,
        ):
            # OSError covers FileNotFoundError (command not on PATH) as well
            # as permission problems; either way this name is unusable.
            continue
        # errors="replace" keeps an unusual console encoding from turning a
        # successful probe into a UnicodeDecodeError.
        version = result.stdout.decode(errors="replace").strip()
        print(f"✅ 找到LibreOffice: {version}")
        return True

    # Installations that are not on PATH (typical on Windows) are still
    # usable, so look for the executable in explicitly known directories.
    install_dirs = (
        os.environ.get("LIBREOFFICE_PATH"),
        "C:\\Program Files\\LibreOffice\\program",
    )
    for directory in install_dirs:
        if not directory:
            continue
        for exe_name in ("soffice.exe", "soffice"):
            candidate = os.path.join(directory, exe_name)
            if os.path.isfile(candidate):
                print(f"✅ 找到LibreOffice: {candidate}")
                return True

    # Nothing found: show installation instructions.
    print("❌ 未找到LibreOffice. 请安装LibreOffice:")
    print(" - Windows: 从 https://www.libreoffice.org/download/download/ 下载")
    print(" - macOS: brew install --cask libreoffice")
    print(" - Ubuntu/Debian: sudo apt-get install libreoffice")
    print(" - CentOS/RHEL: sudo yum install libreoffice")
    return False
3 weeks ago
def test_office_document_parsing(file_path: str, output_dir: str = "./test_output"):
    """
    Parse one Office document through RAGAnything and print a report.

    The path is validated first (it must exist and carry one of the supported
    Office extensions). The document is then handed to
    ``RAGAnything.parse_document`` and the returned content list and Markdown
    text are summarised on stdout.

    Args:
        file_path (str): Path of the Office document to test.
        output_dir (str): Directory that receives the parser output. It is
            resolved to an absolute path and created (with parents) if needed.

    Returns:
        bool: True if parsing succeeded, otherwise False. Exceptions raised
        while parsing are caught, printed with a traceback, and reported as
        False.
    """
    import os

    # Only provide a default: an explicit LIBREOFFICE_PATH set by the user
    # must win, and the Windows location is useless where it does not exist.
    # NOTE(review): presumably read by the Office-to-PDF conversion tooling;
    # nothing in this file reads it back - confirm.
    default_office_dir = "C:\\Program Files\\LibreOffice\\program"
    if os.path.isdir(default_office_dir):
        os.environ.setdefault("LIBREOFFICE_PATH", default_office_dir)

    print(f"🧪 测试Office文档解析: {file_path}")

    # Check that the file exists and is a supported Office format.
    file_path = Path(file_path)
    if not file_path.exists():
        print(f"❌ 文件不存在: {file_path}")
        return False

    supported_extensions = {".doc", ".docx", ".ppt", ".pptx", ".xls", ".xlsx"}
    if file_path.suffix.lower() not in supported_extensions:
        print(f"❌ 不支持的文档格式: {file_path.suffix}")
        # Sorted so the listing is the same on every run (set order is not).
        print(f" 支持的格式: {', '.join(sorted(supported_extensions))}")
        return False

    # Basic document information.
    print(f"📄 文档格式: {file_path.suffix.upper()}")
    print(f"📏 文件大小: {file_path.stat().st_size / 1024:.1f} KB")

    # Initialise RAGAnything (used for its parsing functionality only).
    config = RAGAnythingConfig(working_dir="./temp_parsing_test")
    rag = RAGAnything(config=config)

    try:
        # Imported here so that an import failure is reported by the handler
        # below instead of aborting the script.
        from raganything.mineru_parser import MineruParser

        if not MineruParser.check_installation():
            print("❌ MinerU未正确安装")
            return False

        # An absolute path makes the output location unambiguous.
        out_dir = Path(output_dir).resolve()
        out_dir.mkdir(parents=True, exist_ok=True)

        print("\n🔄 使用MinerU测试文档解析...")
        content_list, md_content = rag.parse_document(
            file_path=str(file_path),
            output_dir=str(out_dir),
            parse_method="auto",
            display_stats=True,
        )

        # Look for a converted PDF only after parsing has run, and in the
        # directory the parser was actually given.
        pdf_path = out_dir / f"{file_path.stem}.pdf"
        print(f"PDF转换路径: {pdf_path}")
        if pdf_path.exists():
            print(f"✅ PDF已生成大小: {pdf_path.stat().st_size}字节")
        else:
            # Absence here is not proof of failure: this code cannot tell
            # where (or whether) the parser keeps an intermediate PDF.
            print("⚠️ 输出目录中未找到PDF文件")

        # Show what ended up in the output directory.
        print(f"\n📂 输出目录内容({out_dir}):")
        for entry in sorted(out_dir.iterdir()):
            print(f" - {entry.name}")

        print("✅ 解析成功!")
        _report_parsed_content(content_list, md_content)

        print("\n🎉 Office文档解析测试成功完成!")
        print(f"📁 输出文件保存到: {out_dir}")
        return True

    except Exception as e:
        import traceback

        print(f"\n❌ Office文档解析失败: {str(e)}")
        print(f" 完整错误: {traceback.format_exc()}")
        return False


def _report_parsed_content(content_list, md_content):
    """Print block counts, type distribution and text/image/table samples."""
    from collections import Counter

    print(f" 📊 内容块数量: {len(content_list)}")
    print(f" 📝 Markdown长度: {len(md_content)} 字符")

    # Only dict entries carry the "type" key inspected below.
    blocks = [item for item in content_list if isinstance(item, dict)]

    # Distribution of content types.
    type_counts = Counter(item.get("type", "unknown") for item in blocks)
    if type_counts:
        print(" 📋 内容类型分布:")
        # key=str keeps sorting from failing if a type value is not a string.
        for content_type, count in sorted(
            type_counts.items(), key=lambda pair: str(pair[0])
        ):
            print(f"{content_type}: {count}")

    # Preview of the parsed Markdown.
    stripped_md = md_content.strip()
    if stripped_md:
        print("\n📄 解析内容预览(前500字符):")
        print(f" {stripped_md[:500]}{'...' if len(stripped_md) > 500 else ''}")

    # Up to three sample text blocks.
    text_items = [item for item in blocks if item.get("type") == "text"]
    if text_items:
        print("\n📝 文本块示例:")
        for i, item in enumerate(text_items[:3], 1):
            text_content = str(item.get("text") or "").strip()
            if text_content:
                print(
                    f" {i}. {text_content[:200]}"
                    f"{'...' if len(text_content) > 200 else ''}"
                )

    # Images.
    image_items = [item for item in blocks if item.get("type") == "image"]
    if image_items:
        print(f"\n🖼️ 找到 {len(image_items)} 张图片:")
        for i, item in enumerate(image_items, 1):
            print(f" {i}. 图片路径: {item.get('img_path', 'N/A')}")

    # Tables.
    table_items = [item for item in blocks if item.get("type") == "table"]
    if table_items:
        print(f"\n📊 找到 {len(table_items)} 个表格:")
        for i, item in enumerate(table_items, 1):
            table_body = str(item.get("table_body") or "")
            # Count non-blank lines so that an empty body is 0 rows and a
            # trailing newline does not add a phantom row.
            row_count = sum(1 for line in table_body.splitlines() if line.strip())
            print(f" {i}. 包含 {row_count} 行的表格")


# The name matches pytest's ``test_*`` pattern, but the function needs a real
# document path rather than a fixture; opt out of collection so a pytest run
# over this file does not fail with "fixture 'file_path' not found".
test_office_document_parsing.__test__ = False
def main(argv=None):
    """
    Script entry point: handle command-line arguments and run the test.

    Args:
        argv: Optional list of command-line arguments. When None, argparse
            reads them from ``sys.argv``.

    Returns:
        int: 0 if the document was parsed successfully, 1 otherwise
        (LibreOffice missing, parsing failed, user interrupt, or an
        unexpected error).
    """
    # Document used when no --file argument is given.
    default_file = r"../Txt/小学数学教学中的若干问题_MATH_1.docx"

    parser = argparse.ArgumentParser(description="使用MinerU测试Office文档解析")
    parser.add_argument(
        "--file",
        default=default_file,
        help="要解析的Office文档路径",
    )
    args = parser.parse_args(argv)
    file_path = args.file

    # Check the LibreOffice installation first.
    print("🔧 检查LibreOffice安装状态...")
    if not check_libreoffice_installation():
        return 1

    # Run the parsing test.
    try:
        success = test_office_document_parsing(file_path)
        return 0 if success else 1
    except KeyboardInterrupt:
        print("\n⏹️ 测试被用户中断")
        return 1
    except Exception as e:
        # Top-level boundary of the script: report and turn into an exit code.
        print(f"\n❌ 发生意外错误: {str(e)}")
        return 1
# Run main() only when executed as a script and use its return value as the
# process exit status.
if __name__ == "__main__":
    raise SystemExit(main())
3 weeks ago