You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

247 lines
7.9 KiB

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

#!/usr/bin/env python3
"""
Office文档解析测试脚本 - RAG-Anything项目
本脚本演示如何使用MinerU解析各种Office文档格式包括
- DOC/DOCX (Word文档)
- PPT/PPTX (PowerPoint演示文稿)
- XLS/XLSX (Excel电子表格)
要求:
1. 系统已安装LibreOffice
2. 已安装RAG-Anything包
使用方法:
python office_document_test.py --file 办公文档路径.docx
"""
import argparse
import sys
from pathlib import Path
from raganything import RAGAnything, RAGAnythingConfig
def check_libreoffice_installation(*, strict: bool = False) -> bool:
    """Report whether LibreOffice should be treated as available.

    By default no probing is done and ``True`` is returned, which is what this
    script did before (the probe had been disabled by wrapping it in a string
    literal). Pass ``strict=True`` to actually run the probe.

    Args:
        strict: When true, try ``libreoffice --version`` and then
            ``soffice --version``; the first command that exits successfully
            counts as a working installation.

    Returns:
        bool: ``True`` when ``strict`` is false, or when one of the probed
        commands ran successfully; ``False`` when ``strict`` is true and no
        command could be run (installation hints are printed in that case).
    """
    if not strict:
        # Probe intentionally skipped: callers rely on the script proceeding
        # even when LibreOffice is not reachable through PATH.
        return True

    import subprocess

    # The launcher is called "libreoffice" or "soffice" depending on platform.
    for cmd in ("libreoffice", "soffice"):
        try:
            result = subprocess.run(
                [cmd, "--version"],
                capture_output=True,
                check=True,
                timeout=10,
            )
        except (
            subprocess.CalledProcessError,
            OSError,  # missing executable, not executable, etc.
            subprocess.TimeoutExpired,
        ):
            continue
        # Be tolerant of non-UTF-8 bytes in the version banner.
        version = result.stdout.decode(errors="replace").strip()
        print(f"✅ 找到LibreOffice: {version}")
        return True

    # Nothing worked: show installation guidance.
    print("❌ 未找到LibreOffice. 请安装LibreOffice:")
    print(" - Windows: 从 https://www.libreoffice.org/download/download/ 下载")
    print(" - macOS: brew install --cask libreoffice")
    print(" - Ubuntu/Debian: sudo apt-get install libreoffice")
    print(" - CentOS/RHEL: sudo yum install libreoffice")
    return False
# Fallback LibreOffice location (default Windows install path); only applied
# when LIBREOFFICE_PATH is not already set in the environment.
_DEFAULT_LIBREOFFICE_PATH = "C:\\Program Files\\LibreOffice\\program"

# Output directory this script has historically written to. It is specific to
# the original author's machine; pass ``output_dir`` to use another location.
_DEFAULT_OUTPUT_DIR = "d:/dsWork/dsProject/dsRagAnything/Tools/test_output"

# Office extensions accepted by the test, in a fixed display order.
_SUPPORTED_EXTENSIONS = (".doc", ".docx", ".ppt", ".pptx", ".xls", ".xlsx")


def _items_of_type(content_list, item_type):
    """Return the dict entries of *content_list* whose "type" is *item_type*."""
    return [
        item
        for item in content_list
        if isinstance(item, dict) and item.get("type") == item_type
    ]


def _clip(text, limit):
    """Return *text* stripped and cut to *limit* chars, with "..." if it was cut."""
    stripped = text.strip()
    return f"{stripped[:limit]}{'...' if len(stripped) > limit else ''}"


def _report_converted_pdf(output_dir, stem):
    """Print whether ``<stem>.pdf`` is present in *output_dir* (informational only)."""
    pdf_path = output_dir / f"{stem}.pdf"
    print(f"PDF转换路径: {pdf_path}")
    if pdf_path.exists():
        print(f"✅ PDF已生成大小: {pdf_path.stat().st_size}字节")
    else:
        # Not an error: parsing already succeeded when this is called.
        print("ℹ️ 输出目录中未找到转换后的PDF")


def _print_parse_summary(content_list, md_content):
    """Print block counts, type distribution and short previews of a parse result."""
    from collections import Counter

    print("✅ 解析成功!")
    print(f" 📊 内容块数量: {len(content_list)}")
    print(f" 📝 Markdown长度: {len(md_content)} 字符")

    # Distribution of content types across the parsed blocks.
    type_counts = Counter(
        item.get("type", "unknown")
        for item in content_list
        if isinstance(item, dict)
    )
    if type_counts:
        print(" 📋 内容类型分布:")
        for content_type, count in sorted(type_counts.items()):
            print(f"{content_type}: {count}")

    # Preview of the generated Markdown.
    if md_content.strip():
        print("\n📄 解析内容预览(前500字符):")
        print(f" {_clip(md_content, 500)}")

    # Up to three sample text blocks.
    text_items = _items_of_type(content_list, "text")
    if text_items:
        print("\n📝 文本块示例:")
        for i, item in enumerate(text_items[:3], 1):
            text_content = item.get("text", "")
            if text_content.strip():
                print(f" {i}. {_clip(text_content, 200)}")

    # Images found in the document.
    image_items = _items_of_type(content_list, "image")
    if image_items:
        print(f"\n🖼️ 找到 {len(image_items)} 张图片:")
        for i, item in enumerate(image_items, 1):
            print(f" {i}. 图片路径: {item.get('img_path', 'N/A')}")

    # Tables found in the document.
    table_items = _items_of_type(content_list, "table")
    if table_items:
        print(f"\n📊 找到 {len(table_items)} 个表格:")
        for i, item in enumerate(table_items, 1):
            # A missing/None body must not turn a successful parse into a failure.
            table_body = item.get("table_body") or ""
            row_count = len(table_body.split("\n"))
            print(f" {i}. 包含 {row_count} 行的表格")


def test_office_document_parsing(
    file_path: str, output_dir: str = _DEFAULT_OUTPUT_DIR
) -> bool:
    """Parse one Office document through RAGAnything and print a report.

    Despite its name this is a script helper, not a pytest test case (see the
    ``__test__`` marker set below the function).

    Args:
        file_path: Path of the Office document to parse. It must exist and
            have one of the extensions in ``_SUPPORTED_EXTENSIONS``.
        output_dir: Directory handed to ``rag.parse_document``; it is created
            (including parents) when missing. Defaults to the location the
            script used previously.

    Returns:
        bool: ``True`` when parsing completed; ``False`` when the file is
        missing, has an unsupported extension, MinerU reports it is not
        installed, or any exception is raised while parsing (the traceback is
        printed in that case).
    """
    import os
    import traceback

    # Keep a LIBREOFFICE_PATH the user already configured; only fill the gap.
    os.environ.setdefault("LIBREOFFICE_PATH", _DEFAULT_LIBREOFFICE_PATH)

    print(f"🧪 测试Office文档解析: {file_path}")

    # Validate the input before touching the parser.
    file_path = Path(file_path)
    if not file_path.exists():
        print(f"❌ 文件不存在: {file_path}")
        return False

    if file_path.suffix.lower() not in _SUPPORTED_EXTENSIONS:
        print(f"❌ 不支持的文档格式: {file_path.suffix}")
        print(f" 支持的格式: {', '.join(_SUPPORTED_EXTENSIONS)}")
        return False

    # Basic document facts.
    print(f"📄 文档格式: {file_path.suffix.upper()}")
    print(f"📏 文件大小: {file_path.stat().st_size / 1024:.1f} KB")

    # RAGAnything is only used for its parsing entry point here.
    config = RAGAnythingConfig(working_dir="./temp_parsing_test")
    rag = RAGAnything(config=config)

    try:
        from raganything.mineru_parser import MineruParser

        if not MineruParser.check_installation():
            print("❌ MinerU未正确安装")
            return False

        # Absolute path so the reported location is unambiguous.
        out_dir = Path(output_dir).absolute()
        out_dir.mkdir(parents=True, exist_ok=True)

        print("\n🔄 使用MinerU测试文档解析...")
        content_list, md_content = rag.parse_document(
            file_path=str(file_path),
            output_dir=str(out_dir),
            parse_method="auto",
            display_stats=True,
        )

        # Only meaningful after parsing, and only in the real output directory.
        _report_converted_pdf(out_dir, file_path.stem)

        print(f"\n📂 输出目录内容({out_dir}):")
        for entry in sorted(out_dir.iterdir()):
            print(f" - {entry.name}")

        _print_parse_summary(content_list, md_content)

        print("\n🎉 Office文档解析测试成功完成!")
        print(f"📁 输出文件保存到: {out_dir}")
        return True
    except Exception as e:
        # Boundary of the script's test run: report and signal failure.
        print(f"\n❌ Office文档解析失败: {str(e)}")
        print(f" 完整错误: {traceback.format_exc()}")
        return False


# The name starts with "test_" and the script is named "*_test.py", so pytest
# would otherwise collect this helper and fail on the missing "file_path" fixture.
test_office_document_parsing.__test__ = False
def main(argv=None):
    """Run the Office parsing test from the command line.

    Args:
        argv: Argument list to parse instead of ``sys.argv[1:]`` (``None``
            means use the process arguments). Supports ``--file`` / ``-f``;
            when omitted, the previously hard-coded sample document is used.

    Returns:
        int: ``0`` when the document was parsed successfully, ``1`` when the
        LibreOffice check fails, parsing fails, the run is interrupted, or an
        unexpected exception is raised.
    """
    parser = argparse.ArgumentParser(description="使用MinerU测试Office文档解析")
    parser.add_argument(
        "--file",
        "-f",
        # Default keeps the behaviour of running the script with no arguments.
        default=r"../Txt/小学数学教学中的若干问题_MATH_1.docx",
        help="要解析的Office文档路径",
    )
    args = parser.parse_args(argv)

    # Check the LibreOffice installation first.
    print("🔧 检查LibreOffice安装状态...")
    if not check_libreoffice_installation():
        return 1

    # Run the parsing test and map its outcome to an exit status.
    try:
        success = test_office_document_parsing(args.file)
        return 0 if success else 1
    except KeyboardInterrupt:
        print("\n⏹️ 测试被用户中断")
        return 1
    except Exception as e:
        # Top-level boundary: report instead of dumping a traceback.
        print(f"\n❌ 发生意外错误: {str(e)}")
        return 1
if __name__ == "__main__":
    # Hand main()'s return value to the interpreter as the process exit status.
    raise SystemExit(main())