main
HuangHai 3 weeks ago
parent c1dd5684a1
commit 3a9fbad0ff

@ -8,4 +8,33 @@ conda create -n raganything python=3.10
conda activate raganything
# 下一步需要测试的库
https://github.com/HKUDS/VideoRAG
# 添加到PATH
C:\Program Files\LibreOffice\program
# Office document parsing test (MinerU only)
python examples/office_document_test.py --file path/to/document.docx
# Check LibreOffice installation
python examples/office_document_test.py --check-libreoffice --file dummy
# End-to-end processing
python examples/raganything_example.py path/to/document.pdf --api-key YOUR_API_KEY
# Direct modal processing
python examples/modalprocessors_example.py --api-key YOUR_API_KEY
# Image format parsing test (MinerU only)
python examples/image_format_test.py --file path/to/image.bmp
# Text format parsing test (MinerU only)
python examples/text_format_test.py --file path/to/document.md
# Check PIL/Pillow installation
python examples/image_format_test.py --check-pillow --file dummy
# Check ReportLab installation
python examples/text_format_test.py --check-reportlab --file dummy

@ -1,16 +1,18 @@
#!/usr/bin/env python3 #!/usr/bin/env python3
""" """
Office Document Parsing Test Script for RAG-Anything Office文档解析测试脚本 - RAG-Anything项目
This script demonstrates how to parse various Office document formats 本脚本演示如何使用MinerU解析各种Office文档格式包括
using MinerU, including DOC, DOCX, PPT, PPTX, XLS, and XLSX files. - DOC/DOCX (Word文档)
- PPT/PPTX (PowerPoint演示文稿)
- XLS/XLSX (Excel电子表格)
Requirements: 要求
- LibreOffice installed on the system 1. 系统已安装LibreOffice
- RAG-Anything package 2. 已安装RAG-Anything包
Usage: 使用方法
python office_document_test.py --file path/to/office/document.docx python office_document_test.py --file 办公文档路径.docx
""" """
import argparse import argparse
@ -20,15 +22,24 @@ from raganything import RAGAnything
def check_libreoffice_installation(): def check_libreoffice_installation():
"""Check if LibreOffice is installed and available""" """
检查LibreOffice是否已安装并可用
返回:
bool: 如果LibreOffice可用返回True否则返回False
"""
import subprocess import subprocess
# 尝试不同的LibreOffice命令名称
for cmd in ["libreoffice", "soffice"]: for cmd in ["libreoffice", "soffice"]:
try: try:
result = subprocess.run( result = subprocess.run(
[cmd, "--version"], capture_output=True, check=True, timeout=10 [cmd, "--version"],
capture_output=True,
check=True,
timeout=10
) )
print(f"✅ LibreOffice found: {result.stdout.decode().strip()}") print(f"找到LibreOffice: {result.stdout.decode().strip()}")
return True return True
except ( except (
subprocess.CalledProcessError, subprocess.CalledProcessError,
@ -37,8 +48,9 @@ def check_libreoffice_installation():
): ):
continue continue
print("❌ LibreOffice not found. Please install LibreOffice:") # 如果未找到LibreOffice显示安装指南
print(" - Windows: Download from https://www.libreoffice.org/download/download/") print("❌ 未找到LibreOffice. 请安装LibreOffice:")
print(" - Windows: 从 https://www.libreoffice.org/download/download/ 下载")
print(" - macOS: brew install --cask libreoffice") print(" - macOS: brew install --cask libreoffice")
print(" - Ubuntu/Debian: sudo apt-get install libreoffice") print(" - Ubuntu/Debian: sudo apt-get install libreoffice")
print(" - CentOS/RHEL: sudo yum install libreoffice") print(" - CentOS/RHEL: sudo yum install libreoffice")
@ -46,31 +58,42 @@ def check_libreoffice_installation():
def test_office_document_parsing(file_path: str): def test_office_document_parsing(file_path: str):
"""Test Office document parsing with MinerU""" """
测试Office文档解析功能
print(f"🧪 Testing Office document parsing: {file_path}")
参数:
# Check if file exists and is a supported Office format file_path (str): 要测试的Office文档路径
返回:
bool: 解析成功返回True否则返回False
"""
print(f"🧪 测试Office文档解析: {file_path}")
# 检查文件是否存在且是支持的Office格式
file_path = Path(file_path) file_path = Path(file_path)
if not file_path.exists(): if not file_path.exists():
print(f"❌ File does not exist: {file_path}") print(f"文件不存在: {file_path}")
return False return False
# 支持的文档扩展名列表
supported_extensions = {".doc", ".docx", ".ppt", ".pptx", ".xls", ".xlsx"} supported_extensions = {".doc", ".docx", ".ppt", ".pptx", ".xls", ".xlsx"}
if file_path.suffix.lower() not in supported_extensions: if file_path.suffix.lower() not in supported_extensions:
print(f"Unsupported file format: {file_path.suffix}") print(f"不支持的文档格式: {file_path.suffix}")
print(f" Supported formats: {', '.join(supported_extensions)}") print(f" 支持的格式: {', '.join(supported_extensions)}")
return False return False
print(f"📄 File format: {file_path.suffix.upper()}") # 显示文档基本信息
print(f"📏 File size: {file_path.stat().st_size / 1024:.1f} KB") print(f"📄 文档格式: {file_path.suffix.upper()}")
print(f"📏 文件大小: {file_path.stat().st_size / 1024:.1f} KB")
# Initialize RAGAnything (only for parsing functionality) # 初始化RAGAnything(仅用于解析功能)
rag = RAGAnything(working_dir="./temp_parsing_test") from raganything.config import RAGAnythingConfig
config = RAGAnythingConfig(working_dir="./temp_parsing_test")
rag = RAGAnything(config=config)
try: try:
# Test document parsing with MinerU # 使用MinerU测试文档解析
print("\n🔄 Testing document parsing with MinerU...") print("\n🔄 使用MinerU测试文档解析...")
content_list, md_content = rag.parse_document( content_list, md_content = rag.parse_document(
file_path=str(file_path), file_path=str(file_path),
output_dir="./test_output", output_dir="./test_output",
@ -78,11 +101,11 @@ def test_office_document_parsing(file_path: str):
display_stats=True, display_stats=True,
) )
print("Parsing successful!") print("解析成功!")
print(f" 📊 Content blocks: {len(content_list)}") print(f" 📊 内容块数量: {len(content_list)}")
print(f" 📝 Markdown length: {len(md_content)} characters") print(f" 📝 Markdown长度: {len(md_content)} 字符")
# Analyze content types # 分析内容类型分布
content_types = {} content_types = {}
for item in content_list: for item in content_list:
if isinstance(item, dict): if isinstance(item, dict):
@ -90,24 +113,24 @@ def test_office_document_parsing(file_path: str):
content_types[content_type] = content_types.get(content_type, 0) + 1 content_types[content_type] = content_types.get(content_type, 0) + 1
if content_types: if content_types:
print(" 📋 Content distribution:") print(" 📋 内容类型分布:")
for content_type, count in sorted(content_types.items()): for content_type, count in sorted(content_types.items()):
print(f"{content_type}: {count}") print(f"{content_type}: {count}")
# Display some parsed content preview # 显示解析内容预览
if md_content.strip(): if md_content.strip():
print("\n📄 Parsed content preview (first 500 characters):") print("\n📄 解析内容预览(前500字符):")
preview = md_content.strip()[:500] preview = md_content.strip()[:500]
print(f" {preview}{'...' if len(md_content) > 500 else ''}") print(f" {preview}{'...' if len(md_content) > 500 else ''}")
# Display some structured content examples # 显示文本块示例
text_items = [ text_items = [
item item
for item in content_list for item in content_list
if isinstance(item, dict) and item.get("type") == "text" if isinstance(item, dict) and item.get("type") == "text"
] ]
if text_items: if text_items:
print("\n📝 Sample text blocks:") print("\n📝 文本块示例:")
for i, item in enumerate(text_items[:3], 1): for i, item in enumerate(text_items[:3], 1):
text_content = item.get("text", "") text_content = item.get("text", "")
if text_content.strip(): if text_content.strip():
@ -116,76 +139,65 @@ def test_office_document_parsing(file_path: str):
f" {i}. {preview}{'...' if len(text_content) > 200 else ''}" f" {i}. {preview}{'...' if len(text_content) > 200 else ''}"
) )
# Check for images # 检查图片内容
image_items = [ image_items = [
item item
for item in content_list for item in content_list
if isinstance(item, dict) and item.get("type") == "image" if isinstance(item, dict) and item.get("type") == "image"
] ]
if image_items: if image_items:
print(f"\n🖼️ Found {len(image_items)} image(s):") print(f"\n🖼️ 找到 {len(image_items)} 张图片:")
for i, item in enumerate(image_items, 1): for i, item in enumerate(image_items, 1):
print(f" {i}. Image path: {item.get('img_path', 'N/A')}") print(f" {i}. 图片路径: {item.get('img_path', 'N/A')}")
# Check for tables # 检查表格内容
table_items = [ table_items = [
item item
for item in content_list for item in content_list
if isinstance(item, dict) and item.get("type") == "table" if isinstance(item, dict) and item.get("type") == "table"
] ]
if table_items: if table_items:
print(f"\n📊 Found {len(table_items)} table(s):") print(f"\n📊 找到 {len(table_items)} 个表格:")
for i, item in enumerate(table_items, 1): for i, item in enumerate(table_items, 1):
table_body = item.get("table_body", "") table_body = item.get("table_body", "")
row_count = len(table_body.split("\n")) row_count = len(table_body.split("\n"))
print(f" {i}. Table with {row_count} rows") print(f" {i}. 包含 {row_count} 行的表格")
print("\n🎉 Office document parsing test completed successfully!") print("\n🎉 Office文档解析测试成功完成!")
print("📁 Output files saved to: ./test_output") print("📁 输出文件保存到: ./test_output")
return True return True
except Exception as e: except Exception as e:
print(f"\n❌ Office document parsing failed: {str(e)}") print(f"\n❌ Office文档解析失败: {str(e)}")
import traceback import traceback
print(f" Full error: {traceback.format_exc()}") print(f" 完整错误: {traceback.format_exc()}")
return False return False
def main():
    """
    Command-line entry point for the Office document parsing test.

    Parses ``--file`` and ``--check-libreoffice``, verifies that LibreOffice
    is available, and then runs the parsing test on the selected document.
    When ``--file`` is omitted the built-in sample document path is used, so
    running the script without arguments behaves exactly as the hard-coded
    version did.

    Returns:
        int: 0 when the parsing test succeeds (or when only the LibreOffice
        check was requested and it passed); 1 when LibreOffice is missing,
        parsing fails, the user interrupts the run, or an unexpected error
        is raised.
    """
    # Sample document used when no --file is given; being a relative path,
    # it is resolved against the current working directory.
    default_file = r"../Txt/小学数学教学中的若干问题_MATH_1.docx"

    parser = argparse.ArgumentParser(
        description="Test Office document parsing with MinerU"
    )
    parser.add_argument(
        "--file",
        default=default_file,
        help="Path to the Office document to test",
    )
    parser.add_argument(
        "--check-libreoffice",
        action="store_true",
        help="Only check LibreOffice installation",
    )
    args = parser.parse_args()

    # LibreOffice is a hard prerequisite, so bail out early when it is absent.
    print("🔧 检查LibreOffice安装状态...")
    if not check_libreoffice_installation():
        return 1

    # In check-only mode there is nothing left to do once the check passed.
    if args.check_libreoffice:
        print("✅ LibreOffice installation check passed!")
        return 0

    # Top-level boundary: convert any failure into a non-zero exit status
    # instead of letting a traceback escape from the script.
    try:
        success = test_office_document_parsing(args.file)
        return 0 if success else 1
    except KeyboardInterrupt:
        print("\n⏹️ 测试被用户中断")
        return 1
    except Exception as e:
        print(f"\n发生意外错误: {str(e)}")
        return 1

Loading…
Cancel
Save