You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

229 lines
7.4 KiB

3 weeks ago
#!/usr/bin/env python3
"""
Image Format Parsing Test Script for RAG-Anything
This script demonstrates how to parse various image formats
using MinerU, including JPG, PNG, BMP, TIFF, GIF, and WebP files.
Requirements:
- PIL/Pillow library for format conversion
- RAG-Anything package
Usage:
python image_format_test.py --file path/to/image.bmp
"""
import argparse
import sys
from pathlib import Path
from raganything import RAGAnything
def check_pillow_installation():
    """Report whether PIL/Pillow can be imported.

    Prints one status message either way: the detected version when the
    import works, or install instructions when it does not.

    Returns:
        True if ``from PIL import Image`` succeeds, False if it raises
        ImportError.
    """
    try:
        from PIL import Image
    except ImportError:
        print("❌ PIL/Pillow not found. Please install Pillow:")
        print(" pip install Pillow")
        return False

    # Use a placeholder when the imported module exposes no version attribute.
    pil_version = getattr(Image, "__version__", "Unknown")
    print(f"✅ PIL/Pillow found: PIL version {pil_version}")
    return True
def get_image_info(image_path: Path):
    """Collect basic metadata about an image file using Pillow.

    Args:
        image_path: Location of the image to inspect.

    Returns:
        A dict with the keys ``format``, ``mode``, ``size`` and
        ``has_transparency`` when the image opens successfully. If anything
        fails (including Pillow being unavailable), a dict holding only an
        ``error`` key with the exception text is returned instead.
    """
    try:
        from PIL import Image

        with Image.open(image_path) as picture:
            details = {
                "format": picture.format,
                "mode": picture.mode,
                "size": picture.size,
            }
            # Either an alpha-carrying mode or an explicit "transparency"
            # entry in the image info counts as transparent.
            alpha_mode = picture.mode in ("RGBA", "LA")
            details["has_transparency"] = (
                alpha_mode or "transparency" in picture.info
            )
        return details
    except Exception as exc:
        return {"error": str(exc)}
def _truncate(text: str, limit: int) -> str:
    """Return *text* stripped and cut to *limit* characters, adding "..." if cut."""
    stripped = text.strip()
    return f"{stripped[:limit]}{'...' if len(stripped) > limit else ''}"


def _items_of_type(content_list, block_type: str) -> list:
    """Return the dict entries of *content_list* whose "type" is *block_type*."""
    return [
        item
        for item in content_list
        if isinstance(item, dict) and item.get("type") == block_type
    ]


def test_image_format_parsing(file_path: str):
    """Parse one image through RAGAnything/MinerU and print a report.

    Despite the ``test_`` prefix this is a CLI helper, not a pytest test; it
    is explicitly excluded from pytest collection right after its definition.

    The function validates the path and extension, prints file and image
    metadata, runs ``RAGAnything.parse_document`` with the ``ocr`` method and
    then summarises the returned content blocks (type distribution, text
    preview, images, OCR text blocks, tables).

    Args:
        file_path: Path to the image file to parse.

    Returns:
        True if parsing completed; False if the path does not exist, is not a
        regular file, has an unsupported extension, or parsing raised.
    """
    from collections import Counter

    print(f"🧪 Testing image format parsing: {file_path}")

    # Check that the path is an existing regular file of a supported format
    image_path = Path(file_path)
    if not image_path.exists():
        print(f"❌ File does not exist: {image_path}")
        return False
    if not image_path.is_file():
        # A directory would otherwise slip through the existence check.
        print(f"❌ Path is not a file: {image_path}")
        return False

    supported_extensions = {
        ".jpg",
        ".jpeg",
        ".png",
        ".bmp",
        ".tiff",
        ".tif",
        ".gif",
        ".webp",
    }
    suffix = image_path.suffix.lower()
    if suffix not in supported_extensions:
        print(f"❌ Unsupported file format: {image_path.suffix}")
        # Sorted so the message is identical on every run (set order is arbitrary).
        print(f" Supported formats: {', '.join(sorted(supported_extensions))}")
        return False

    print(f"📸 File format: {image_path.suffix.upper()}")
    print(f"📏 File size: {image_path.stat().st_size / 1024:.1f} KB")

    # Get detailed image information; report the failure instead of hiding it
    img_info = get_image_info(image_path)
    if "error" in img_info:
        print(f"⚠️ Could not read image info: {img_info['error']}")
    else:
        print("🖼️ Image info:")
        print(f" • Format: {img_info['format']}")
        print(f" • Mode: {img_info['mode']}")
        print(f" • Size: {img_info['size'][0]}x{img_info['size'][1]}")
        print(f" • Has transparency: {img_info['has_transparency']}")

    # Check format compatibility with MinerU
    mineru_native_formats = {".jpg", ".jpeg", ".png"}
    if suffix in mineru_native_formats:
        print(f"✅ Format {image_path.suffix.upper()} is natively supported by MinerU")
    else:
        print(
            f" Format {image_path.suffix.upper()} will be converted to PNG for MinerU compatibility"
        )

    # Initialize RAGAnything (only for parsing functionality)
    rag = RAGAnything(working_dir="./temp_parsing_test")
    output_dir = "./test_output"

    try:
        # Test image parsing with MinerU
        print("\n🔄 Testing image parsing with MinerU...")
        content_list, md_content = rag.parse_document(
            file_path=str(image_path),
            output_dir=output_dir,
            parse_method="ocr",  # Images use OCR method
            display_stats=True,
        )
        print("✅ Parsing successful!")
        print(f" 📊 Content blocks: {len(content_list)}")
        print(f" 📝 Markdown length: {len(md_content)} characters")

        # Analyze content types (non-dict entries are ignored)
        content_types = Counter(
            item.get("type", "unknown")
            for item in content_list
            if isinstance(item, dict)
        )
        if content_types:
            print(" 📋 Content distribution:")
            for content_type, count in sorted(content_types.items()):
                print(f"{content_type}: {count}")

        # Display extracted text (if any)
        if md_content.strip():
            print("\n📄 Extracted text preview (first 500 characters):")
            print(f" {_truncate(md_content, 500)}")
        else:
            print("\n📄 No text extracted from the image")

        # Display image processing results
        image_items = _items_of_type(content_list, "image")
        if image_items:
            print(f"\n🖼️ Found {len(image_items)} processed image(s):")
            for i, item in enumerate(image_items, 1):
                print(f" {i}. Image path: {item.get('img_path', 'N/A')}")
                captions = item.get("img_caption")
                if captions:
                    # Captions are indexed as a sequence; a plain string is
                    # printed whole rather than cut down to its first character.
                    caption = captions if isinstance(captions, str) else captions[0]
                    print(f" Caption: {caption}")

        # Display text blocks (OCR results)
        text_items = _items_of_type(content_list, "text")
        if text_items:
            print("\n📝 OCR text blocks found:")
            for i, item in enumerate(text_items, 1):
                # "or" guards against an explicit None stored under "text".
                text_content = item.get("text") or ""
                if text_content.strip():
                    print(f" {i}. {_truncate(text_content, 200)}")

        # Check for any tables detected in the image
        table_items = _items_of_type(content_list, "table")
        if table_items:
            print(f"\n📊 Found {len(table_items)} table(s) in image:")
            for i in range(1, len(table_items) + 1):
                print(f" {i}. Table detected with content")

        print("\n🎉 Image format parsing test completed successfully!")
        print(f"📁 Output files saved to: {output_dir}")
        return True
    except Exception as e:
        import traceback

        print(f"\n❌ Image format parsing failed: {e}")
        print(f" Full error: {traceback.format_exc()}")
        return False


# The "test_" prefix would make pytest collect this helper and then fail on
# the unknown "file_path" fixture; opt it out of collection explicitly.
test_image_format_parsing.__test__ = False
def main(argv=None):
    """Command-line entry point for the image format parsing test.

    Args:
        argv: Argument list to parse. ``None`` (the default) makes argparse
            read the process command line.

    Returns:
        Exit code for the process: 0 when the Pillow check (and, if requested,
        the parsing test) succeeds; 1 when Pillow is missing, parsing fails,
        the run is interrupted, or an unexpected error occurs.

    Raises:
        SystemExit: With code 2 (via argparse) when ``--file`` is omitted and
            ``--check-pillow`` is not given.
    """
    parser = argparse.ArgumentParser(
        description="Test image format parsing with MinerU"
    )
    # --file is validated below instead of via required=True so that
    # --check-pillow can be used on its own.
    parser.add_argument(
        "--file",
        help="Path to the image file to test (required unless --check-pillow is given)",
    )
    parser.add_argument(
        "--check-pillow", action="store_true", help="Only check PIL/Pillow installation"
    )
    args = parser.parse_args(argv)

    if not args.check_pillow and args.file is None:
        # Same message and exit status argparse produces for a required option.
        parser.error("the following arguments are required: --file")

    # Check PIL/Pillow installation
    print("🔧 Checking PIL/Pillow installation...")
    if not check_pillow_installation():
        return 1
    if args.check_pillow:
        print("✅ PIL/Pillow installation check passed!")
        return 0

    # Run the parsing test
    try:
        success = test_image_format_parsing(args.file)
        return 0 if success else 1
    except KeyboardInterrupt:
        print("\n⏹️ Test interrupted by user")
        return 1
    except Exception as e:
        print(f"\n❌ Unexpected error: {e}")
        return 1
# Script entry point: run main() and use its return value as the exit status.
if __name__ == "__main__":
    raise SystemExit(main())