#!/usr/bin/env python3 """ Text Format Parsing Test Script for RAG-Anything This script demonstrates how to parse various text formats using MinerU, including TXT and MD files. Requirements: - ReportLab library for PDF conversion - RAG-Anything package Usage: python text_format_test.py --file path/to/text/document.md """ import argparse import sys from pathlib import Path from raganything import RAGAnything def check_reportlab_installation(): """Check if ReportLab is installed and available""" try: import reportlab print( f"โœ… ReportLab found: version {reportlab.Version if hasattr(reportlab, 'Version') else 'Unknown'}" ) return True except ImportError: print("โŒ ReportLab not found. Please install ReportLab:") print(" pip install reportlab") return False def test_text_format_parsing(file_path: str): """Test text format parsing with MinerU""" print(f"๐Ÿงช Testing text format parsing: {file_path}") # Check if file exists and is a supported text format file_path = Path(file_path) if not file_path.exists(): print(f"โŒ File does not exist: {file_path}") return False supported_extensions = {".txt", ".md"} if file_path.suffix.lower() not in supported_extensions: print(f"โŒ Unsupported file format: {file_path.suffix}") print(f" Supported formats: {', '.join(supported_extensions)}") return False print(f"๐Ÿ“„ File format: {file_path.suffix.upper()}") print(f"๐Ÿ“ File size: {file_path.stat().st_size / 1024:.1f} KB") # Display text file info try: with open(file_path, "r", encoding="utf-8") as f: content = f.read() print(f"๐Ÿ“ Text length: {len(content)} characters") print(f"๐Ÿ“‹ Line count: {len(content.splitlines())}") except UnicodeDecodeError: print( "โš ๏ธ Text encoding: Non-UTF-8 (will try multiple encodings during processing)" ) # Initialize RAGAnything (only for parsing functionality) rag = RAGAnything(working_dir="./temp_parsing_test") try: # Test text parsing with MinerU print("\n๐Ÿ”„ Testing text parsing with MinerU...") content_list, md_content = rag.parse_document( file_path=str(file_path), output_dir="./test_output", parse_method="auto", display_stats=True, ) print("โœ… Parsing successful!") print(f" ๐Ÿ“Š Content blocks: {len(content_list)}") print(f" ๐Ÿ“ Markdown length: {len(md_content)} characters") # Analyze content types content_types = {} for item in content_list: if isinstance(item, dict): content_type = item.get("type", "unknown") content_types[content_type] = content_types.get(content_type, 0) + 1 if content_types: print(" ๐Ÿ“‹ Content distribution:") for content_type, count in sorted(content_types.items()): print(f" โ€ข {content_type}: {count}") # Display extracted text (if any) if md_content.strip(): print("\n๐Ÿ“„ Extracted text preview (first 500 characters):") preview = md_content.strip()[:500] print(f" {preview}{'...' if len(md_content) > 500 else ''}") else: print("\n๐Ÿ“„ No text extracted from the document") # Display text blocks text_items = [ item for item in content_list if isinstance(item, dict) and item.get("type") == "text" ] if text_items: print("\n๐Ÿ“ Text blocks found:") for i, item in enumerate(text_items[:3], 1): text_content = item.get("text", "") if text_content.strip(): preview = text_content.strip()[:200] print( f" {i}. {preview}{'...' if len(text_content) > 200 else ''}" ) # Check for any tables detected in the text table_items = [ item for item in content_list if isinstance(item, dict) and item.get("type") == "table" ] if table_items: print(f"\n๐Ÿ“Š Found {len(table_items)} table(s) in document:") for i, item in enumerate(table_items, 1): table_body = item.get("table_body", "") row_count = len(table_body.split("\n")) print(f" {i}. Table with {row_count} rows") # Check for images (unlikely in text files but possible in MD) image_items = [ item for item in content_list if isinstance(item, dict) and item.get("type") == "image" ] if image_items: print(f"\n๐Ÿ–ผ๏ธ Found {len(image_items)} image(s):") for i, item in enumerate(image_items, 1): print(f" {i}. Image path: {item.get('img_path', 'N/A')}") print("\n๐ŸŽ‰ Text format parsing test completed successfully!") print("๐Ÿ“ Output files saved to: ./test_output") return True except Exception as e: print(f"\nโŒ Text format parsing failed: {str(e)}") import traceback print(f" Full error: {traceback.format_exc()}") return False def main(): """Main function""" parser = argparse.ArgumentParser(description="Test text format parsing with MinerU") parser.add_argument("--file", required=True, help="Path to the text file to test") parser.add_argument( "--check-reportlab", action="store_true", help="Only check ReportLab installation", ) args = parser.parse_args() # Check ReportLab installation print("๐Ÿ”ง Checking ReportLab installation...") if not check_reportlab_installation(): return 1 if args.check_reportlab: print("โœ… ReportLab installation check passed!") return 0 # Run the parsing test try: success = test_text_format_parsing(args.file) return 0 if success else 1 except KeyboardInterrupt: print("\nโน๏ธ Test interrupted by user") return 1 except Exception as e: print(f"\nโŒ Unexpected error: {str(e)}") return 1 if __name__ == "__main__": sys.exit(main())