"""Manual smoke test: parse an Office document with RAGAnything and print a summary."""

import sys
import traceback
from pathlib import Path

from raganything import RAGAnything


def _items_of_type(content_list, kind):
    """Return the dict entries of *content_list* whose ``"type"`` equals *kind*."""
    return [
        item
        for item in content_list
        if isinstance(item, dict) and item.get("type") == kind
    ]


def test_office_document_parsing(file_path: str) -> bool:
    """Parse one Office document via ``RAGAnything.parse_document`` and report on it.

    Prints the file format and size, then the number of content blocks, the
    markdown length, the distribution of block types, a short content preview,
    and any image / table blocks found.

    Args:
        file_path: Path to the document, as a string or ``pathlib.Path``.

    Returns:
        True if parsing completed, False if the file is missing, has an
        unsupported extension, or parsing raised an exception.
    """
    # Callers pass a plain string; normalise so .suffix / .stat() are available.
    file_path = Path(file_path)

    supported_extensions = {".doc", ".docx", ".ppt", ".pptx", ".xls", ".xlsx"}
    if file_path.suffix.lower() not in supported_extensions:
        print(f"āŒ Unsupported file format: {file_path.suffix}")
        # Sorted so the message is deterministic (set order is arbitrary).
        print(f" Supported formats: {', '.join(sorted(supported_extensions))}")
        return False

    # Without this check, stat() below would raise outside the try block.
    if not file_path.is_file():
        print(f"Error: file does not exist: {file_path}")
        return False

    print(f"šŸ“„ File format: {file_path.suffix.upper()}")
    print(f"šŸ“ File size: {file_path.stat().st_size / 1024:.1f} KB")

    rag = RAGAnything()

    try:
        # Test document parsing with MinerU
        print("\nšŸ”„ Testing document parsing with MinerU...")
        content_list, md_content = rag.parse_document(
            file_path=str(file_path),
            output_dir="./test_output",
            parse_method="auto",
            display_stats=True,
        )

        print("āœ… Parsing successful!")
        print(f" šŸ“Š Content blocks: {len(content_list)}")
        print(f" šŸ“ Markdown length: {len(md_content)} characters")

        # Count how many blocks of each type were produced.
        content_types = {}
        for item in content_list:
            if isinstance(item, dict):
                content_type = item.get("type", "unknown")
                content_types[content_type] = content_types.get(content_type, 0) + 1

        if content_types:
            print(" šŸ“‹ Content distribution:")
            for content_type, count in sorted(content_types.items()):
                print(f" • {content_type}: {count}")

        # Preview of the markdown; the ellipsis is shown only when the
        # stripped text was actually truncated.
        stripped_md = md_content.strip()
        if stripped_md:
            print("\nšŸ“„ Parsed content preview (first 500 characters):")
            preview = stripped_md[:500]
            print(f" {preview}{'...' if len(stripped_md) > 500 else ''}")

        # Show up to three text blocks as examples.
        text_items = _items_of_type(content_list, "text")
        if text_items:
            print("\nšŸ“ Sample text blocks:")
            for i, item in enumerate(text_items[:3], 1):
                stripped_text = (item.get("text") or "").strip()
                if stripped_text:
                    preview = stripped_text[:200]
                    print(
                        f" {i}. {preview}{'...' if len(stripped_text) > 200 else ''}"
                    )

        # Check for images
        image_items = _items_of_type(content_list, "image")
        if image_items:
            print(f"\nšŸ–¼ļø Found {len(image_items)} image(s):")
            for i, item in enumerate(image_items, 1):
                print(f" {i}. Image path: {item.get('img_path', 'N/A')}")

        # Check for tables
        table_items = _items_of_type(content_list, "table")
        if table_items:
            print(f"\nšŸ“Š Found {len(table_items)} table(s):")
            for i, item in enumerate(table_items, 1):
                # A missing/None/empty body counts as zero rows instead of
                # crashing on None or reporting one phantom row.
                table_body = item.get("table_body") or ""
                row_count = len(table_body.split("\n")) if table_body else 0
                print(f" {i}. Table with {row_count} rows")

        print("\nšŸŽ‰ Office document parsing test completed successfully!")
        print("šŸ“ Output files saved to: ./test_output")
        return True

    except Exception as e:
        # Top-level boundary of a manual test script: report and signal failure.
        print(f"\nāŒ Office document parsing failed: {e}")
        print(f" Full error: {traceback.format_exc()}")
        return False


def main() -> int:
    """Run the parsing test and return a process exit code (0 = success, 1 = failure).

    The document path is taken from the first command-line argument when one
    is given; otherwise the built-in sample path is used.
    """
    default_path = r"D:\dsWork\dsProject\dsRagAnything\Txt\å°å­¦ę•°å­¦ę•™å­¦äø­ēš„č‹„å¹²é—®é¢˜_MATH_1.docx"
    target = sys.argv[1] if len(sys.argv) > 1 else default_path
    # Propagate the result so sys.exit() reflects a failed parse.
    return 0 if test_office_document_parsing(target) else 1


if __name__ == "__main__":
    sys.exit(main())