You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

106 lines
3.8 KiB

4 weeks ago
import sys
from collections import Counter
from pathlib import Path

from raganything import RAGAnything
4 weeks ago
3 weeks ago
def office_document_parsing(file_path: str):
4 weeks ago
supported_extensions = {".doc", ".docx", ".ppt", ".pptx", ".xls", ".xlsx"}
if file_path.suffix.lower() not in supported_extensions:
4 weeks ago
print(f"❌ Unsupported file format: {file_path.suffix}")
print(f" Supported formats: {', '.join(supported_extensions)}")
4 weeks ago
return False
4 weeks ago
print(f"📄 File format: {file_path.suffix.upper()}")
print(f"📏 File size: {file_path.stat().st_size / 1024:.1f} KB")
4 weeks ago
4 weeks ago
rag = RAGAnything()
4 weeks ago
try:
4 weeks ago
print("\n🔄 Testing document parsing with MinerU...")
4 weeks ago
content_list, md_content = rag.parse_document(
file_path=str(file_path),
4 weeks ago
output_dir="./test_output",
4 weeks ago
parse_method="auto",
display_stats=True,
)
4 weeks ago
print("✅ Parsing successful!")
print(f" 📊 Content blocks: {len(content_list)}")
print(f" 📝 Markdown length: {len(md_content)} characters")
# Analyze content types
4 weeks ago
content_types = {}
for item in content_list:
if isinstance(item, dict):
content_type = item.get("type", "unknown")
content_types[content_type] = content_types.get(content_type, 0) + 1
if content_types:
4 weeks ago
print(" 📋 Content distribution:")
4 weeks ago
for content_type, count in sorted(content_types.items()):
print(f"{content_type}: {count}")
4 weeks ago
# Display some parsed content preview
4 weeks ago
if md_content.strip():
4 weeks ago
print("\n📄 Parsed content preview (first 500 characters):")
4 weeks ago
preview = md_content.strip()[:500]
print(f" {preview}{'...' if len(md_content) > 500 else ''}")
4 weeks ago
# Display some structured content examples
4 weeks ago
text_items = [
item
for item in content_list
if isinstance(item, dict) and item.get("type") == "text"
]
if text_items:
4 weeks ago
print("\n📝 Sample text blocks:")
4 weeks ago
for i, item in enumerate(text_items[:3], 1):
text_content = item.get("text", "")
if text_content.strip():
preview = text_content.strip()[:200]
print(
f" {i}. {preview}{'...' if len(text_content) > 200 else ''}"
)
4 weeks ago
# Check for images
4 weeks ago
image_items = [
item
for item in content_list
if isinstance(item, dict) and item.get("type") == "image"
]
if image_items:
4 weeks ago
print(f"\n🖼️ Found {len(image_items)} image(s):")
4 weeks ago
for i, item in enumerate(image_items, 1):
4 weeks ago
print(f" {i}. Image path: {item.get('img_path', 'N/A')}")
4 weeks ago
4 weeks ago
# Check for tables
4 weeks ago
table_items = [
item
for item in content_list
if isinstance(item, dict) and item.get("type") == "table"
]
if table_items:
4 weeks ago
print(f"\n📊 Found {len(table_items)} table(s):")
4 weeks ago
for i, item in enumerate(table_items, 1):
table_body = item.get("table_body", "")
row_count = len(table_body.split("\n"))
4 weeks ago
print(f" {i}. Table with {row_count} rows")
4 weeks ago
4 weeks ago
print("\n🎉 Office document parsing test completed successfully!")
print("📁 Output files saved to: ./test_output")
4 weeks ago
return True
except Exception as e:
4 weeks ago
print(f"\n❌ Office document parsing failed: {str(e)}")
4 weeks ago
import traceback
4 weeks ago
print(f" Full error: {traceback.format_exc()}")
4 weeks ago
return False
def main(argv=None):
    """Run the Office parsing check and return a process exit code.

    Args:
        argv: Optional list of command-line arguments (without the program
            name). When ``None``, ``sys.argv[1:]`` is used. The first
            argument, if present, is the document to parse; otherwise the
            built-in sample path is used.

    Returns:
        ``0`` when ``office_document_parsing`` reports success, ``1``
        otherwise, so that ``sys.exit(main())`` reflects failures.
    """
    default_file = r"D:\dsWork\dsProject\dsRagAnything\Txt\小学数学教学中的若干问题_MATH_1.docx"
    args = sys.argv[1:] if argv is None else list(argv)
    # Pass a Path: the parser reads `.suffix` and `.stat()` from its argument.
    target = Path(args[0]) if args else Path(default_file)
    return 0 if office_document_parsing(target) else 1
4 weeks ago
# Script entry point: run main() and hand its result to the interpreter as
# the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())