|
|
|
@ -1,12 +1,8 @@
|
|
|
|
|
import sys
|
|
|
|
|
from pathlib import Path
|
|
|
|
|
|
|
|
|
|
from raganything import RAGAnything
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def test_office_document_parsing(file_path: str):
|
|
|
|
|
def office_document_parsing(file_path: str):
|
|
|
|
|
supported_extensions = {".doc", ".docx", ".ppt", ".pptx", ".xls", ".xlsx"}
|
|
|
|
|
if file_path.suffix.lower() not in supported_extensions:
|
|
|
|
|
print(f"❌ Unsupported file format: {file_path.suffix}")
|
|
|
|
@ -19,7 +15,6 @@ def test_office_document_parsing(file_path: str):
|
|
|
|
|
rag = RAGAnything()
|
|
|
|
|
|
|
|
|
|
try:
|
|
|
|
|
# Test document parsing with MinerU
|
|
|
|
|
print("\n🔄 Testing document parsing with MinerU...")
|
|
|
|
|
content_list, md_content = rag.parse_document(
|
|
|
|
|
file_path=str(file_path),
|
|
|
|
@ -105,7 +100,7 @@ def test_office_document_parsing(file_path: str):
|
|
|
|
|
def main():
    """Run the office-document parsing check on the hard-coded sample file.

    Returns:
        None, so that ``sys.exit(main())`` terminates with status 0 when
        the parsing routine returns normally.
    """
    # Hard-coded sample document; adjust to a file available locally.
    sample_document = r"D:\dsWork\dsProject\dsRagAnything\Txt\小学数学教学中的若干问题_MATH_1.docx"

    # The parsing routine was renamed from ``test_office_document_parsing``
    # to ``office_document_parsing``; only the current name is called so the
    # document is parsed exactly once and no stale name is referenced.
    office_document_parsing(sample_document)
|
|
|
|
|
|
|
|
|
|
# Script entry point: run main() only when executed directly and hand its
# return value to the interpreter as the process exit status.
if __name__ == "__main__":
    exit_status = main()
    sys.exit(exit_status)
|