# Reference article:
#   https://blog.csdn.net/make_progress/article/details/144490719
# Download the Docling models from the ModelScope community:
#   https://modelscope.cn/models/ds4sd/docling-models
#   pip install modelscope
#   modelscope download --model ds4sd/docling-models --local_dir D:\Model\docling
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import PdfPipelineOptions, EasyOcrOptions
from docling.document_converter import (
    PdfFormatOption,
    DocumentConverter,
    ImageFormatOption,
    PowerpointFormatOption,
    WordFormatOption,
    ExcelFormatOption,
    HTMLFormatOption,
)

# API reference:
#   https://ds4sd.github.io/docling/v2/#access-document-structures

# OCR configuration: point EasyOCR at the locally stored model files.
easyocr_model_storage_directory = "D:/Model/ocr/EasyOCR/model"
easyocr_options = EasyOcrOptions()
easyocr_options.model_storage_directory = easyocr_model_storage_directory

# PDF pipeline configuration: point Docling at its locally stored model
# artifacts and attach the OCR options configured above.
pdf_artifacts_path = "D:/Model/docling"
pdf_pipeline_options = PdfPipelineOptions(artifacts_path=pdf_artifacts_path)
pdf_pipeline_options.ocr_options = easyocr_options

# Each supported input format paired with the format-option class that
# configures it. Every option object receives the same pipeline options.
_option_class_by_format = (
    (InputFormat.PDF, PdfFormatOption),
    (InputFormat.IMAGE, ImageFormatOption),
    (InputFormat.PPTX, PowerpointFormatOption),
    (InputFormat.DOCX, WordFormatOption),
    (InputFormat.XLSX, ExcelFormatOption),
    (InputFormat.HTML, HTMLFormatOption),
)

# Build the converter with one format option per input format.
converter = DocumentConverter(
    format_options={
        input_format: option_class(pipeline_options=pdf_pipeline_options)
        for input_format, option_class in _option_class_by_format
    }
)

# Convert the sample document and print its Markdown export.
source = "E:/test/test.pdf"
result = converter.convert(source)
print(result.document.export_to_markdown())