You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

46 lines
1.8 KiB

1 month ago
# Reference documentation
# https://blog.csdn.net/make_progress/article/details/144490719
# Download the Docling models from the ModelScope community
# https://modelscope.cn/models/ds4sd/docling-models
# pip install modelscope
# modelscope download --model ds4sd/docling-models --local_dir D:\Model\docling
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import PdfPipelineOptions, EasyOcrOptions
from docling.document_converter import PdfFormatOption, DocumentConverter, ImageFormatOption, PowerpointFormatOption, \
WordFormatOption, ExcelFormatOption, HTMLFormatOption
# API reference:
# https://ds4sd.github.io/docling/v2/#access-document-structures

# Local model locations: EasyOCR weights and the Docling model artifacts.
easyocr_model_storage_directory = "D:/Model/ocr/EasyOCR/model"
pdf_artifacts_path = "D:/Model/docling"

# OCR configuration: make EasyOCR use the local model directory.
easyocr_options = EasyOcrOptions(
    model_storage_directory=easyocr_model_storage_directory,
)

# PDF pipeline configuration: use the local Docling artifacts and the
# EasyOCR options defined above.
pdf_pipeline_options = PdfPipelineOptions(artifacts_path=pdf_artifacts_path)
pdf_pipeline_options.ocr_options = easyocr_options

# Every supported input format is registered with the same pipeline options
# object; only the format-option class differs per format.
format_option_classes = {
    InputFormat.PDF: PdfFormatOption,
    InputFormat.IMAGE: ImageFormatOption,
    InputFormat.PPTX: PowerpointFormatOption,
    InputFormat.DOCX: WordFormatOption,
    InputFormat.XLSX: ExcelFormatOption,
    InputFormat.HTML: HTMLFormatOption,
}

# Build the converter.
converter = DocumentConverter(
    format_options={
        input_format: option_class(pipeline_options=pdf_pipeline_options)
        for input_format, option_class in format_option_classes.items()
    },
)

# Convert the sample document and print it as Markdown.
source = "E:/test/test.pdf"
result = converter.convert(source)
markdown_text = result.document.export_to_markdown()
print(markdown_text)