|
|
|
@ -0,0 +1,45 @@
|
|
|
|
|
# Reference documentation
|
|
|
|
|
# https://blog.csdn.net/make_progress/article/details/144490719
|
|
|
|
|
|
|
|
|
|
# Download the Docling models from the ModelScope community
|
|
|
|
|
# https://modelscope.cn/models/ds4sd/docling-models
|
|
|
|
|
# pip install modelscope
|
|
|
|
|
# modelscope download --model ds4sd/docling-models --local_dir D:\Model\docling
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
from docling.datamodel.base_models import InputFormat
|
|
|
|
|
from docling.datamodel.pipeline_options import PdfPipelineOptions, EasyOcrOptions
|
|
|
|
|
from docling.document_converter import PdfFormatOption, DocumentConverter, ImageFormatOption, PowerpointFormatOption, \
|
|
|
|
|
WordFormatOption, ExcelFormatOption, HTMLFormatOption
|
|
|
|
|
|
|
|
|
|
# API reference
|
|
|
|
|
# https://ds4sd.github.io/docling/v2/#access-document-structures
|
|
|
|
|
|
|
|
|
|
# Configure the OCR model: tell EasyOCR where its model files are stored.
easyocr_model_storage_directory = "D:/Model/ocr/EasyOCR/model"
easyocr_options = EasyOcrOptions(
    model_storage_directory=easyocr_model_storage_directory,
)

# Configure the PDF pipeline: set the path of the Docling model artifacts
# and plug in the EasyOCR options defined above.
pdf_artifacts_path = "D:/Model/docling"
pdf_pipeline_options = PdfPipelineOptions(artifacts_path=pdf_artifacts_path)
pdf_pipeline_options.ocr_options = easyocr_options

# Build the converter. Every supported input format gets its own format
# option instance, and all of them share the same pipeline options object.
# NOTE(review): whether the non-PDF formats (PPTX/DOCX/XLSX/HTML) actually
# make use of PDF pipeline options is not visible here — confirm against the
# Docling documentation.
converter = DocumentConverter(
    format_options={
        input_format: option_cls(pipeline_options=pdf_pipeline_options)
        for input_format, option_cls in (
            (InputFormat.PDF, PdfFormatOption),
            (InputFormat.IMAGE, ImageFormatOption),
            (InputFormat.PPTX, PowerpointFormatOption),
            (InputFormat.DOCX, WordFormatOption),
            (InputFormat.XLSX, ExcelFormatOption),
            (InputFormat.HTML, HTMLFormatOption),
        )
    }
)

# Convert the sample document and print its Markdown export.
source = "E:/test/test.pdf"
result = converter.convert(source)
print(result.document.export_to_markdown())
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|