main
HuangHai 4 weeks ago
parent dfe89874fd
commit ec3d208b59

@ -0,0 +1,45 @@
# Reference article
# https://blog.csdn.net/make_progress/article/details/144490719
# Download the Docling models from the ModelScope community
# https://modelscope.cn/models/ds4sd/docling-models
# pip install modelscope
# modelscope download --model ds4sd/docling-models --local_dir D:\Model\docling
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import PdfPipelineOptions, EasyOcrOptions
from docling.document_converter import PdfFormatOption, DocumentConverter, ImageFormatOption, PowerpointFormatOption, \
WordFormatOption, ExcelFormatOption, HTMLFormatOption
# API reference
# https://ds4sd.github.io/docling/v2/#access-document-structures
# OCR configuration: point EasyOCR at a local model directory.
easyocr_model_storage_directory = "D:/Model/ocr/EasyOCR/model"
easyocr_options = EasyOcrOptions(
    model_storage_directory=easyocr_model_storage_directory,
)

# PDF pipeline configuration: point Docling at its local model artifacts and
# attach the EasyOCR options configured above.
pdf_artifacts_path = "D:/Model/docling"
pdf_pipeline_options = PdfPipelineOptions(
    artifacts_path=pdf_artifacts_path,
    ocr_options=easyocr_options,
)
# Build the converter. Every supported input format is registered with its own
# format-option class, and all of them share the same pipeline options.
_format_option_classes = {
    InputFormat.PDF: PdfFormatOption,
    InputFormat.IMAGE: ImageFormatOption,
    InputFormat.PPTX: PowerpointFormatOption,
    InputFormat.DOCX: WordFormatOption,
    InputFormat.XLSX: ExcelFormatOption,
    InputFormat.HTML: HTMLFormatOption,
}
converter = DocumentConverter(
    format_options={
        input_format: option_class(pipeline_options=pdf_pipeline_options)
        for input_format, option_class in _format_option_classes.items()
    }
)
# Convert the sample document and print it as Markdown.
source = "E:/test/test.pdf"
result = converter.convert(source)
markdown_text = result.document.export_to_markdown()
print(markdown_text)

@ -0,0 +1,24 @@
"""
conda activate rag
# Install docling
pip install docling
# Install the OCR tool
pip install easyocr
"""
import easyocr
# Directory where the EasyOCR model files are stored.
ocr_model_path = "E:/model/ocr/EasyOCR/model"

# Per the original author's notes: models are downloaded automatically into
# the directory given below; if no directory is specified they are downloaded
# to the C: drive instead. Three model files are listed:
#   text detection (CRAFT)      -- craft_mlt_25k.pth
#   Simplified Chinese (ch_sim) -- zh_sim_g2.pth
#   English (en)                -- latin_g2.pth
# NOTE(review): file names are copied from the original comment; confirm them
# against the installed EasyOCR version.
reader = easyocr.Reader(
    ['ch_sim', 'en'],  # languages to recognise in the image
    model_storage_directory=ocr_model_path,
)

# Run OCR on the sample image and print the raw result.
image_path = "E:/test/test.png"
result = reader.readtext(image_path)
print(result)
Loading…
Cancel
Save