# File: dsRag/Test/TestDocling.py
#
# Reference article:
# https://blog.csdn.net/make_progress/article/details/144490719
#
# Downloading the Docling models from the ModelScope community:
# https://modelscope.cn/models/ds4sd/docling-models
# pip install modelscope
# modelscope download --model ds4sd/docling-models --local_dir D:\Model\docling

from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import EasyOcrOptions, PdfPipelineOptions
from docling.document_converter import (
    DocumentConverter,
    ExcelFormatOption,
    HTMLFormatOption,
    ImageFormatOption,
    PdfFormatOption,
    PowerpointFormatOption,
    WordFormatOption,
)

# API reference:
# https://ds4sd.github.io/docling/v2/#access-document-structures

# OCR configuration: point EasyOCR at the locally stored model files.
easyocr_model_storage_directory = "D:/Model/ocr/EasyOCR/model"
easyocr_options = EasyOcrOptions(
    model_storage_directory=easyocr_model_storage_directory,
)

# PDF pipeline configuration: point Docling at its local model artifacts
# and attach the OCR options defined above.
pdf_artifacts_path = "D:/Model/docling"
pdf_pipeline_options = PdfPipelineOptions(artifacts_path=pdf_artifacts_path)
pdf_pipeline_options.ocr_options = easyocr_options

# Every supported input format gets its own option object, all of them
# sharing the same pipeline options. Order matches the original mapping.
_format_option_classes = (
    (InputFormat.PDF, PdfFormatOption),
    (InputFormat.IMAGE, ImageFormatOption),
    (InputFormat.PPTX, PowerpointFormatOption),
    (InputFormat.DOCX, WordFormatOption),
    (InputFormat.XLSX, ExcelFormatOption),
    (InputFormat.HTML, HTMLFormatOption),
)

# The converter itself.
converter = DocumentConverter(
    format_options={
        input_format: option_cls(pipeline_options=pdf_pipeline_options)
        for input_format, option_cls in _format_option_classes
    }
)

# Convert a sample PDF and print it as Markdown.
source = "E:/test/test.pdf"
result = converter.convert(source)
markdown_text = result.document.export_to_markdown()
print(markdown_text)
# File: dsRag/Test/TestEasyOCR.py
"""
conda activate rag

# Install docling
pip install docling

# Install the OCR tool
pip install easyocr
"""

import easyocr

ocr_model_path = "E:/model/ocr/EasyOCR/model"

# Languages whose recognition models are loaded for reading the image.
_languages = ["ch_sim", "en"]

# Note: models are downloaded automatically into the given directory. The
# directory may be omitted, in which case they are downloaded to the C drive.
# Three models are involved:
#   text detection model: CRAFT   -- craft_mlt_25k.pth
#   Simplified Chinese:   ch_sim  -- zh_sim_g2.pth
#   English:              en      -- latin_g2.pth
reader = easyocr.Reader(
    lang_list=_languages,
    model_storage_directory=ocr_model_path,
)

# Run OCR on a sample image and print the raw result.
_image_path = "E:/test/test.png"
result = reader.readtext(_image_path)
print(result)