Files
dsProject/dsLightRag/Util/OCR_URL_4_Paddle.py
2025-08-14 15:45:08 +08:00

40 lines
1.3 KiB
Python

import logging
# Obtain this module's own logger directly (no need to repeat logging configuration here).
logger = logging.getLogger(__name__)
# Create the model (the commented-out code below is an earlier batch version of this script)
# pipeline = PPStructureV3(use_doc_orientation_classify=False,use_doc_unwarping=False)
# def ocrWithPPStructureV3(image_paths: array):
# # Run OCR on each of the given images
# for image_path in image_paths:
# logger.info(f"正在识别图片:{image_path}")
# md5 = image_path.replace("\\", "/").split("/")[-2]
# output = pipeline.predict(input=image_path)
# output_path = f"output/{md5}"
# # Create the output directory if it does not exist yet
# if not os.path.exists(output_path):
# logger.info(f"创建目录:{output_path}")
# os.makedirs(output_path, True)
# for res in output:
# print(res)
# res.save_to_markdown(save_path=output_path)
# logger.info(f"成功保存到 {output_path}")
from pathlib import Path

from paddleocr import PPStructureV3

# Single input image for this run. NOTE(review): this is a machine-specific
# absolute Windows path, so the script only works where that file exists.
img_path = r'D:\dsWork\dsProject\dsLightRag\Test\split_images\a62dce9d67c818accf94113aabefe172\3.png'

# Build the PP-StructureV3 pipeline with both optional document
# pre-processing flags (orientation classification, unwarping) set to False.
pipeline = PPStructureV3(
    use_doc_orientation_classify=False,
    use_doc_unwarping=False,
)

# Run prediction on the image, then export every returned result as Markdown
# using save_path="output".
output = pipeline.predict(input=img_path)
for res in output:
    res.save_to_markdown(save_path="output")