# dsProject/dsLightRag/Util/OCR_URL_4_Paddle.py

import logging

# Fetch this module's own logger directly (no repeated configuration needed here).
logger = logging.getLogger(__name__)

# 创建模型
# pipeline = PPStructureV3(use_doc_orientation_classify=False,use_doc_unwarping=False)


# def ocrWithPPStructureV3(image_paths: array):
#     # 对给定图片进行OCR识别
#     for image_path in image_paths:
#         logger.info(f"正在识别图片：{image_path}")
#         md5 = image_path.replace("\\", "/").split("/")[-2]
#         output = pipeline.predict(input=image_path)
#         output_path = f"output/{md5}"
#         # 如果输出目录不存在，则创建
#         if not os.path.exists(output_path):
#             logger.info(f"创建目录：{output_path}")
#             os.makedirs(output_path, True)
#         for res in output:
#             print(res)
#             res.save_to_markdown(save_path=output_path)
#         logger.info(f"成功保存到 {output_path}")

from pathlib import Path
from paddleocr import PPStructureV3

# Single page image processed by this script (absolute path on a Windows drive).
img_path = r'D:\dsWork\dsProject\dsLightRag\Test\split_images\a62dce9d67c818accf94113aabefe172\3.png'

# Keyword options handed to the pipeline constructor; both switches are passed
# as False (presumably turning off orientation classification and unwarping —
# confirm against the PaddleOCR documentation).
_pipeline_options = {
    "use_doc_orientation_classify": False,
    "use_doc_unwarping": False,
}
pipeline = PPStructureV3(**_pipeline_options)

# Run the pipeline on the image, then write every returned result as Markdown
# under the "output" path.
output = pipeline.predict(input=img_path)
for res in output:
    res.save_to_markdown(save_path="output")