# dsProject/dsLightRag/Util/OCR_URL_2_Split.py

import hashlib

import cv2
import numpy as np
import os
import logging

# Module-level logger named after this module. No handlers or levels are set
# here (assumes the application configures logging elsewhere — TODO confirm).
logger = logging.getLogger(__name__)


def _find_split_segments(blank_rows, target_height, blank_line_threshold):
    """Return the ``(start, end)`` row ranges (end exclusive) to cut the image into."""
    height = int(blank_rows.shape[0])
    segments = []
    start_y = 0
    blank_line_count = 0

    # Only blank rows can advance the counter or trigger a cut, so visiting
    # just those rows is equivalent to scanning every row of the image.
    for y in np.flatnonzero(blank_rows).tolist():
        # Blank rows are ignored until the running segment is tall enough.
        if y - start_y + 1 < target_height:
            continue
        # The count is cumulative within a segment: it is NOT reset when a
        # non-blank row appears between two blank rows.
        blank_line_count += 1
        if blank_line_count >= blank_line_threshold:
            # Row ``y`` itself is blank and is dropped from both segments.
            if y > start_y:  # never emit a zero-height segment
                segments.append((start_y, y))
            start_y = y + 1
            blank_line_count = 0

    # Whatever remains after the last cut becomes the final segment.
    if start_y < height:
        segments.append((start_y, height))
    return segments


def split_image_by_height_and_blank(
        input_path,
        output_dir='split_images',
        target_height=500,  # target height of each slice before looking for a cut
        *,
        blank_line_threshold=10,
        top_padding=10,
):
    """Split a tall image into slices, cutting only on fully white rows.

    Rows are accumulated from the top. Once the current slice is at least
    ``target_height`` rows tall, fully white rows (every grayscale pixel equal
    to 255) are counted; when ``blank_line_threshold`` such rows have been
    seen, the image is cut at that row and a new slice starts on the next row.
    The blank row at which a cut is made is not part of either slice.

    Each slice gets ``top_padding`` white rows prepended and is written as
    ``<output_dir>/<md5 of input_path>/<index>.png`` (index starting at 1)
    with PNG compression level 0.

    Args:
        input_path: Path of the image to split.
        output_dir: Base output directory; a sub-directory named after the
            MD5 hex digest of ``input_path`` is created inside it.
        target_height: Minimum number of rows a slice must reach before blank
            rows start being counted.
        blank_line_threshold: Number of blank rows (cumulative, not
            necessarily consecutive) that triggers a cut.
        top_padding: Number of white rows added on top of every slice;
            ``0`` or less disables the padding.

    Returns:
        list[str]: Paths of the written slice images, in top-to-bottom order.

    Raises:
        FileNotFoundError: If the image cannot be read.
        OSError: If a slice cannot be written.
    """
    # Read first so that no output directory is left behind for a bad input.
    img = cv2.imread(input_path)
    if img is None:
        raise FileNotFoundError(f"无法读取图片: {input_path}")

    # The MD5 of the path string (not of the file content) names the output folder.
    md5 = hashlib.md5(input_path.encode()).hexdigest()
    output_dir = os.path.join(output_dir, md5)
    os.makedirs(output_dir, exist_ok=True)

    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    height, width = gray.shape
    logger.info(f"原始图片尺寸: {width}x{height}")

    # A row is blank only if every pixel in it is pure white (255).
    blank_rows = np.all(gray == 255, axis=1)
    segments = _find_split_segments(blank_rows, target_height, blank_line_threshold)

    # White BGR strip prepended to every slice; built once, reused for all.
    padding = None
    if top_padding > 0:
        padding = np.full((top_padding, width, 3), 255, dtype=np.uint8)

    sub_image_paths = []
    for i, (sy, ey) in enumerate(segments, 1):
        sub_img = img[sy:ey, :]
        if padding is not None:
            sub_img_with_blank = cv2.vconcat([padding, sub_img])
        else:
            sub_img_with_blank = sub_img

        sub_path = os.path.join(output_dir, f"{i}.png")
        # PNG compression level 0 = no compression.
        written = cv2.imwrite(sub_path, sub_img_with_blank, [int(cv2.IMWRITE_PNG_COMPRESSION), 0])
        if not written:
            # imwrite reports some failures via its return value only; do not
            # hand back a path to a file that was never created.
            raise OSError(f"无法写入图片: {sub_path}")
        sub_image_paths.append(sub_path)
        # Logged size includes the padding rows added above.
        logger.info(f"保存子图片: {sub_path} (尺寸: {sub_img_with_blank.shape[1]}x{sub_img_with_blank.shape[0]})")

    # Summary is logged once, after all slices have been written.
    logger.info(f"分割完成，共生成{len(sub_image_paths)}个子图片")
    return sub_image_paths