# File: dsProject/dsLightRag/Util/OCR_URL_2_Split.py
# (Repository viewer metadata: 89 lines, 3.2 KiB, Python; revision timestamp 2025-08-14 15:45:08 +08:00)
import hashlib
import cv2
import numpy as np
import os
import logging
# Get the module-specific logger directly (no need to configure logging again here).
logger = logging.getLogger(__name__)
def _find_split_segments(gray, target_height, min_blank_lines):
    """Return ``(start, end)`` row ranges (end exclusive) that partition *gray*.

    A cut is made once the current segment is at least *target_height* rows
    tall and *min_blank_lines* fully white rows (every pixel == 255) have been
    seen since that height was reached.  The blank rows are counted
    cumulatively; they do not have to be adjacent to each other.
    """
    height = gray.shape[0]
    # One vectorized pass marks every row whose pixels are all pure white.
    is_blank = np.all(gray == 255, axis=1)

    segments = []
    start_y = 0
    blank_seen = 0
    for y in range(height):
        # Blank rows only count once the current segment is tall enough.
        if y - start_y + 1 < target_height or not is_blank[y]:
            continue
        blank_seen += 1
        if blank_seen >= min_blank_lines:
            # End is exclusive, so use y + 1 to keep row y in this segment
            # instead of silently dropping it between two segments.
            segments.append((start_y, y + 1))
            start_y = y + 1
            blank_seen = 0

    # Whatever is left after the last cut becomes the final segment.
    if start_y < height:
        segments.append((start_y, height))
    return segments


def split_image_by_height_and_blank(
    input_path: str,
    output_dir: str = 'split_images',
    target_height: int = 500,  # target accumulated height before looking for a cut
    *,
    min_blank_lines: int = 10,
    top_padding: int = 10,
) -> list:
    """Split a tall image into sub-images at blank (all-white) rows.

    Rows are accumulated until the current segment reaches ``target_height``;
    from then on fully white rows are counted, and the image is cut at the
    ``min_blank_lines``-th such row.  Each sub-image gets ``top_padding`` white
    rows prepended and is written as ``<n>.png`` (numbered from 1, PNG
    compression level 0) into ``output_dir/<md5 of the input_path string>``.

    Args:
        input_path: Path of the image to split.
        output_dir: Base output directory; a sub-directory named after the
            MD5 hex digest of ``input_path`` is created inside it.
        target_height: Minimum number of rows in a segment before a cut is
            considered.
        min_blank_lines: Number of all-white rows (counted cumulatively, not
            necessarily consecutive) required after reaching ``target_height``
            before cutting.
        top_padding: Number of white rows added on top of every sub-image;
            values <= 0 disable the padding.

    Returns:
        List of paths of the written sub-images, in top-to-bottom order.

    Raises:
        FileNotFoundError: If the image cannot be read.
        OSError: If a sub-image cannot be written.
    """
    # Read first so that an unreadable input does not leave an empty
    # output directory behind.
    img = cv2.imread(input_path)
    if img is None:
        raise FileNotFoundError(f"无法读取图片: {input_path}")

    # The output sub-directory is keyed by the MD5 of the path string.
    md5 = hashlib.md5(input_path.encode()).hexdigest()
    output_dir = os.path.join(output_dir, md5)
    os.makedirs(output_dir, exist_ok=True)

    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    height, width = gray.shape
    logger.info("原始图片尺寸: %sx%s", width, height)

    segments = _find_split_segments(gray, target_height, min_blank_lines)

    sub_image_paths = []
    for i, (sy, ey) in enumerate(segments, 1):
        sub_img = img[sy:ey, :]
        if top_padding > 0:
            # White BGR strip with the same width as the sub-image.
            blank = np.full((top_padding, sub_img.shape[1], 3), 255, dtype=np.uint8)
            sub_img = cv2.vconcat([blank, sub_img])

        sub_path = os.path.join(output_dir, f"{i}.png")
        # PNG compression level 0 (no compression).
        written = cv2.imwrite(sub_path, sub_img, [int(cv2.IMWRITE_PNG_COMPRESSION), 0])
        if not written:
            # Do not hand back a path to a file that was never created.
            raise OSError(f"无法写入图片: {sub_path}")
        sub_image_paths.append(sub_path)
        # Logged size reflects the image as written, padding included.
        logger.info("保存子图片: %s (尺寸: %sx%s)", sub_path, sub_img.shape[1], sub_img.shape[0])

    logger.info("分割完成，共生成%s个子图片", len(sub_image_paths))
    return sub_image_paths