Files
dsProject/dsLightRag/Util/OCR_URL_2_Split.py
2025-08-14 15:45:08 +08:00

89 lines
3.2 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import hashlib
import cv2
import numpy as np
import os
import logging
# Obtain this module's own logger directly (no need to configure it again here)
logger = logging.getLogger(__name__)
def _find_split_segments(blank_rows, target_height, blank_rows_required):
    """Return the half-open ``(start, end)`` row ranges at which to cut an image.

    Args:
        blank_rows: One boolean per image row, true where the row is blank.
        target_height: Number of rows that must accumulate in the current
            segment before blank rows start being counted.
        blank_rows_required: Number of blank rows, counted cumulatively (they
            need not be adjacent), seen after ``target_height`` is reached
            that triggers a cut.

    Returns:
        A list of ``(start, end)`` tuples; ``end`` is exclusive.
    """
    total_rows = len(blank_rows)
    segments = []
    start_y = 0
    rows_in_segment = 0
    blank_seen = 0

    for y in range(total_rows):
        rows_in_segment += 1
        # Blank rows are only counted once the segment is tall enough.
        if rows_in_segment < target_height or not blank_rows[y]:
            continue
        blank_seen += 1
        if blank_seen < blank_rows_required:
            continue
        # Cut just above row y. Row y itself is blank and is dropped: the
        # segment end is exclusive and the next segment starts at y + 1.
        if y > start_y:  # never emit a zero-height segment
            segments.append((start_y, y))
        start_y = y + 1
        rows_in_segment = 0
        blank_seen = 0

    # Whatever is left after the last cut forms the final segment.
    if start_y < total_rows:
        segments.append((start_y, total_rows))
    return segments


def split_image_by_height_and_blank(
    input_path,
    output_dir='split_images',
    target_height=500,  # target split height
    blank_rows_required=10,
    padding_height=10,
):
    """Split a tall image into sub-images, cutting at blank (all-white) rows.

    Rows are accumulated from the top. Once the current segment holds at
    least ``target_height`` rows, every fully white row (all grayscale pixels
    equal to 255) is counted; when ``blank_rows_required`` such rows have been
    seen the segment is cut at that row. Each segment is written as
    ``<output_dir>/<md5 of input_path>/<n>.png`` (n starting at 1) with
    ``padding_height`` white rows prepended at the top.

    Args:
        input_path: Path of the input image.
        output_dir: Base output directory; a sub-directory named after the
            MD5 hex digest of the ``input_path`` string is created inside it.
        target_height: Minimum number of accumulated rows before a cut is
            looked for.
        blank_rows_required: Number of blank rows (cumulative) needed past
            ``target_height`` to trigger a cut. Must be >= 1.
        padding_height: Number of white rows added on top of every
            sub-image. Must be >= 0.

    Returns:
        List of the written sub-image paths, in top-to-bottom order.

    Raises:
        ValueError: If ``blank_rows_required`` or ``padding_height`` is out
            of range.
        FileNotFoundError: If the input image cannot be read.
        OSError: If a sub-image cannot be written.
    """
    if blank_rows_required < 1:
        raise ValueError("blank_rows_required 必须大于等于 1")
    if padding_height < 0:
        raise ValueError("padding_height 不能为负数")

    # The output sub-directory is keyed on the MD5 of the path string
    # (not of the file contents).
    md5 = hashlib.md5(input_path.encode()).hexdigest()
    output_dir = output_dir + f"/{md5}"
    os.makedirs(output_dir, exist_ok=True)

    img = cv2.imread(input_path)
    if img is None:
        raise FileNotFoundError(f"无法读取图片: {input_path}")

    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    height, width = gray.shape
    logger.info(f"原始图片尺寸: {width}x{height}")

    # A row is blank only when every pixel in it is pure white (255).
    # Computed once for all rows instead of slicing row by row in the loop.
    blank_rows = np.all(gray == 255, axis=1)
    segments = _find_split_segments(blank_rows, target_height, blank_rows_required)

    # Write one PNG per segment.
    sub_image_paths = []
    for i, (sy, ey) in enumerate(segments, 1):
        sub_img = img[sy:ey, :]
        if padding_height > 0:
            # White strip with the same width, channel layout and dtype as
            # the sub-image, stacked on top of it.
            blank = np.full(
                (padding_height,) + sub_img.shape[1:], 255, dtype=sub_img.dtype
            )
            sub_img_with_blank = cv2.vconcat([blank, sub_img])
        else:
            sub_img_with_blank = sub_img

        sub_path = os.path.join(output_dir, f"{i}.png")
        # PNG compression level 0 means no compression.
        written = cv2.imwrite(
            sub_path, sub_img_with_blank, [int(cv2.IMWRITE_PNG_COMPRESSION), 0]
        )
        if not written:
            # imwrite signals failure through its return value; do not hand
            # back a path to a file that was never created.
            raise OSError(f"无法写入图片: {sub_path}")
        sub_image_paths.append(sub_path)
        # Logged size includes the padding added above.
        logger.info(f"保存子图片: {sub_path} (尺寸: {sub_img_with_blank.shape[1]}x{sub_img_with_blank.shape[0]})")

    logger.info(f"分割完成,共生成{len(sub_image_paths)}个子图片")
    return sub_image_paths