# dsProject/dsLightRag/Test/P5_Merge.py

import asyncio
import hashlib
import logging
import os
import tempfile

from Util.OCR_URL_1_Shot import start_shot
from Util.OCR_URL_2_Split import split_image_by_height_and_blank
from Util.OCR_URL_3_YoloCut import yolo_cut
#from Util.OCR_URL_4_Paddle import ocrWithPPStructureV3


# Configure the root logger at the top of the main file (only needs to happen once).
def setup_root_logger():
    """Attach a file handler and a console handler to the root logger.

    The root logger level is set to DEBUG. Two handlers sharing one format
    are installed:

    * a ``FileHandler`` writing UTF-8 to ``Logs/merge.log`` at INFO level
      (``Logs`` is relative to the current working directory and is created
      when missing);
    * a ``StreamHandler`` on its default stream at DEBUG level.

    The function is idempotent: handlers it installs are tagged, and a later
    call returns without adding them again, so log lines are never duplicated
    and no extra file handle is opened.
    """
    root_logger = logging.getLogger()
    root_logger.setLevel(logging.DEBUG)

    # Handlers added by a previous call carry this marker attribute; adding
    # them a second time would emit every record twice.
    if any(getattr(h, '_merge_root_handler', False) for h in root_logger.handlers):
        return

    # Make sure the log directory exists.
    log_dir = 'Logs'
    os.makedirs(log_dir, exist_ok=True)
    log_file = os.path.join(log_dir, 'merge.log')

    # One formatter shared by both handlers.
    formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')

    # File handler: INFO and above.
    file_handler = logging.FileHandler(log_file, encoding='utf-8')
    file_handler.setLevel(logging.INFO)

    # Console handler: DEBUG and above.
    console_handler = logging.StreamHandler()
    console_handler.setLevel(logging.DEBUG)

    for handler in (file_handler, console_handler):
        handler.setFormatter(formatter)
        handler._merge_root_handler = True  # tag consulted by the guard above
        root_logger.addHandler(handler)


# Initialise the logging configuration (installs the root-logger handlers).
setup_root_logger()
# Module-level logger for this file.
logger = logging.getLogger(__name__)

def main():
    """Run the screenshot -> split -> YOLO-crop pipeline for one article URL.

    Steps:
      1. Capture the page at ``url`` into a PNG in the system temp directory
         (file name is the MD5 of the URL).
      2. Split that screenshot into sub-images under ``split_images``.
      3. Run ``yolo_cut`` on every ``.jpg``/``.png`` sub-image and collect the
         absolute paths of produced files whose name contains ``_TXT``.

    The temporary screenshot is removed at the end, whether or not the
    pipeline succeeded.
    """
    logger.info("开始执行合并流程")  # shared logging configuration
    # 1. Page to capture.
    url = "https://mp.weixin.qq.com/s?__biz=MzI1NjYzNjE1NQ==&mid=2247540913&idx=2&sn=7a061ec4c7dbbcc94b8bf2fa7d93f4c9&chksm=ea21c525dd564c33b578037e893e5190b92841f3db0837191864591bb3da0e10d8af7ce5da10&scene=27"
    # The screenshot is named after the MD5 of the URL and stored in the
    # system temp directory (which already exists, so no makedirs is needed).
    url_md5 = hashlib.md5(url.encode()).hexdigest()
    input_path = os.path.join(tempfile.gettempdir(), f"{url_md5}.png")
    # Single source of truth for where the splitter writes and where step 3 reads.
    split_output_dir = "split_images"
    try:
        # 1. Run the async capture.
        asyncio.run(start_shot(url, input_path))

        # 2. Split the screenshot into sub-images.
        split_image_by_height_and_blank(
            input_path=input_path,
            output_dir=split_output_dir,
            target_height=500
        )

        # 3. YOLO detection and cropping of every sub-image.
        md5 = hashlib.md5(input_path.encode()).hexdigest()  # MD5 of input_path
        # NOTE(review): assumes the splitter writes into
        # <output_dir>/<md5 of input_path>, the same layout the previous
        # hard-coded absolute path relied on -- confirm against the splitter.
        yolo_path = os.path.join(os.path.abspath(split_output_dir), md5)
        # List the directory once and keep only images, so the progress
        # counters below refer to files that are actually processed.
        image_names = sorted(
            name for name in os.listdir(yolo_path)
            if name.endswith(('.jpg', '.png'))
        )
        total = len(image_names)
        todo_txt_images = []
        seen = set()  # O(1) duplicate check for todo_txt_images
        for index, image_name in enumerate(image_names, start=1):
            image_path = os.path.join(yolo_path, image_name)
            logger.info("开始处理图片: %s,第%d/共%d个。", image_path, index, total)
            # yolo_cut returns the directory holding the crops it produced.
            crop_dir = os.path.abspath(yolo_cut(image_path, md5))
            for crop_name in os.listdir(crop_dir):
                if '_TXT' not in crop_name:
                    continue
                crop_path = os.path.join(crop_dir, crop_name)
                if crop_path not in seen:
                    seen.add(crop_path)
                    todo_txt_images.append(crop_path)
            logger.info("图片处理完成: %s，第%d/共%d个。", image_path, index, total)
        # 4. TODO: run OCR on the collected text crops once
        #    Util.OCR_URL_4_Paddle is enabled again:
        # if todo_txt_images:
        #     logger.info(f"开始执行OCR处理，共{len(todo_txt_images)}个文件。")
        #     ocrWithPPStructureV3(todo_txt_images)
    finally:
        # Remove the temporary screenshot. It may not exist when the capture
        # itself failed; ignoring that case keeps the original exception
        # from being replaced by a FileNotFoundError.
        try:
            os.remove(input_path)
        except FileNotFoundError:
            pass


if __name__ == '__main__':
    main()