Files
dsProject/dsLightRag/Test/P5_Merge.py
2025-08-14 15:45:08 +08:00

93 lines
3.6 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import asyncio
import hashlib
import logging
import os
import tempfile
from Util.OCR_URL_1_Shot import start_shot
from Util.OCR_URL_2_Split import split_image_by_height_and_blank
from Util.OCR_URL_3_YoloCut import yolo_cut
#from Util.OCR_URL_4_Paddle import ocrWithPPStructureV3
# Root logging is configured in the main script; it only needs to happen once.
def setup_root_logger():
    """Configure the root logger with a file handler and a console handler.

    Creates the ``Logs`` directory (relative to the current working
    directory) if it does not exist, then attaches to the root logger:

    * a UTF-8 ``FileHandler`` writing to ``Logs/merge.log`` at INFO level;
    * a ``StreamHandler`` (console) at DEBUG level.

    Both handlers share one timestamped format, and the root logger level is
    set to DEBUG.

    The function is idempotent: the handlers are tagged with fixed names and
    are only attached when no handler with that name is already installed, so
    calling it more than once does not duplicate log output.
    """
    # Names used to recognise handlers installed by a previous call.
    file_handler_name = 'merge.file'
    console_handler_name = 'merge.console'

    # Make sure the log directory exists before opening the log file.
    log_dir = 'Logs'
    os.makedirs(log_dir, exist_ok=True)
    log_file = os.path.join(log_dir, 'merge.log')

    formatter = logging.Formatter(
        '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
    )

    root_logger = logging.getLogger()
    root_logger.setLevel(logging.DEBUG)
    installed = {handler.get_name() for handler in root_logger.handlers}

    # File handler: INFO and above go to Logs/merge.log.
    if file_handler_name not in installed:
        file_handler = logging.FileHandler(log_file, encoding='utf-8')
        file_handler.set_name(file_handler_name)
        file_handler.setFormatter(formatter)
        file_handler.setLevel(logging.INFO)
        root_logger.addHandler(file_handler)

    # Console handler: everything from DEBUG upwards.
    if console_handler_name not in installed:
        console_handler = logging.StreamHandler()
        console_handler.set_name(console_handler_name)
        console_handler.setFormatter(formatter)
        console_handler.setLevel(logging.DEBUG)
        root_logger.addHandler(console_handler)
# Initialise the logging configuration (runs at import time).
setup_root_logger()
# Module-level logger for this script.
logger = logging.getLogger(__name__)
def main():
    """Run the merge pipeline: screenshot a page, split it, and YOLO-cut it.

    Steps:

    1. Capture the article at a fixed URL into a PNG in the system temp
       directory via ``start_shot``.
    2. Split the screenshot with ``split_image_by_height_and_blank`` into the
       ``split_images`` directory.
    3. Run ``yolo_cut`` on every ``.jpg``/``.png`` slice and collect the
       absolute paths of the resulting files whose name contains ``_TXT``.

    The temporary screenshot is always removed at the end, whether or not the
    pipeline succeeded.
    """
    logger.info("开始执行合并流程")

    # Step 1 input: the page to capture.
    url = "https://mp.weixin.qq.com/s?__biz=MzI1NjYzNjE1NQ==&mid=2247540913&idx=2&sn=7a061ec4c7dbbcc94b8bf2fa7d93f4c9&chksm=ea21c525dd564c33b578037e893e5190b92841f3db0837191864591bb3da0e10d8af7ce5da10&scene=27"

    # The screenshot lives in the system temp directory and is named after
    # the MD5 of the URL, so the same URL always maps to the same file.
    url_md5 = hashlib.md5(url.encode()).hexdigest()
    input_path = os.path.join(tempfile.gettempdir(), f"{url_md5}.png")

    # Directory handed to the splitter; also used below to locate its output,
    # so both always refer to the same place.
    split_output_dir = "split_images"

    try:
        # Step 1: run the async capture.
        asyncio.run(start_shot(url, input_path))

        # Step 2: split the tall screenshot into slices.
        split_image_by_height_and_blank(
            input_path=input_path,
            output_dir=split_output_dir,
            target_height=500,
        )

        # Step 3: YOLO detection / cutting on every slice.
        # NOTE(review): presumably the splitter writes its slices into
        # <output_dir>/<md5 of input_path>; confirm against
        # split_image_by_height_and_blank.
        md5 = hashlib.md5(input_path.encode()).hexdigest()
        yolo_path = os.path.join(os.path.abspath(split_output_dir), md5)

        slice_names = os.listdir(yolo_path)
        total = len(slice_names)
        todo_txt_images = []
        # The counter covers every directory entry, not just images, so the
        # "index/total" shown in the log refers to directory entries.
        for index, slice_name in enumerate(slice_names, start=1):
            if not slice_name.endswith(('.jpg', '.png')):
                continue
            image_path = os.path.join(yolo_path, slice_name)
            logger.info(f"开始处理图片: {image_path},第{index}/共{total}个。")
            output_dir = yolo_cut(image_path, md5)
            output_root = os.path.abspath(output_dir)
            for cut_name in os.listdir(output_dir):
                cut_path = os.path.join(output_root, cut_name)
                # Collect each text crop once, preserving discovery order.
                if '_TXT' in cut_name and cut_path not in todo_txt_images:
                    todo_txt_images.append(cut_path)
            logger.info(f"图片处理完成: {image_path},第{index}/共{total}个。")

        # TODO: step 4 - OCR the collected *_TXT crops by calling
        # ocrWithPPStructureV3(todo_txt_images) once the
        # Util.OCR_URL_4_Paddle import is re-enabled.
    finally:
        # Remove the temporary screenshot. It may not exist if the capture
        # itself failed; in that case do not mask the original exception.
        try:
            os.remove(input_path)
        except FileNotFoundError:
            logger.debug("Screenshot %s was never created; nothing to remove.", input_path)


if __name__ == '__main__':
    main()