Files
dsProject/dsLightRag/Test/P5_Merge.py

93 lines
3.6 KiB
Python
Raw Normal View History

2025-08-14 15:45:08 +08:00
import asyncio
import hashlib
import logging
import os
import tempfile
from Util.OCR_URL_1_Shot import start_shot
from Util.OCR_URL_2_Split import split_image_by_height_and_blank
from Util.OCR_URL_3_YoloCut import yolo_cut
#from Util.OCR_URL_4_Paddle import ocrWithPPStructureV3
# Configure the root logger near the top of the main file (only needs to happen once).
def setup_root_logger(log_dir='Logs'):
    """Attach a file handler and a console handler to the root logger.

    The file handler writes INFO-and-above records to ``<log_dir>/merge.log``
    (UTF-8); the console handler emits DEBUG-and-above records. The root
    logger level is set to DEBUG so the handlers decide what is filtered.

    Calling this function again for the same log file is a no-op with respect
    to handlers, so records are never duplicated by repeated configuration.

    Args:
        log_dir: Directory that holds ``merge.log``; created if missing.
    """
    # Make sure the log directory exists.
    os.makedirs(log_dir, exist_ok=True)
    log_file = os.path.join(log_dir, 'merge.log')

    root_logger = logging.getLogger()
    root_logger.setLevel(logging.DEBUG)

    # FileHandler stores its target as an absolute path in ``baseFilename``;
    # if one already points at our log file, both handlers were installed by
    # an earlier call and adding them again would duplicate every record.
    target = os.path.abspath(log_file)
    already_configured = any(
        isinstance(handler, logging.FileHandler)
        and getattr(handler, 'baseFilename', None) == target
        for handler in root_logger.handlers
    )
    if already_configured:
        return

    formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')

    # File handler: persistent log, INFO and above.
    file_handler = logging.FileHandler(log_file, encoding='utf-8')
    file_handler.setFormatter(formatter)
    file_handler.setLevel(logging.INFO)

    # Console handler: everything from DEBUG upwards.
    console_handler = logging.StreamHandler()
    console_handler.setFormatter(formatter)
    console_handler.setLevel(logging.DEBUG)

    root_logger.addHandler(file_handler)
    root_logger.addHandler(console_handler)
# Initialise the logging configuration (executed when this module is loaded).
setup_root_logger()
# Logger used by this main file.
logger = logging.getLogger(__name__)
def main():
    """Run the screenshot -> split -> YOLO-cut pipeline for one article URL.

    Steps:
      1. Capture the page at ``url`` into a PNG in the system temp directory.
      2. Split that screenshot into sub-images under ``split_images``.
      3. Run ``yolo_cut`` on every split image and collect the produced files
         whose name contains ``_TXT``.

    The temporary screenshot is removed at the end, whether or not the
    pipeline succeeded.

    Returns:
        List of absolute paths of the ``_TXT`` images produced by step 3.
    """
    logger.info("开始执行合并流程")  # use the shared logger

    # 1. Page to capture.
    url = "https://mp.weixin.qq.com/s?__biz=MzI1NjYzNjE1NQ==&mid=2247540913&idx=2&sn=7a061ec4c7dbbcc94b8bf2fa7d93f4c9&chksm=ea21c525dd564c33b578037e893e5190b92841f3db0837191864591bb3da0e10d8af7ce5da10&scene=27"

    # System temp directory.
    temp_dir = tempfile.gettempdir()
    os.makedirs(temp_dir, exist_ok=True)

    # The screenshot file is named after the MD5 of the URL (not a UUID).
    url_md5 = hashlib.md5(url.encode()).hexdigest()
    input_path = os.path.join(temp_dir, f"{url_md5}.png")

    # Single source of truth for the split output directory, anchored next to
    # this script instead of a machine-specific absolute path. The same value
    # is handed to the splitter and used to locate its output afterwards.
    split_root = os.path.join(os.path.dirname(os.path.abspath(__file__)), "split_images")

    todo_txt_images = []
    try:
        # 1. Run the async capture.
        asyncio.run(start_shot(url, input_path))

        # 2. Split the screenshot.
        split_image_by_height_and_blank(
            input_path=input_path,
            output_dir=split_root,
            target_height=500,
        )

        # 3. YOLO detection and cutting.
        # NOTE(review): the original hard-coded path implies the splitter
        # writes into <output_dir>/<md5 of input_path> -- confirm against
        # split_image_by_height_and_blank.
        md5 = hashlib.md5(input_path.encode()).hexdigest()
        yolo_path = os.path.join(split_root, md5)

        # List the directory once; keep only images so the progress counter
        # reflects the files that are actually processed. Sorted because
        # os.listdir returns entries in arbitrary order.
        image_names = sorted(
            name for name in os.listdir(yolo_path)
            if name.endswith(('.jpg', '.png'))
        )
        total = len(image_names)

        for index, image_name in enumerate(image_names, start=1):
            image_path = os.path.join(yolo_path, image_name)
            logger.info(f"开始处理图片: {image_path},第{index}/共{total}个。")

            output_dir = yolo_cut(image_path, md5)
            output_root = os.path.abspath(output_dir)
            for piece_name in os.listdir(output_dir):
                if '_TXT' not in piece_name:
                    continue
                piece_path = os.path.join(output_root, piece_name)
                # yolo_cut may return the same directory for several images;
                # avoid queuing a file twice.
                if piece_path not in todo_txt_images:
                    todo_txt_images.append(piece_path)

            logger.info(f"图片处理完成: {image_path},第{index}/共{total}个。")

        # 4. TODO: OCR of the collected _TXT images (ocrWithPPStructureV3) is
        #    currently disabled, together with its import at the top of the file.
    finally:
        # Remove the temporary screenshot. It may not exist if the capture
        # failed; do not let that mask the original exception.
        try:
            os.remove(input_path)
        except FileNotFoundError:
            pass

    return todo_txt_images


if __name__ == '__main__':
    main()