93 lines
3.6 KiB
Python
93 lines
3.6 KiB
Python
|
import asyncio
|
|||
|
import hashlib
|
|||
|
import logging
|
|||
|
import os
|
|||
|
import tempfile
|
|||
|
|
|||
|
from Util.OCR_URL_1_Shot import start_shot
|
|||
|
from Util.OCR_URL_2_Split import split_image_by_height_and_blank
|
|||
|
from Util.OCR_URL_3_YoloCut import yolo_cut
|
|||
|
#from Util.OCR_URL_4_Paddle import ocrWithPPStructureV3
|
|||
|
|
|||
|
|
|||
|
# Configure the root logger at the top of the main file (only needs to be done once).
|
|||
|
def setup_root_logger():
    """Attach a file handler and a console handler to the root logger.

    Records are written to ``Logs/merge.log`` (relative to the current
    working directory; the directory is created if it is missing) at INFO
    level and above, and to a ``logging.StreamHandler`` at DEBUG level and
    above. The root logger level itself is set to DEBUG.

    The call is idempotent: when the root logger already carries a
    ``FileHandler`` for the same log file, no handlers are added, so calling
    this function again does not duplicate every log line or open the log
    file a second time.
    """
    # Make sure the log directory exists.
    log_dir = 'Logs'
    os.makedirs(log_dir, exist_ok=True)
    log_file = os.path.join(log_dir, 'merge.log')

    root_logger = logging.getLogger()
    root_logger.setLevel(logging.DEBUG)

    # FileHandler keeps the absolute path of its file in ``baseFilename``;
    # use it to detect that this function has already configured the logger.
    log_file_abs = os.path.abspath(log_file)
    for handler in root_logger.handlers:
        if (isinstance(handler, logging.FileHandler)
                and handler.baseFilename == log_file_abs):
            return

    # One formatter shared by both handlers.
    formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')

    # File handler: INFO and above.
    file_handler = logging.FileHandler(log_file, encoding='utf-8')
    file_handler.setFormatter(formatter)
    file_handler.setLevel(logging.INFO)

    # Console handler: DEBUG and above.
    console_handler = logging.StreamHandler()
    console_handler.setFormatter(formatter)
    console_handler.setLevel(logging.DEBUG)

    root_logger.addHandler(file_handler)
    root_logger.addHandler(console_handler)
|
|||
|
|
|||
|
|
|||
|
# Initialise the logging configuration when this file is loaded.
setup_root_logger()
# Logger for this main file.
logger = logging.getLogger(__name__)

if __name__ == '__main__':
    logger.info("开始执行合并流程")  # use the shared logger

    # Step 1 input: the web page to capture.
    url = "https://mp.weixin.qq.com/s?__biz=MzI1NjYzNjE1NQ==&mid=2247540913&idx=2&sn=7a061ec4c7dbbcc94b8bf2fa7d93f4c9&chksm=ea21c525dd564c33b578037e893e5190b92841f3db0837191864591bb3da0e10d8af7ce5da10&scene=27"

    # The capture is stored in the system temp directory. gettempdir()
    # already returns an existing directory the user can create files in,
    # so no makedirs call is needed.
    temp_dir = tempfile.gettempdir()
    # The file name is the MD5 hex digest of the URL (not a UUID), so the
    # same URL always maps to the same file name.
    url_md5 = hashlib.md5(url.encode()).hexdigest()
    shot_filename = f"{url_md5}.png"
    # Full path of the captured image.
    input_path = os.path.join(temp_dir, shot_filename)

    try:
        # Step 1: run the async capture function.
        asyncio.run(start_shot(url, input_path))

        # Step 2: run the split function (its return value is not used here).
        split_image_by_height_and_blank(
            input_path=input_path,
            output_dir="split_images",
            target_height=500,
        )

        # Step 3: YOLO detection and cutting of every split image.
        # The sub-directory name is the MD5 hex digest of input_path.
        md5 = hashlib.md5(input_path.encode()).hexdigest()
        # NOTE(review): machine-specific absolute path; presumably this is
        # where step 2 writes its output - confirm against the split helper.
        yolo_path = rf'D:\dsWork\dsProject\dsLightRag\Test\split_images\{md5}'

        # List the directory once so the total and the loop see the same entries.
        entries = os.listdir(yolo_path)
        total = len(entries)
        todoTxtImgList = []
        # cnt counts every directory entry (images or not), matching total.
        for cnt, filename in enumerate(entries, start=1):
            if not filename.endswith(('.jpg', '.png')):
                continue

            image_path = os.path.join(yolo_path, filename)
            logger.info(f"开始处理图片: {image_path},第{cnt}/共{total}个。")
            output_dir = yolo_cut(image_path, md5)

            # Collect each '_TXT' file in the cut output directory once.
            output_dir_abs = os.path.abspath(output_dir)
            for cut_name in os.listdir(output_dir):
                cut_path = f"{output_dir_abs}/{cut_name}"
                if '_TXT' in cut_name and cut_path not in todoTxtImgList:
                    todoTxtImgList.append(cut_path)
            logger.info(f"图片处理完成: {image_path},第{cnt}/共{total}个。")

        # Step 4 (currently disabled): run OCR on every collected '_TXT' file.
        # if len(todoTxtImgList) > 0:
        #     logger.info(f"开始执行OCR处理,共{len(todoTxtImgList)}个文件。")
        #     ocrWithPPStructureV3(todoTxtImgList)
    finally:
        # Delete the captured input image. If the capture failed before the
        # file was written there is nothing to delete, and raising here would
        # mask the original exception.
        try:
            os.remove(input_path)
        except FileNotFoundError:
            pass
|