You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
|
|
|
|
import os
|
|
|
|
|
import glob
|
|
|
|
|
import datetime
|
|
|
|
|
from paddleocr import PPStructureV3
|
|
|
|
|
|
|
|
|
|
# 导入配置
|
|
|
|
|
from Config import compressed_images_dir, ocr_output_dir, markdown_output_dir, time_format
|
|
|
|
|
|
|
|
|
|
# Create the OCR and Markdown output directories if they are missing.
os.makedirs(ocr_output_dir, exist_ok=True)
os.makedirs(markdown_output_dir, exist_ok=True)

# Build the PP-StructureV3 pipeline once; it is reused for every page.
pipeline = PPStructureV3()

# Collect the compressed page images and order them by the integer that
# follows the first "_" in the file name, so pages sort numerically
# rather than lexicographically.
image_files = sorted(
    glob.glob(os.path.join(compressed_images_dir, "page_*_compressed.jpg")),
    key=lambda path: int(os.path.basename(path).split("_")[1]),
)

print(f"找到{len(image_files)}个图片文件需要进行OCR识别")
|
|
|
|
|
|
|
|
|
|
# Run OCR on each image and save the result as Markdown, one output
# directory per page.
for img_path in image_files:
    # The page number is the second "_"-separated token of the file name
    # (file names are expected to look like page_<num>_compressed.jpg).
    page_num = os.path.basename(img_path).split('_')[1]

    # An existing output path means the page is skipped, not re-processed.
    # NOTE(review): only existence is checked, so a partially written
    # directory left by an interrupted run is skipped as well.
    markdown_save_path = os.path.join(markdown_output_dir, f"page_{page_num}")
    if os.path.exists(markdown_save_path):
        print(f"{datetime.datetime.now().strftime(time_format)} 第{page_num}页的OCR结果已存在,跳过处理")
        continue

    # Timestamped progress message.
    print(f"{datetime.datetime.now().strftime(time_format)} 正在处理第{page_num}页的OCR识别")

    # Run the OCR pipeline on this image.
    output = pipeline.predict(img_path)

    # Save every result the pipeline yields via its save_to_markdown method.
    for res in output:
        res.save_to_markdown(save_path=markdown_save_path)

    print(f"第{page_num}页OCR识别完成,结果已保存到: {markdown_save_path}")

print(f"所有图片OCR识别完成,结果保存在: {markdown_output_dir}")
|