You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
|
|
|
|
import os
|
|
|
|
|
import datetime
|
|
|
|
|
|
|
|
|
|
from paddleocr import PPStructureV3
|
|
|
|
|
|
|
|
|
|
# Install the OCR engine:
# python -m pip install paddlepaddle paddleocr

# To run on a GPU, install the GPU build instead:
# (1) Uninstall the CPU build
# pip uninstall paddlepaddle
# pip uninstall paddleocr

# (2) Install the GPU build [this did not succeed]
# pip install paddlepaddle-gpu
# pip install paddleocr
|
|
|
|
|
from Pptx.Config import ocr_output_dir, markdown_output_dir, time_format
|
|
|
|
|
|
|
|
|
|
# Make sure both output directories exist before anything is written to them;
# directories that are already present are left untouched.
for output_dir in (ocr_output_dir, markdown_output_dir):
    os.makedirs(output_dir, exist_ok=True)

# Build the PP-StructureV3 pipeline once at start-up.
pipeline = PPStructureV3()

# Directory that holds the images to process.
img_path = r"D:\logs\img"
|
|
|
|
|
|
|
|
|
|
# Work around problems with non-ASCII (Chinese) file names: every entry of
# img_path whose name contains "_" is renamed to the text after its last "_".
for img_file in os.listdir(img_path):
    # Names without "_" are already in their final form. The former test
    # `len(img_file.split("_")) > 0` was always true, so every entry was
    # passed to os.rename, even when nothing had to change.
    if "_" not in img_file:
        continue

    short_name = img_file.rsplit("_", 1)[-1]
    source = os.path.join(img_path, img_file)
    target = os.path.join(img_path, short_name)

    # A name ending in "_" would yield an empty target name, and an existing
    # target would be overwritten (POSIX) or raise FileExistsError (Windows).
    # Skip such entries instead of losing data or aborting the whole run.
    if not short_name or os.path.exists(target):
        print(f"跳过重命名{img_file}: 目标文件名为空或已存在")
        continue

    os.rename(source, target)
|
|
|
|
|
|
|
|
|
|
# Run OCR on every image found directly inside img_path and save each result
# as Markdown in markdown_output_dir.

# Accepted image extensions, compared against the lower-cased file name.
IMAGE_SUFFIXES = (".png", ".jpg", ".jpeg")

for img_file in os.listdir(img_path):
    # Skip anything that is not an image. The comparison is case-insensitive
    # so that names such as "SCAN.PNG" or "photo.JPG" are no longer ignored.
    if not img_file.lower().endswith(IMAGE_SUFFIXES):
        continue

    # Full path handed to the OCR pipeline.
    full_path = os.path.join(img_path, img_file)

    # Progress message with a timestamp formatted by time_format.
    print(f"{datetime.datetime.now().strftime(time_format)} 正在处理{img_file}")

    try:
        # Run the pipeline on the image and write each result as Markdown.
        output = pipeline.predict(full_path)
        for res in output:
            res.save_to_markdown(save_path=markdown_output_dir)
    except Exception as e:
        # Batch boundary: report the failure and carry on with the next
        # image instead of aborting the whole run.
        print(f"处理图片{img_file}时出错: {str(e)}")
|