'commit'

3 weeks ago · ebbb6e99ad
parent a3e3031456
commit ebbb6e99ad
9 changed files with 96 additions and 33 deletions
--- a/dsRag/Pptx/Config.py
+++ b/dsRag/Pptx/Config.py
@ -0,0 +1,9 @@
 # Base path of the project; the output locations below are derived from it.
 basePath = r'D:\dsWork\dsProject\dsRag'
 # Time format string.
 time_format = '%Y-%m-%d %H:%M:%S'
 # Output locations for OCR results (both settings resolve to the same folder).
 ocr_output_dir = "\\".join((basePath, "KeCheng", "Txt"))
 markdown_output_dir = "\\".join((basePath, "KeCheng", "Txt"))
--- a/dsRag/Pptx/TestOCR.py
+++ b/dsRag/Pptx/TestOCR.py
@ -0,0 +1,46 @@
 import datetime
 import os
 from paddleocr import PPStructureV3
 # 安装OCR引擎
 # python -m pip install paddlepaddle paddleocr
 # 如果使用GPU，请安装GPU版本
 # pip install paddlepaddle-gpu
 from Pptx.Config import ocr_output_dir, markdown_output_dir, time_format
 # Make sure the output directories exist before any result is written.
 os.makedirs(ocr_output_dir, exist_ok=True)
 os.makedirs(markdown_output_dir, exist_ok=True)
 # Initialise the OCR engine.
 pipeline = PPStructureV3()
 # Directory that holds the page images.
 image_dir = r"D:\dsWork\dsProject\dsRag\Pptx"
 # Collect the image files found in that directory. Looping over the directory
 # string itself would yield single characters instead of file paths.
 image_extensions = ('.png', '.jpg', '.jpeg', '.bmp')
 image_files = sorted(
    os.path.join(image_dir, name)
    for name in os.listdir(image_dir)
    if name.lower().endswith(image_extensions)
 )
 # Process every image.
 for img_path in image_files:
    # Take the page number from the file name. The extension is dropped first
    # so it does not end up inside the page number.
    # NOTE(review): assumes names look like "<prefix>_<page>.<ext>" - confirm.
    name_parts = os.path.splitext(os.path.basename(img_path))[0].split('_')
    if len(name_parts) < 2:
        # No "_<page>" part in the name: skip the file rather than crash.
        print(f"{datetime.datetime.now().strftime(time_format)} 文件名中没有页码，跳过处理: {img_path}")
        continue
    page_num = name_parts[1]
    # Skip pages whose result directory already exists.
    markdown_save_path = os.path.join(markdown_output_dir, f"page_{page_num}")
    if os.path.exists(markdown_save_path):
        print(f"{datetime.datetime.now().strftime(time_format)} 第{page_num}页的OCR结果已存在，跳过处理")
        continue
    # Report progress.
    print(f"{datetime.datetime.now().strftime(time_format)} 正在处理第{page_num}页的OCR识别")
    # Run OCR on the image.
    output = pipeline.predict(img_path)
    # Save every result with its save_to_markdown method.
    for res in output:
        res.save_to_markdown(save_path=markdown_save_path)
    print(f"第{page_num}页OCR识别完成，结果已保存到: {markdown_save_path}")
 print(f"所有图片OCR识别完成，结果保存在: {markdown_output_dir}")
--- a/dsRag/Pptx/TestReadPptx.py
+++ b/dsRag/Pptx/TestReadPptx.py
@ -0,0 +1,34 @@
 # conda activate rag
 # pip install pptx2md
 import os
 import re
 # subprocess is imported here so this script section stays self-contained.
 import subprocess
 # Source pptx file.
 source_pptx = r'D:\dsWork\dsProject\dsRag\Pptx\楚雄州教育信息化2.0建设项目--智慧教学支撑系统介绍.pptx'
 # Target markdown file.
 target_md = r'C:\logs\output.md'
 # Directory where extracted images are saved.
 img_path = r'C:\logs\img'
 # Build the command as an argument list so each path reaches pptx2md
 # verbatim, without going through shell parsing or needing quotes.
 cmd = ['pptx2md', source_pptx, '-o', target_md, '-i', img_path]
 # check=True stops the script when the conversion fails, instead of going on
 # to read a stale or missing output file.
 subprocess.run(cmd, check=True)
 # Matches the coloured <span style="color:#rrggbb"> opening tags and the
 # </span> closing tags; only the tags are removed, the text between stays.
 span_tag = re.compile(r'<span style="color:#[0-9a-fA-F]{6}">|</span>')
 # Read the generated markdown.
 with open(target_md, 'r', encoding='utf-8') as file:
    # Go through the file line by line.
    for line in file:
        # Drop image lines, i.e. lines starting with "![".
        if line.startswith('!['):
            continue
        line = span_tag.sub('', line)
        # Strip markdown decoration, spaces and backslashes.
        line = line.replace("__", "")
        line = line.replace("---", "")
        line = line.replace(" ", "")
        line = line.replace("\\", "")
        # Print only lines that still have content.
        if line.strip():
            print(f"{line.strip()}")
--- a/dsRag/Pptx/init.py
+++ b/dsRag/Pptx/init.py
--- a/dsRag/Pptx/pycache/Config.cpython-310.pyc
+++ b/dsRag/Pptx/pycache/Config.cpython-310.pyc
--- a/dsRag/Pptx/pycache/init.cpython-310.pyc
+++ b/dsRag/Pptx/pycache/init.cpython-310.pyc
--- a/dsRag/Pptx/文档.md
+++ b/dsRag/Pptx/文档.md
@ -0,0 +1,7 @@
 https://github.com/PaddlePaddle/PaddleOCR
 【2023.4.29】OCR识别__百度飞桨PaddleOCR测试及环境搭建详解
 https://www.bilibili.com/video/BV1w14y1Z7bD/?vd_source=13b33731bb79a73783e9f2c0e11857ae
 【安装文档】
 https://github.com/PaddlePaddle/PaddleOCR/blob/main/docs/quick_start.md
--- a/dsRag/Pptx/楚雄州教育信息化2.0建设项目--智慧教学支撑系统介绍.pptx
+++ b/dsRag/Pptx/楚雄州教育信息化2.0建设项目--智慧教学支撑系统介绍.pptx
--- a/dsRag/Test/TestReadPptx.py
+++ b/dsRag/Test/TestReadPptx.py
@ -1,33 +0,0 @@
 # conda activate rag
 # pip install pptx2md
 import os
 import re
 # Source pptx file.
 source_pptx = r'D:\dsWork\dsProject\dsRag\Test\楚雄州教育信息化2.0建设项目--智慧教学支撑系统介绍.pptx'
 # Target markdown file.
 target_md = r'C:\logs\output.md'
 # Directory where extracted images are saved.
 img_path = r'C:\logs\img'
 # Run the pptx2md command through the shell.
 cmd = f'pptx2md {source_pptx} -o {target_md} -i {img_path}'
 os.system(cmd)
 # Matches the coloured <span style="color:#rrggbb"> opening tags and the
 # </span> closing tags; only the tags are removed, the text between stays.
 span_tag = re.compile(r'<span style="color:#[0-9a-fA-F]{6}">|</span>')
 # Read the generated markdown.
 with open(target_md, 'r', encoding='utf-8') as file:
    # Go through the file line by line.
    for line in file:
        # Drop image lines: markdown "![" images and html "<img" tags.
        if line.startswith(('![', '<img')):
            continue
        line = span_tag.sub('', line)
        # Strip markdown decoration, spaces and backslashes, in this order.
        for unwanted in ("__", "---", " ", "\\"):
            line = line.replace(unwanted, "")
        # Print only lines that still have content.
        text = line.strip()
        if text:
            print(f"{text}")