'commit'

3 weeks ago · ebbb6e99ad
parent a3e3031456
commit ebbb6e99ad
9 changed files with 96 additions and 33 deletions
--- a/dsRag/Pptx/Config.py
+++ b/dsRag/Pptx/Config.py
@ -0,0 +1,9 @@
+# Root directory of the dsRag project
+basePath = r'D:\dsWork\dsProject\dsRag'
+
+# Timestamp format (strftime pattern)
+time_format = '%Y-%m-%d %H:%M:%S'
+
+# Output directories for OCR results and Markdown (both point at the same folder)
+ocr_output_dir = basePath + r'\KeCheng\Txt'
+markdown_output_dir = basePath + r'\KeCheng\Txt'
--- a/dsRag/Pptx/TestOCR.py
+++ b/dsRag/Pptx/TestOCR.py
@ -0,0 +1,46 @@
+"""OCR every slide image in the Pptx directory and save each page as Markdown."""
+import datetime
+import os
+
+from paddleocr import PPStructureV3
+
+# Install the OCR engine: python -m pip install paddlepaddle paddleocr
+# For GPU inference install the GPU build instead: pip install paddlepaddle-gpu
+from Pptx.Config import ocr_output_dir, markdown_output_dir, time_format
+
+# Make sure the output directories exist.
+os.makedirs(ocr_output_dir, exist_ok=True)
+os.makedirs(markdown_output_dir, exist_ok=True)
+
+# Initialise the OCR pipeline.
+pipeline = PPStructureV3()
+
+# Directory holding the slide images.
+image_dir = r"D:\dsWork\dsProject\dsRag\Pptx"
+# Collect the image files themselves: iterating over the directory *string*
+# would yield single characters, not paths.
+image_files = sorted(
+    os.path.join(image_dir, name) for name in os.listdir(image_dir)
+    if name.lower().endswith(('.png', '.jpg', '.jpeg', '.bmp'))
+)
+
+for img_path in image_files:
+    # Page number = second '_'-separated field of the name without extension.
+    # NOTE(review): assumes names like "<prefix>_<page>.<ext>" — confirm.
+    name_parts = os.path.splitext(os.path.basename(img_path))[0].split('_')
+    if len(name_parts) < 2:
+        continue  # no '_' field in the name, so no page number to extract
+    page_num = name_parts[1]
+
+    # Skip pages whose result path already exists, so a rerun does not redo them.
+    markdown_save_path = os.path.join(markdown_output_dir, f"page_{page_num}")
+    if os.path.exists(markdown_save_path):
+        print(f"{datetime.datetime.now().strftime(time_format)} 第{page_num}页的OCR结果已存在，跳过处理")
+        continue
+
+    print(f"{datetime.datetime.now().strftime(time_format)} 正在处理第{page_num}页的OCR识别")
+    for res in pipeline.predict(img_path):
+        res.save_to_markdown(save_path=markdown_save_path)
+    print(f"第{page_num}页OCR识别完成，结果已保存到: {markdown_save_path}")
+
+print(f"所有图片OCR识别完成，结果保存在: {markdown_output_dir}")
--- a/dsRag/Pptx/TestReadPptx.py
+++ b/dsRag/Pptx/TestReadPptx.py
@ -0,0 +1,34 @@
+"""Convert a .pptx to Markdown with pptx2md, then print its cleaned-up text lines."""
+# conda activate rag
+# pip install pptx2md
+
+import os
+import re
+import subprocess
+
+# Source .pptx file
+source_pptx = r'D:\dsWork\dsProject\dsRag\Pptx\楚雄州教育信息化2.0建设项目--智慧教学支撑系统介绍.pptx'
+# Target Markdown file
+target_md = r'C:\logs\output.md'
+# Path where images are saved
+img_path = r'C:\logs\img'
+# Argument list instead of a concatenated shell string: paths containing
+# spaces stay intact, and check=True stops on a failed conversion instead
+# of going on to read a missing or stale output file.
+cmd = ['pptx2md', source_pptx, '-o', target_md, '-i', img_path]
+subprocess.run(cmd, check=True)
+
+# Colour <span> tags: only the tags are removed, the text between them is kept.
+span_tag = re.compile(r'<span style="color:#[0-9a-fA-F]{6}">|</span>')
+
+with open(target_md, 'r', encoding='utf-8') as file:
+    for line in file:
+        # Drop image lines (those starting with "![").
+        if line.startswith('!['):
+            continue
+        line = span_tag.sub('', line)
+        # Strip "__", "---", spaces and backslashes, in this order.
+        for token in ("__", "---", " ", "\\"):
+            line = line.replace(token, "")
+        if line.strip():
+            print(f"{line.strip()}")
--- a/dsRag/Pptx/init.py
+++ b/dsRag/Pptx/init.py
--- a/dsRag/Pptx/pycache/Config.cpython-310.pyc
+++ b/dsRag/Pptx/pycache/Config.cpython-310.pyc
--- a/dsRag/Pptx/pycache/init.cpython-310.pyc
+++ b/dsRag/Pptx/pycache/init.cpython-310.pyc
--- a/dsRag/Pptx/文档.md
+++ b/dsRag/Pptx/文档.md
@ -0,0 +1,7 @@
+https://github.com/PaddlePaddle/PaddleOCR
+
+【2023.4.29】OCR识别__百度飞桨PaddleOCR测试及环境搭建详解
+https://www.bilibili.com/video/BV1w14y1Z7bD/?vd_source=13b33731bb79a73783e9f2c0e11857ae
+
+【安装文档】
+https://github.com/PaddlePaddle/PaddleOCR/blob/main/docs/quick_start.md
--- a/dsRag/Pptx/楚雄州教育信息化2.0建设项目--智慧教学支撑系统介绍.pptx
+++ b/dsRag/Pptx/楚雄州教育信息化2.0建设项目--智慧教学支撑系统介绍.pptx
--- a/dsRag/Test/TestReadPptx.py
+++ b/dsRag/Test/TestReadPptx.py
@ -1,33 +0,0 @@
-# conda activate rag
-# pip install pptx2md
-
-import os
-import re
-
-# 源pptx文件
-source_pptx = r'D:\dsWork\dsProject\dsRag\Test\楚雄州教育信息化2.0建设项目--智慧教学支撑系统介绍.pptx'
-# 目标md文件
-target_md = r'C:\logs\output.md'
-# 图片保存路径
-img_path = r'C:\logs\img'
-# 运行命令
-cmd = r'pptx2md ' + source_pptx + ' -o ' + target_md + ' -i ' + img_path
-os.system(cmd)
-
-# 读取target_md内容
-with open(target_md, 'r', encoding='utf-8') as file:
-    # 按行读取
-    for i, line in enumerate(file.readlines()):
-        # 过滤掉所有图片，即以![]开头的行
-        if not line.startswith('![') and not line.startswith('<img'):
-            # 通过正则去掉这样的内容，注意不是指整体去除，而是中间的文字需要保留
-            # 1、 <span style="color:#ffff00">
-            # 2、 <span style="color:#000000">
-            # 3、 </span>
-            line = re.sub(r'<span style="color:#[0-9a-fA-F]{6}">|</span>', '', line)
-            line = line.replace("__", "")
-            line = line.replace("---", "")
-            line = line.replace(" ", "")
-            line = line.replace("\\", "")
-            if line.strip():
-                print(f"{line.strip()}")