diff --git a/dsRag/Pptx/Config.py b/dsRag/Pptx/Config.py new file mode 100644 index 00000000..6ab1660a --- /dev/null +++ b/dsRag/Pptx/Config.py @@ -0,0 +1,9 @@ +# 基础路径配置 +basePath = r'D:\dsWork\dsProject\dsRag' + +# 时间格式 +time_format = '%Y-%m-%d %H:%M:%S' + +# OCR结果输出配置 +ocr_output_dir = f"{basePath}\\KeCheng\\Txt" +markdown_output_dir = f"{basePath}\\KeCheng\\Txt" \ No newline at end of file diff --git a/dsRag/Pptx/TestOCR.py b/dsRag/Pptx/TestOCR.py new file mode 100644 index 00000000..610d31e3 --- /dev/null +++ b/dsRag/Pptx/TestOCR.py @@ -0,0 +1,46 @@ +import datetime +import os + +from paddleocr import PPStructureV3 + +# 安装OCR引擎 +# python -m pip install paddlepaddle paddleocr + +# 如果使用GPU,请安装GPU版本 +# pip install paddlepaddle-gpu +from Pptx.Config import ocr_output_dir, markdown_output_dir, time_format + +# 确保输出目录存在 +os.makedirs(ocr_output_dir, exist_ok=True) +os.makedirs(markdown_output_dir, exist_ok=True) + +# 初始化OCR引擎 +pipeline = PPStructureV3() + +# 图片位置 +image_files=r"D:\dsWork\dsProject\dsRag\Pptx" + +# 处理每个图片 +for i, img_path in enumerate(image_files): + # 从文件名中提取页码 + page_num = os.path.basename(img_path).split('_')[1] + + # 检查目标目录是否已存在 + markdown_save_path = os.path.join(markdown_output_dir, f"page_{page_num}") + if os.path.exists(markdown_save_path): + print(f"{datetime.datetime.now().strftime(time_format)} 第{page_num}页的OCR结果已存在,跳过处理") + continue + + # 输出处理信息 + print(f"{datetime.datetime.now().strftime(time_format)} 正在处理第{page_num}页的OCR识别") + + # 执行OCR识别 + output = pipeline.predict(img_path) + + # 使用save_to_markdown方法保存结果 + for res in output: + res.save_to_markdown(save_path=markdown_save_path) + + print(f"第{page_num}页OCR识别完成,结果已保存到: {markdown_save_path}") + +print(f"所有图片OCR识别完成,结果保存在: {markdown_output_dir}") \ No newline at end of file diff --git a/dsRag/Pptx/TestReadPptx.py b/dsRag/Pptx/TestReadPptx.py new file mode 100644 index 00000000..4f588c1a --- /dev/null +++ b/dsRag/Pptx/TestReadPptx.py @@ -0,0 +1,34 @@ +# conda activate rag +# pip install pptx2md + +import os +import re + +# 源pptx文件 +source_pptx = r'D:\dsWork\dsProject\dsRag\Pptx\楚雄州教育信息化2.0建设项目--智慧教学支撑系统介绍.pptx' +# 目标md文件 +target_md = r'C:\logs\output.md' +# 图片保存路径 +img_path = r'C:\logs\img' +# 运行命令 +cmd = r'pptx2md ' + source_pptx + ' -o ' + target_md + ' -i ' + img_path +os.system(cmd) + +# 读取target_md内容 +with open(target_md, 'r', encoding='utf-8') as file: + # 按行读取 + for i, line in enumerate(file.readlines()): + # 过滤掉所有图片,即以![]开头的行 + if line.startswith('![') : + continue + # 通过正则去掉这样的内容,注意不是指整体去除,而是中间的文字需要保留 + # 1、 + # 2、 + # 3、 + line = re.sub(r'|', '', line) + line = line.replace("__", "") + line = line.replace("---", "") + line = line.replace(" ", "") + line = line.replace("\\", "") + if line.strip(): + print(f"{line.strip()}") diff --git a/dsRag/Pptx/__init__.py b/dsRag/Pptx/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/dsRag/Pptx/__pycache__/Config.cpython-310.pyc b/dsRag/Pptx/__pycache__/Config.cpython-310.pyc new file mode 100644 index 00000000..dcca2fe9 Binary files /dev/null and b/dsRag/Pptx/__pycache__/Config.cpython-310.pyc differ diff --git a/dsRag/Pptx/__pycache__/__init__.cpython-310.pyc b/dsRag/Pptx/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 00000000..ecaa418c Binary files /dev/null and b/dsRag/Pptx/__pycache__/__init__.cpython-310.pyc differ diff --git a/dsRag/Pptx/文档.md b/dsRag/Pptx/文档.md new file mode 100644 index 00000000..ba3331a2 --- /dev/null +++ b/dsRag/Pptx/文档.md @@ -0,0 +1,7 @@ +https://github.com/PaddlePaddle/PaddleOCR + +【2023.4.29】OCR识别__百度飞桨PaddleOCR测试及环境搭建详解 +https://www.bilibili.com/video/BV1w14y1Z7bD/?vd_source=13b33731bb79a73783e9f2c0e11857ae + +【安装文档】 +https://github.com/PaddlePaddle/PaddleOCR/blob/main/docs/quick_start.md \ No newline at end of file diff --git a/dsRag/Test/楚雄州教育信息化2.0建设项目--智慧教学支撑系统介绍.pptx b/dsRag/Pptx/楚雄州教育信息化2.0建设项目--智慧教学支撑系统介绍.pptx similarity index 100% rename from dsRag/Test/楚雄州教育信息化2.0建设项目--智慧教学支撑系统介绍.pptx rename to dsRag/Pptx/楚雄州教育信息化2.0建设项目--智慧教学支撑系统介绍.pptx diff --git a/dsRag/Test/TestReadPptx.py b/dsRag/Test/TestReadPptx.py deleted file mode 100644 index b5b2f8a1..00000000 --- a/dsRag/Test/TestReadPptx.py +++ /dev/null @@ -1,33 +0,0 @@ -# conda activate rag -# pip install pptx2md - -import os -import re - -# 源pptx文件 -source_pptx = r'D:\dsWork\dsProject\dsRag\Test\楚雄州教育信息化2.0建设项目--智慧教学支撑系统介绍.pptx' -# 目标md文件 -target_md = r'C:\logs\output.md' -# 图片保存路径 -img_path = r'C:\logs\img' -# 运行命令 -cmd = r'pptx2md ' + source_pptx + ' -o ' + target_md + ' -i ' + img_path -os.system(cmd) - -# 读取target_md内容 -with open(target_md, 'r', encoding='utf-8') as file: - # 按行读取 - for i, line in enumerate(file.readlines()): - # 过滤掉所有图片,即以![]开头的行 - if not line.startswith('![') and not line.startswith(' - # 2、 - # 3、 - line = re.sub(r'|', '', line) - line = line.replace("__", "") - line = line.replace("---", "") - line = line.replace(" ", "") - line = line.replace("\\", "") - if line.strip(): - print(f"{line.strip()}")