parent
a3e3031456
commit
ebbb6e99ad
@ -0,0 +1,9 @@
|
|||||||
|
# Base path configuration: every other path in this file is derived from it.
basePath = r'D:\dsWork\dsProject\dsRag'

# Time format string (year-month-day hour:minute:second).
time_format = '%Y-%m-%d %H:%M:%S'

# OCR result output configuration.
# Both settings currently resolve to the same KeCheng\Txt folder under basePath.
ocr_output_dir = basePath + "\\KeCheng\\Txt"
markdown_output_dir = basePath + "\\KeCheng\\Txt"
|
@ -0,0 +1,34 @@
|
|||||||
|
# conda activate rag
|
||||||
|
# pip install pptx2md
|
||||||
|
|
||||||
|
import os
|
||||||
|
import re
|
||||||
|
|
||||||
|
# Source .pptx file to convert.
source_pptx = r'D:\dsWork\dsProject\dsRag\Pptx\楚雄州教育信息化2.0建设项目--智慧教学支撑系统介绍.pptx'
# Target markdown file (passed to pptx2md via -o).
target_md = r'C:\logs\output.md'
# Image output directory (passed to pptx2md via -i).
img_path = r'C:\logs\img'
# Command to run, kept as an argument list so that paths containing spaces
# or shell metacharacters reach pptx2md intact (no shell string is built).
cmd = ['pptx2md', source_pptx, '-o', target_md, '-i', img_path]

# Matches the colour <span ...> opening tag and the </span> closing tag only,
# so the text between the tags is kept. Compiled once, outside the line loop.
_SPAN_TAG_RE = re.compile(r'<span style="color:#[0-9a-fA-F]{6}">|</span>')

# Substrings stripped from every line, applied in this order.
_NOISE_TOKENS = ("__", "---", " ", "\\")


def clean_line(line):
    """Return the printable text of one markdown line, or '' to skip it.

    Lines that start with ``![`` (markdown images) are dropped entirely.
    For every other line the colour ``<span>`` tags are removed while the
    enclosed text is kept, then the tokens in ``_NOISE_TOKENS`` are deleted
    and surrounding whitespace is trimmed.

    Args:
        line: A single line read from the generated markdown file.

    Returns:
        The cleaned text; an empty string when nothing printable remains.
    """
    if line.startswith('!['):
        return ''
    line = _SPAN_TAG_RE.sub('', line)
    for token in _NOISE_TOKENS:
        line = line.replace(token, '')
    return line.strip()


def main():
    """Convert the pptx to markdown, then print the cleaned, non-empty lines.

    Raises:
        subprocess.CalledProcessError: If pptx2md exits with a non-zero status.
        FileNotFoundError: If pptx2md or the generated markdown file is missing.
    """
    import subprocess

    # check=True stops here on a failed conversion instead of going on to
    # read a missing or stale output file.
    subprocess.run(cmd, check=True)

    with open(target_md, 'r', encoding='utf-8') as file:
        # Iterate the file lazily rather than loading every line up front.
        for line in file:
            text = clean_line(line)
            if text:
                print(text)


if __name__ == '__main__':
    main()
|
Binary file not shown.
Binary file not shown.
@ -0,0 +1,7 @@
|
|||||||
|
https://github.com/PaddlePaddle/PaddleOCR
|
||||||
|
|
||||||
|
【2023.4.29】OCR识别__百度飞桨PaddleOCR测试及环境搭建详解
|
||||||
|
https://www.bilibili.com/video/BV1w14y1Z7bD/?vd_source=13b33731bb79a73783e9f2c0e11857ae
|
||||||
|
|
||||||
|
【安装文档】
|
||||||
|
https://github.com/PaddlePaddle/PaddleOCR/blob/main/docs/quick_start.md
|
@ -1,33 +0,0 @@
|
|||||||
# conda activate rag
|
|
||||||
# pip install pptx2md
|
|
||||||
|
|
||||||
import os
|
|
||||||
import re
|
|
||||||
|
|
||||||
# Source .pptx file to convert.
source_pptx = r'D:\dsWork\dsProject\dsRag\Test\楚雄州教育信息化2.0建设项目--智慧教学支撑系统介绍.pptx'
# Target markdown file (passed to pptx2md via -o).
target_md = r'C:\logs\output.md'
# Image output directory (passed to pptx2md via -i).
img_path = r'C:\logs\img'
# Command to run, kept as an argument list so that paths containing spaces
# or shell metacharacters reach pptx2md intact (no shell string is built).
cmd = ['pptx2md', source_pptx, '-o', target_md, '-i', img_path]

# Line prefixes that mark an image line (markdown or HTML); such lines are skipped.
_IMAGE_PREFIXES = ('![', '<img')

# Matches the colour <span ...> opening tag and the </span> closing tag only,
# so the text between the tags is kept. Compiled once, outside the line loop.
_SPAN_TAG_RE = re.compile(r'<span style="color:#[0-9a-fA-F]{6}">|</span>')

# Substrings stripped from every line, applied in this order.
_NOISE_TOKENS = ("__", "---", " ", "\\")


def clean_line(line):
    """Return the printable text of one markdown line, or '' to skip it.

    Lines that start with ``![`` or ``<img`` are dropped entirely. For every
    other line the colour ``<span>`` tags are removed while the enclosed text
    is kept, then the tokens in ``_NOISE_TOKENS`` are deleted and surrounding
    whitespace is trimmed.

    Args:
        line: A single line read from the generated markdown file.

    Returns:
        The cleaned text; an empty string when nothing printable remains.
    """
    if line.startswith(_IMAGE_PREFIXES):
        return ''
    line = _SPAN_TAG_RE.sub('', line)
    for token in _NOISE_TOKENS:
        line = line.replace(token, '')
    return line.strip()


def main():
    """Convert the pptx to markdown, then print the cleaned, non-empty lines.

    Raises:
        subprocess.CalledProcessError: If pptx2md exits with a non-zero status.
        FileNotFoundError: If pptx2md or the generated markdown file is missing.
    """
    import subprocess

    # check=True stops here on a failed conversion instead of going on to
    # read a missing or stale output file.
    subprocess.run(cmd, check=True)

    with open(target_md, 'r', encoding='utf-8') as file:
        # Iterate the file lazily rather than loading every line up front.
        for line in file:
            text = clean_line(line)
            if text:
                print(text)


if __name__ == '__main__':
    main()
|
|
Loading…
Reference in new issue