"""Convert a PPTX deck to Markdown with pptx2md and print its cleaned text.

Environment setup:
    conda activate rag
    pip install pptx2md

When run as a script, the deck at ``source_pptx`` is converted to
``target_md`` (extracted images go to ``img_path``), then every non-empty
line of the Markdown is printed with its 1-based line number after images
and markup noise have been removed.
"""

import os
import re
import subprocess

# Source pptx file
source_pptx = r'D:\dsWork\dsProject\dsRag\Test\楚雄州教育信息化2.0建设项目--智慧教学支撑系统介绍.pptx'
# Target markdown file
target_md = r'C:\logs\output.md'
# Directory where extracted images are saved
img_path = r'C:\logs\img'

# Lines starting with one of these prefixes are image references and dropped.
# NOTE(review): only '![' was legible in the reviewed copy of this script; the
# second prefix was lost and '<img' is assumed here -- confirm.
_IMAGE_PREFIXES = ('![', '<img')

# NOTE(review): the original substitution pattern was lost as well (it had
# degraded to a no-op). Stripping HTML-like tags such as <span ...> / </span>
# is assumed to be the intent -- confirm against the real converter output.
_TAG_RE = re.compile(r'</?[A-Za-z][^>]*>')

# Literal tokens removed from each line, applied in this exact order.
_NOISE_TOKENS = ("__", "---", " ", "\\")


def clean_line(line):
    """Return *line* stripped of markup noise, or ``""`` if nothing remains.

    Image lines (see ``_IMAGE_PREFIXES``) yield an empty string. Otherwise
    HTML-like tags are removed, then each token in ``_NOISE_TOKENS`` is
    removed in order, and finally surrounding whitespace is stripped.
    """
    if line.startswith(_IMAGE_PREFIXES):
        return ""
    line = _TAG_RE.sub('', line)
    for token in _NOISE_TOKENS:
        line = line.replace(token, "")
    return line.strip()


def iter_clean_lines(lines):
    """Yield ``(line_number, text)`` for each line with content after cleaning.

    ``line_number`` is the 1-based position of the line within *lines*, so
    numbers still refer to the unfiltered input.
    """
    for number, raw in enumerate(lines, start=1):
        text = clean_line(raw)
        if text:
            yield number, text


def main():
    """Run pptx2md on the configured deck and print the cleaned Markdown.

    Raises:
        FileNotFoundError: if the ``pptx2md`` executable is not on PATH.
        subprocess.CalledProcessError: if ``pptx2md`` exits with an error.
    """
    # Creating the image directory also creates the folder that holds the
    # Markdown output, since both live under the same parent.
    os.makedirs(img_path, exist_ok=True)

    # An argument list avoids shell quoting problems with unusual characters
    # in the paths; check=True surfaces a failed conversion immediately
    # instead of a confusing error when reading the output below.
    subprocess.run(
        ['pptx2md', source_pptx, '-o', target_md, '-i', img_path],
        check=True,
    )

    # Read the generated markdown line by line
    with open(target_md, 'r', encoding='utf-8') as file:
        for number, text in iter_clean_lines(file):
            print(f"{number}: {text}")


if __name__ == "__main__":
    main()