'commit'

3 weeks ago · 695838b85f
parent ef951ba8b8
commit 695838b85f
3 changed files with 33 additions and 0 deletions
--- a/dsRag/Test/TestReadPptx.py
+++ b/dsRag/Test/TestReadPptx.py
@@ -0,0 +1,33 @@
+# conda activate rag
+# pip install pptx2md
+
+import os
+import re
+
+# 源pptx文件
+source_pptx = r'D:\dsWork\dsProject\dsRag\Test\楚雄州教育信息化2.0建设项目--智慧教学支撑系统介绍.pptx'
+# 目标md文件
+target_md = r'C:\logs\output.md'
+# 图片保存路径
+img_path = r'C:\logs\img'
+# 运行命令
+cmd = r'pptx2md ' + source_pptx + ' -o ' + target_md + ' -i ' + img_path
+os.system(cmd)
+
+# 读取target_md内容
+with open(target_md, 'r', encoding='utf-8') as file:
+    # 按行读取
+    for i, line in enumerate(file.readlines()):
+        # 过滤掉所有图片，即以![]开头的行
+        if not line.startswith('![') and not line.startswith('<img'):
+            # 通过正则去掉这样的内容，注意不是指整体去除，而是中间的文字需要保留
+            # 1、 <span style="color:#ffff00">
+            # 2、 <span style="color:#000000">
+            # 3、 </span>
+            line = re.sub(r'<span style="color:#[0-9a-fA-F]{6}">|</span>', '', line)
+            line = line.replace("__", "")
+            line = line.replace("---", "")
+            line = line.replace(" ", "")
+            line = line.replace("\\", "")
+            if line.strip():
+                print(f"{i + 1}: {line.strip()}")
--- a/dsRag/Test/楚雄州教育信息化2.0建设项目--智慧教学支撑系统介绍.pptx
+++ b/dsRag/Test/楚雄州教育信息化2.0建设项目--智慧教学支撑系统介绍.pptx
--- a/dsRag/requirements.txt
+++ b/dsRag/requirements.txt