'commit'

4 weeks ago · bc89a6cd6f
parent 1a8280a328
commit bc89a6cd6f
5 changed files with 33 additions and 1 deletions
--- a/dsRag/Test/T9_TestReadPDF.py
+++ b/dsRag/Test/T9_TestReadPDF.py
@@ -2,7 +2,6 @@
 conda activate rag
 pip install PyPDF2

-https://www.jianshu.com/p/d893d3dfd65a
 https://www.jianshu.com/p/d893d3dfd65a
 """
 from Util.PdfUtil import read_pdf_file
--- a/dsRag/Test/T9_TestReadPptx.py
+++ b/dsRag/Test/T9_TestReadPptx.py
@@ -0,0 +1,33 @@
+"""
+pip install python-pptx
+"""
+from pptx import Presentation
+import os
+
+
+def extract_text_from_pptx(file_path):
+    """从pptx文件中提取所有文本内容"""
+    prs = Presentation(file_path)
+    text_content = []
+
+    # 遍历所有幻灯片
+    for slide in prs.slides:
+        # 遍历幻灯片中的所有形状
+        for shape in slide.shapes:
+            if hasattr(shape, "text"):
+                text = shape.text.strip()
+                if text:  # 只添加非空文本
+                    text_content.append(text)
+
+    return '\n'.join(text_content)
+
+
+if __name__ == "__main__":
+    # 示例用法
+    pptx_file = "../Txt/东师理想智慧教学管理应用介绍.pptx"  # 替换为实际文件路径
+    if os.path.exists(pptx_file):
+        text = extract_text_from_pptx(pptx_file)
+        print("提取的文本内容：")
+        print(text)
+    else:
+        print(f"文件 {pptx_file} 不存在")
--- a/dsRag/Txt/东师理想智慧教学管理应用介绍.pptx
+++ b/dsRag/Txt/东师理想智慧教学管理应用介绍.pptx
--- a/dsRag/Util/__pycache__/PdfUtil.cpython-310.pyc
+++ b/dsRag/Util/__pycache__/PdfUtil.cpython-310.pyc
--- a/dsRag/Util/__pycache__/__init__.cpython-310.pyc
+++ b/dsRag/Util/__pycache__/__init__.cpython-310.pyc