diff --git a/dsRag/Test/T9_TestReadPDF.py b/dsRag/Test/T9_TestReadPDF.py index fec57f58..9f08f4e3 100644 --- a/dsRag/Test/T9_TestReadPDF.py +++ b/dsRag/Test/T9_TestReadPDF.py @@ -2,7 +2,6 @@ conda activate rag pip install PyPDF2 -https://www.jianshu.com/p/d893d3dfd65a https://www.jianshu.com/p/d893d3dfd65a """ from Util.PdfUtil import read_pdf_file diff --git a/dsRag/Test/T9_TestReadPptx.py b/dsRag/Test/T9_TestReadPptx.py new file mode 100644 index 00000000..1595223b --- /dev/null +++ b/dsRag/Test/T9_TestReadPptx.py @@ -0,0 +1,33 @@ +""" +pip install python-pptx +""" +from pptx import Presentation +import os + + +def extract_text_from_pptx(file_path): + """从pptx文件中提取所有文本内容""" + prs = Presentation(file_path) + text_content = [] + + # 遍历所有幻灯片 + for slide in prs.slides: + # 遍历幻灯片中的所有形状 + for shape in slide.shapes: + if hasattr(shape, "text"): + text = shape.text.strip() + if text: # 只添加非空文本 + text_content.append(text) + + return '\n'.join(text_content) + + +if __name__ == "__main__": + # 示例用法 + pptx_file = "../Txt/东师理想智慧教学管理应用介绍.pptx" # 替换为实际文件路径 + if os.path.exists(pptx_file): + text = extract_text_from_pptx(pptx_file) + print("提取的文本内容:") + print(text) + else: + print(f"文件 {pptx_file} 不存在") \ No newline at end of file diff --git a/dsRag/Txt/东师理想智慧教学管理应用介绍.pptx b/dsRag/Txt/东师理想智慧教学管理应用介绍.pptx new file mode 100644 index 00000000..325bff0a Binary files /dev/null and b/dsRag/Txt/东师理想智慧教学管理应用介绍.pptx differ diff --git a/dsRag/Util/__pycache__/PdfUtil.cpython-310.pyc b/dsRag/Util/__pycache__/PdfUtil.cpython-310.pyc index 772eb236..d539d234 100644 Binary files a/dsRag/Util/__pycache__/PdfUtil.cpython-310.pyc and b/dsRag/Util/__pycache__/PdfUtil.cpython-310.pyc differ diff --git a/dsRag/Util/__pycache__/__init__.cpython-310.pyc b/dsRag/Util/__pycache__/__init__.cpython-310.pyc index 79767003..2089b6b5 100644 Binary files a/dsRag/Util/__pycache__/__init__.cpython-310.pyc and b/dsRag/Util/__pycache__/__init__.cpython-310.pyc differ