"""PDF text-extraction helper with a small usage example.

Environment setup:
    conda activate rag
    pip install PyPDF2
"""
import os


def read_pdf_file(file_path):
    """Read the text content of a PDF file.

    :param file_path: Path to the PDF file (``str`` or ``os.PathLike``).
    :return: The text of every page, each followed by a newline, joined
        together and stripped of leading/trailing whitespace. ``None`` is
        returned (after printing the error) when the file does not exist,
        does not have a ``.pdf`` extension, or cannot be read.
    """
    try:
        # Normalise path-like objects so the extension check below also
        # works for pathlib.Path inputs.
        path = os.fspath(file_path)

        # Make sure the file exists.
        if not os.path.exists(path):
            raise FileNotFoundError(f"文件 {file_path} 不存在")

        # Make sure the file is a PDF (by extension, case-insensitive).
        if not path.lower().endswith('.pdf'):
            raise ValueError("仅支持.pdf格式的文件")

        # Imported here so the third-party dependency is only required
        # once a PDF is actually parsed.
        import PyPDF2

        # Open the PDF in binary mode; pages must be extracted while the
        # file handle is still open.
        with open(path, 'rb') as file:
            reader = PyPDF2.PdfReader(file)

            # Guard against pages that yield no text (e.g. scanned
            # images) so one empty page does not abort the whole read.
            page_texts = [
                (page.extract_text() or "") + "\n" for page in reader.pages
            ]

        return "".join(page_texts).strip()

    except Exception as e:
        # Best-effort contract: report the problem and signal failure
        # with None instead of propagating the exception.
        print(f"读取PDF文件时出错: {str(e)}")
        return None


# Usage example
if __name__ == "__main__":
    pdf_file = r"d:/办公/【20250521】xyy需求审核(加报价)-局校一体化.pdf"
    content = read_pdf_file(pdf_file)
    if content:
        print("\nPDF内容:")
        print(content)