From 096f68ae7382e8e1f47415db96d05b9305bdb8c1 Mon Sep 17 00:00:00 2001 From: HuangHai <10402852@qq.com> Date: Tue, 24 Jun 2025 15:04:22 +0800 Subject: [PATCH] 'commit' --- dsRag/Test/T9_TestReadPDF.py | 13 +++++++ dsRag/Util/PdfUtil.py | 34 ++++++++++++++++++ .../Util/__pycache__/PdfUtil.cpython-310.pyc | Bin 0 -> 892 bytes 3 files changed, 47 insertions(+) create mode 100644 dsRag/Test/T9_TestReadPDF.py create mode 100644 dsRag/Util/PdfUtil.py create mode 100644 dsRag/Util/__pycache__/PdfUtil.cpython-310.pyc diff --git a/dsRag/Test/T9_TestReadPDF.py b/dsRag/Test/T9_TestReadPDF.py new file mode 100644 index 00000000..113eaa5b --- /dev/null +++ b/dsRag/Test/T9_TestReadPDF.py @@ -0,0 +1,13 @@ +""" +conda activate rag +pip install PyPDF2 +""" +from Util.PdfUtil import read_pdf_file + +# 使用示例 +if __name__ == "__main__": + pdf_file = r"d:/办公/【20250521】xyy需求审核(加报价)-局校一体化.pdf" + content = read_pdf_file(pdf_file) + if content: + print("\nPDF内容:") + print(content) diff --git a/dsRag/Util/PdfUtil.py b/dsRag/Util/PdfUtil.py new file mode 100644 index 00000000..bdf85000 --- /dev/null +++ b/dsRag/Util/PdfUtil.py @@ -0,0 +1,34 @@ +import PyPDF2 +import os + + +def read_pdf_file(file_path): + """ + 读取PDF文件内容 + :param file_path: PDF文件路径 + :return: 文档文本内容 + """ + try: + # 检查文件是否存在 + if not os.path.exists(file_path): + raise FileNotFoundError(f"文件 {file_path} 不存在") + + # 检查文件是否为PDF + if not file_path.lower().endswith('.pdf'): + raise ValueError("仅支持.pdf格式的文件") + + text = "" + + # 以二进制模式打开PDF文件 + with open(file_path, 'rb') as file: + reader = PyPDF2.PdfReader(file) + + # 逐页读取内容 + for page in reader.pages: + text += page.extract_text() + "\n" + + return text.strip() + + except Exception as e: + print(f"读取PDF文件时出错: {str(e)}") + return None \ No newline at end of file diff --git a/dsRag/Util/__pycache__/PdfUtil.cpython-310.pyc b/dsRag/Util/__pycache__/PdfUtil.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..772eb236116dd9b26089c5ce2ff5da259e88a079 GIT binary patch literal 892 zcmZ8g%}*0S6n`_jTefRkkPC_&dhk*YBziGvjG73;GZy8X@!5m81+p{2=Q&+$GrD@Z{K^9apQ3Zkjl3*Gb;wb z4}CaE6c5*t{XP^77%4!etx1M3%8W(GP)5ceWga2|hI2%#BZ{Eb^=}~i-%%{wlrT$b zG=w_wJEg%$2phoYmQf{7(XN?ws9DUwXs?VgQ+t_JA+jR`7OB$Ja}_G>S~R3}QZYhf z))|0%PQ?sKXx2{Efpzr2WvGyc7si3vCYVq)2o&v$;9n~d0{Xj)9FkhB4&yL}2pKOk z0u_tELYGXisO$`_D%pT7TE&cU#522MJ+hcRc4U=E2~tkL^>j(3gYDhM>iWpQP;-5G ze|M+xY^m|_ODmH0vLZW|oXQvYq?eU5>Eto$;L~>F+hQ9fxC}%woy7cR{f(BJ?@v?- zI`B{J+Q~rgGr9lydE>*&#^&1vv)^M=fxeyn-KFN*c5~&4_B3mIjlI>MuNT{C3KSLh z6>)}xUWESFaP!4ZWBKdh+N*Rj)upKGD^oYCB78pYOJAKH!v2R{IphXKHYkKElvQv` zTqv6tnP18y*13}{1iT$IU5^)4WTcG4UdlvA*wiS`GA=Tfm!0N*CdTKb$mZmvL_=A= z6nRfY2j_F#lXNHn`zXQ>Oqy0(^q$ij^ z?urMfMua=da}ur5?DWKKnJ-LWH(K|5Wfj*YCUIMnx}t^I{&^HM5~E!tVc5i^F~cE_ z(M{;zzTZwqf_9Sxu^-bv;w%DFhKQpQIY|`1(oMV3U~f E5AP-ry8r+H literal 0 HcmV?d00001