'commit'

1 month ago · 096f68ae73
parent 28f87c228a
commit 096f68ae73
3 changed files with 47 additions and 0 deletions
--- a/dsRag/Test/T9_TestReadPDF.py
+++ b/dsRag/Test/T9_TestReadPDF.py
@@ -0,0 +1,13 @@
 """
 conda activate rag
 pip install PyPDF2
 """
 from Util.PdfUtil import read_pdf_file
 # Usage example: print the text extracted from a sample PDF.
 if __name__ == "__main__":
     sample_pdf = r"d:/办公/【20250521】xyy需求审核（加报价）-局校一体化.pdf"
     extracted = read_pdf_file(sample_pdf)
     # Nothing is printed when the result is falsy (None or an empty string).
     if extracted:
         print("\nPDF内容:", extracted, sep="\n")
--- a/dsRag/Util/PdfUtil.py
+++ b/dsRag/Util/PdfUtil.py
@@ -0,0 +1,34 @@
 import PyPDF2
 import os
 def read_pdf_file(file_path):
     """Read a PDF file and return the text of all its pages.

     The text of each page is followed by a newline; the combined result is
     stripped of leading and trailing whitespace before being returned.

     :param file_path: path to the PDF file
     :return: the extracted text, or None if anything goes wrong (the error
         message is printed to stdout instead of being raised)
     """
     try:
         # Validate the path first: it must exist ...
         if not os.path.exists(file_path):
             raise FileNotFoundError(f"文件 {file_path} 不存在")
         # ... and carry a .pdf extension (case-insensitive).
         has_pdf_suffix = file_path.lower().endswith('.pdf')
         if not has_pdf_suffix:
             raise ValueError("仅支持.pdf格式的文件")

         # Open in binary mode; pages are extracted while the file is open.
         with open(file_path, 'rb') as pdf_stream:
             pdf_reader = PyPDF2.PdfReader(pdf_stream)
             page_texts = [
                 pdf_page.extract_text() + "\n" for pdf_page in pdf_reader.pages
             ]
         return "".join(page_texts).strip()
     except Exception as err:
         # Best-effort contract: report the failure and signal it with None.
         print(f"读取PDF文件时出错: {str(err)}")
         return None
--- a/dsRag/Util/__pycache__/PdfUtil.cpython-310.pyc
+++ b/dsRag/Util/__pycache__/PdfUtil.cpython-310.pyc