"""Index a PDF in an in-memory vector store and run one similarity query.

Pipeline: load the PDF page by page, split the pages into overlapping
chunks, embed the chunks through an OpenAI-compatible embedding endpoint,
store the vectors in memory, then print the best match for a fixed query.
"""

import os

from langchain_community.document_loaders import PyPDFLoader
from langchain_core.vectorstores import InMemoryVectorStore
from langchain_openai import OpenAIEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter

from Config.Config import EMBED_MODEL_NAME, EMBED_BASE_URL, EMBED_API_KEY

# Publish the embedding endpoint and credentials through the environment
# variables read by the OpenAI client, so no explicit arguments are needed
# when the embedding model is constructed below.
os.environ["OPENAI_BASE_URL"] = EMBED_BASE_URL
os.environ["OPENAI_API_KEY"] = EMBED_API_KEY

# Load the document.
# NOTE(review): the path is relative to the current working directory, not
# to this file -- confirm the script is always launched from the expected
# directory.
file_path = "../langchain/data/0001.pdf"
loader = PyPDFLoader(file_path)
docs = loader.load()
print(f"文档页数:{len(docs)} 页")

# Split the loaded pages into overlapping chunks; add_start_index=True asks
# the splitter to record each chunk's start offset in its metadata.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=100,
    add_start_index=True,
)
all_splits = text_splitter.split_documents(docs)

# Embedding model.
embeddings = OpenAIEmbeddings(model=EMBED_MODEL_NAME)

# Vector store: embed every chunk and keep the vectors in process memory.
vector_store = InMemoryVectorStore(embeddings)
ids = vector_store.add_documents(documents=all_splits)

# Similarity query.
query = "混凝土"
results = vector_store.similarity_search(query)

# The search can return an empty list (for example when the PDF yielded no
# text), so guard the index access instead of failing with IndexError.
if results:
    print(results[0])
else:
    print(f"未找到与“{query}”相关的内容")