Files
dsProject/dsSchoolBuddy/ElasticSearch/T2_BgeM3.py
2025-08-19 07:45:15 +08:00

40 lines
1.1 KiB
Python

from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
from langchain_core.vectorstores import InMemoryVectorStore
import os
from Config.Config import EMBED_MODEL_NAME, EMBED_BASE_URL, EMBED_API_KEY
# Export the embedding endpoint and credentials as environment variables
# (presumably picked up by the OpenAI-compatible client created below).
os.environ["OPENAI_BASE_URL"] = EMBED_BASE_URL
os.environ["OPENAI_API_KEY"] = EMBED_API_KEY

# Load the PDF document. The path is relative, so it is resolved against the
# current working directory of the process.
file_path = "../langchain/data/0001.pdf"
loader = PyPDFLoader(file_path)
docs = loader.load()
print(f"文档页数:{len(docs)}")

# Split the loaded documents into overlapping chunks; add_start_index=True is
# passed so each chunk records where it starts in its source document.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500, chunk_overlap=100, add_start_index=True
)
all_splits = text_splitter.split_documents(docs)
# A bare `len(all_splits)` expression is discarded when run as a script, so
# report the chunk count explicitly, mirroring the page-count output above.
print(f"文档切片数:{len(all_splits)}")

# Embedding model.
embeddings = OpenAIEmbeddings(model=EMBED_MODEL_NAME)

# In-memory vector store: embed and index every chunk.
vector_store = InMemoryVectorStore(embeddings)
ids = vector_store.add_documents(documents=all_splits)

# Similarity query against the indexed chunks.
results = vector_store.similarity_search("混凝土")
if results:
    print(results[0])
else:
    # Guard against an empty result list (e.g. the PDF produced no chunks)
    # instead of failing with IndexError on results[0].
    print("未找到匹配结果")