"""Minimal PDF retrieval demo: load, split, embed, and query one document.

Running this script loads a PDF with ``PyPDFLoader``, cuts the pages into
overlapping chunks, indexes the chunks in an ``InMemoryVectorStore`` backed by
``OpenAIEmbeddings``, and prints the best similarity-search hit for a fixed
query string.
"""

import os

from langchain_community.document_loaders import PyPDFLoader
from langchain_core.vectorstores import InMemoryVectorStore
from langchain_openai import OpenAIEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter

from Config.Config import EMBED_API_KEY, EMBED_BASE_URL, EMBED_MODEL_NAME

# Publish the embedding endpoint and credentials as environment variables.
# This is done before the embeddings client is constructed below, which
# presumably picks them up from the environment (no explicit base_url/api_key
# is passed to it) -- confirm against the langchain_openai version in use.
os.environ["OPENAI_BASE_URL"] = EMBED_BASE_URL
os.environ["OPENAI_API_KEY"] = EMBED_API_KEY

# Load the document.
# NOTE: this is a relative path, so it is resolved against the current working
# directory, not against the location of this script.
file_path = "../langchain/data/0001.pdf"
loader = PyPDFLoader(file_path)
docs = loader.load()
print(f"文档页数:{len(docs)} 页")

# Split the document into overlapping chunks.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=100,
    add_start_index=True,
)
all_splits = text_splitter.split_documents(docs)

# Report how many chunks were produced. (A bare ``len(all_splits)`` expression
# only displays a value in a notebook/REPL; in a script it is a no-op.)
print(f"文档切片数:{len(all_splits)} 个")

# Embedding model.
embeddings = OpenAIEmbeddings(model=EMBED_MODEL_NAME)

# Vector store: embed every chunk and keep the returned document ids.
vector_store = InMemoryVectorStore(embeddings)
ids = vector_store.add_documents(documents=all_splits)

# Vector query.
results = vector_store.similarity_search("混凝土")

# Guard against an empty result list so the script reports the miss instead of
# failing with IndexError on ``results[0]``.
if results:
    print(results[0])
else:
    print("未检索到相关内容")