You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

26 lines
921 B

import re

import faiss
import torch
from transformers import AutoTokenizer, AutoModel
# Load the pre-trained model and tokenizer once at import time; both are
# read as module-level globals by encode_text().
_PRETRAINED_NAME = 'bert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(_PRETRAINED_NAME)
model = AutoModel.from_pretrained(_PRETRAINED_NAME)
def encode_text(text):
    """Embed ``text`` by mean-pooling the model's last hidden state.

    The input is tokenized with the module-level ``tokenizer``, truncated
    and padded to exactly 512 tokens, run through the module-level
    ``model``, and averaged over ``dim=1``.

    Args:
        text: Text to embed; passed straight to ``tokenizer``.

    Returns:
        A NumPy array holding the pooled embedding(s); presumably of shape
        ``(1, hidden_size)`` for a single string -- TODO confirm.
    """
    inputs = tokenizer(
        text,
        return_tensors='pt',
        max_length=512,
        truncation=True,
        padding='max_length',
    )
    # Inference only: disable autograd so no computation graph is built or
    # kept alive for the forward pass.
    with torch.no_grad():
        outputs = model(**inputs)
        # NOTE(review): because of padding='max_length', this mean also
        # averages over padding positions (no attention-mask weighting).
        # Left unchanged so query vectors stay comparable with whatever
        # produced the stored index -- confirm before changing the pooling.
        pooled = outputs.last_hidden_state.mean(dim=1)
    return pooled.numpy()
def search_similar_blocks(query, index, k=5):
    """Return the ids of up to ``k`` index entries closest to ``query``.

    Args:
        query: Text to look up; embedded with ``encode_text``.
        index: Object exposing ``search(vectors, k)`` that returns a
            ``(distances, ids)`` pair; the caller in this file passes an
            index loaded with ``faiss.read_index``.
        k: Maximum number of neighbours to request.

    Returns:
        The first row of the returned id matrix with ``-1`` placeholders
        removed, so it may contain fewer than ``k`` entries.
    """
    query_embedding = encode_text(query)
    # Distances are not needed by callers; only the ids are returned.
    _, ids = index.search(query_embedding, k)
    top_ids = ids[0]
    # FAISS fills unused result slots with -1 when fewer than k neighbours
    # are available. Drop them: used as a sequence index, -1 would silently
    # select the last element instead of signalling "no result".
    return top_ids[top_ids >= 0]
# Example: run one query against the index stored on disk.
# NOTE(review): `blocks` is not defined in the visible part of this file;
# presumably it holds the text blocks in the order they were indexed --
# it must be loaded before the loop below runs.
index = faiss.read_index('math_blocks.index')
query = "小学数学中有哪些模型"
similar_indices = search_similar_blocks(query, index)

# Print the most similar blocks (labels are shown 1-based).
for idx in similar_indices:
    print(f"Similar Block {idx + 1}:\n{blocks[idx]}\n")