You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
26 lines
921 B
26 lines
921 B
import re

import faiss
import torch
from transformers import AutoTokenizer, AutoModel
|
|
# Load the pretrained tokenizer and model from the same checkpoint.
MODEL_NAME = 'bert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME)
|
|
|
|
def encode_text(text):
    """Embed ``text`` by averaging the model's final hidden states.

    The text is tokenized with the module-level ``tokenizer`` (truncated and
    padded to 512 tokens), passed through the module-level ``model``, and the
    resulting ``last_hidden_state`` is averaged over dimension 1.

    Args:
        text: Input accepted by ``tokenizer``; the caller in this file
            passes a single string.

    Returns:
        The pooled hidden states converted to a NumPy array.
        Presumably shaped (1, hidden_size) for a single string -- TODO confirm.
    """
    inputs = tokenizer(
        text,
        return_tensors='pt',
        max_length=512,
        truncation=True,
        padding='max_length',
    )
    # Inference only: disabling autograd avoids building a graph that would
    # be thrown away immediately. The numeric result is unchanged.
    with torch.no_grad():
        outputs = model(**inputs)
        # NOTE(review): the mean runs over all 512 positions, padding
        # included. Left as-is because the vectors stored in the index were
        # presumably pooled the same way -- confirm before switching to an
        # attention-mask-weighted mean.
        pooled = outputs.last_hidden_state.mean(dim=1)
    embeddings = pooled.numpy()
    return embeddings
|
|
|
|
def search_similar_blocks(query, index, k=5):
    """Return the ids of the index entries nearest to ``query``.

    Args:
        query: Text to embed with ``encode_text``.
        index: Object exposing ``search(vectors, k)`` that returns a
            ``(distances, ids)`` pair; this file passes the result of
            ``faiss.read_index``.
        k: Maximum number of neighbours to request.

    Returns:
        The ids found for the query, in the order the index returned them,
        with negative placeholder ids removed. May hold fewer than ``k``
        entries.
    """
    query_embedding = encode_text(query)
    _, neighbour_ids = index.search(query_embedding, k)
    hits = neighbour_ids[0]
    # FAISS fills missing neighbours with -1 when fewer than k results exist.
    # Used as a Python index, -1 would silently select the last element, so
    # drop those placeholders here.
    return hits[hits >= 0]
|
|
|
|
# Example: run one query against the index stored on disk.
query = "小学数学中有哪些模型"
index = faiss.read_index('math_blocks.index')
similar_indices = search_similar_blocks(query, index)

# Print the most similar blocks, numbering them from 1.
# NOTE(review): `blocks` is not defined anywhere in the visible file, so this
# loop raises NameError unless the block texts are loaded first -- confirm
# where they are supposed to come from.
for idx in similar_indices:
    print(f"Similar Block {idx + 1}:\n{blocks[idx]}\n")