import faiss
import torch
from transformers import AutoTokenizer, AutoModel

# Load the pretrained model and tokenizer
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
model = AutoModel.from_pretrained('bert-base-uncased')
model.eval()  # inference mode: disable dropout

def encode_text(text):
    """Encode text into a single vector via attention-masked mean pooling."""
    inputs = tokenizer(text, return_tensors='pt', max_length=512,
                       truncation=True, padding=True)
    with torch.no_grad():
        outputs = model(**inputs)
    # Mask out padding tokens before averaging, so padding does not dilute the embedding
    mask = inputs['attention_mask'].unsqueeze(-1).float()
    summed = (outputs.last_hidden_state * mask).sum(dim=1)
    embeddings = (summed / mask.sum(dim=1)).numpy()
    return embeddings

def search_similar_blocks(query, index, k=5):
    """Return the indices of the k blocks most similar to the query."""
    query_embedding = encode_text(query)
    D, I = index.search(query_embedding, k)  # D: distances, I: block indices
    return I[0]

# Example: run a query against the saved index
query = "What models are there in elementary school math?"
index = faiss.read_index('math_blocks.index')
similar_indices = search_similar_blocks(query, index)

# Print the most similar blocks. `blocks` must hold the original text blocks
# the index was built from, loaded the same way they were persisted.
for idx in similar_indices:
    print(f"Similar Block {idx + 1}:\n{blocks[idx]}\n")
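
# The snippet above reads 'math_blocks.index' and indexes into `blocks`, but
# shows neither being created. Below is a minimal sketch of how they could be
# produced, assuming `blocks` is a list of strings and using an exact L2 index;
# the source file name 'math_blocks.txt' and the blank-line splitting rule are
# illustrative assumptions, not part of the original code.

import numpy as np

def build_index(blocks, index_path='math_blocks.index'):
    """Encode each text block and persist the vectors in a flat L2 FAISS index."""
    # encode_text() returns float32 arrays of shape (1, 768) for bert-base
    embeddings = np.vstack([encode_text(b) for b in blocks])
    index = faiss.IndexFlatL2(embeddings.shape[1])  # exact search, no training needed
    index.add(embeddings)
    faiss.write_index(index, index_path)
    return index

# Hypothetical usage: split a source file into blocks on blank lines, then index.
# with open('math_blocks.txt', encoding='utf-8') as f:
#     blocks = [b.strip() for b in f.read().split('\n\n') if b.strip()]
# build_index(blocks)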