""" pip install faiss-cpu """ import re import faiss from transformers import AutoTokenizer, AutoModel from transformers import LlamaForCausalLM from Util.SplitDocxUtil import SplitDocxUtil # 加载预训练模型和分词器 tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased') #model = AutoModel.from_pretrained('bert-base-uncased') model = LlamaForCausalLM.from_pretrained("./path/to/local/directory", local_files_only=True) def split_into_blocks(text): """使用正则表达式匹配问题和话题的标题及内容""" pattern = r'(问题\d+|话题\d+)([\s\S]+?)(?=问题\d+|话题\d+|$)' blocks = re.findall(pattern, text, re.DOTALL) return [(i+1, title + content) for i, (title, content) in enumerate(blocks)] def encode_text(text): inputs = tokenizer(text, return_tensors='pt', max_length=512, truncation=True, padding='max_length') outputs = model(**inputs) embeddings = outputs.last_hidden_state.mean(dim=1).detach().numpy() return embeddings # 创建向量数据库 dimension = 768 # BERT模型的维度 index = faiss.IndexFlatL2(dimension) # 源文件 input_file = '../Txt/小学数学(史校长).docx' output_dir = '../Txt/processed_chunks' """处理文档主函数""" text = SplitDocxUtil.read_docx(input_file) blocks=split_into_blocks(text) # 将块编码并添加到向量数据库 for block in blocks: embedding = encode_text(block) index.add(embedding) # 保存向量数据库 #faiss.write_index(index, 'math_blocks.index')