diff --git a/dsRag/Test/T12.py b/dsRag/Test/T12.py
index d5642a84..736853c2 100644
--- a/dsRag/Test/T12.py
+++ b/dsRag/Test/T12.py
@@ -2,44 +2,26 @@
 pip install faiss-cpu
 """
 import re
-import faiss
-from transformers import AutoTokenizer, AutoModel
-from transformers import LlamaForCausalLM
 from Util.SplitDocxUtil import SplitDocxUtil
 
-# Load the pretrained model and tokenizer
-tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
-model = LlamaForCausalLM.from_pretrained("d:/Model/google-bert/bert-base-uncased", local_files_only=True)
 
 def split_into_blocks(text):
     """Match question/topic headings and their content with a regular expression."""
     pattern = r'(问题\d+|话题\d+)([\s\S]+?)(?=问题\d+|话题\d+|$)'
     blocks = re.findall(pattern, text, re.DOTALL)
-    return [(i+1, title + content) for i, (title, content) in enumerate(blocks)]
+    return [(i + 1, title + content) for i, (title, content) in enumerate(blocks)]
 
-def encode_text(text):
-    inputs = tokenizer(text, return_tensors='pt', max_length=512, truncation=True, padding='max_length')
-    outputs = model(**inputs)
-    embeddings = outputs.last_hidden_state.mean(dim=1).detach().numpy()
-    return embeddings
-
-# Create the vector index
-dimension = 768  # hidden size of the BERT model
-index = faiss.IndexFlatL2(dimension)
 
 # Source document
 input_file = '../Txt/小学数学(史校长).docx'
-output_dir = '../Txt/processed_chunks'
 
 """Main document-processing flow"""
 text = SplitDocxUtil.read_docx(input_file)
-blocks=split_into_blocks(text)
+blocks = split_into_blocks(text)
 
-# Encode the blocks and add them to the vector database
+# Print every block whose content is longer than 10 characters
 for block in blocks:
-    embedding = encode_text(block)
-    index.add(embedding)
-
-# Save the vector index
-#faiss.write_index(index, 'math_blocks.index')
\ No newline at end of file
+    if len(block[1]) > 10:
+        print(block[1])
+        print("\r\n")
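
For reference, `split_into_blocks` can be exercised on its own. The sketch below is self-contained; the sample string is made up for illustration, and `问题`/`话题` ("Question"/"Topic") are the heading markers the pattern keys on:

```python
import re

def split_into_blocks(text):
    """Match 问题N/话题N headings and capture everything up to the next heading."""
    pattern = r'(问题\d+|话题\d+)([\s\S]+?)(?=问题\d+|话题\d+|$)'
    blocks = re.findall(pattern, text, re.DOTALL)
    # Number the blocks from 1 and rejoin each heading with its body text.
    return [(i + 1, title + content) for i, (title, content) in enumerate(blocks)]

sample = "问题1 什么是分数?分数表示部分与整体的关系。话题2 认识基本的平面图形。"
for number, chunk in split_into_blocks(sample):
    print(number, chunk)
# 1 问题1 什么是分数?分数表示部分与整体的关系。
# 2 话题2 认识基本的平面图形。
```

Each tuple pairs a 1-based block number with the heading plus its body, which is why the loop in the diff reads the text through `block[1]`.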
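
The removed embedding path had two real bugs worth recording: it loaded a BERT checkpoint through `LlamaForCausalLM`, whose causal-LM output carries no `last_hidden_state`, and the loop passed the whole `(number, text)` tuple to `encode_text` instead of the text. If the FAISS path is revived, a minimal sketch of what that code appears to have intended — assuming the public `bert-base-uncased` checkpoint rather than the local `d:/Model/...` copy, and reusing `blocks` from above — could look like this:

```python
import faiss
import torch
from transformers import AutoTokenizer, AutoModel

# AutoModel (not LlamaForCausalLM) exposes last_hidden_state for BERT checkpoints.
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
model = AutoModel.from_pretrained('bert-base-uncased')

def encode_text(text):
    inputs = tokenizer(text, return_tensors='pt', max_length=512,
                       truncation=True, padding='max_length')
    with torch.no_grad():
        outputs = model(**inputs)
    # Mean-pool the token embeddings into one 768-dim vector per input.
    return outputs.last_hidden_state.mean(dim=1).numpy()

index = faiss.IndexFlatL2(768)  # 768 = BERT hidden size, as in the removed code
for _, chunk in blocks:         # pass the text, not the (number, text) tuple
    index.add(encode_text(chunk))
faiss.write_index(index, 'math_blocks.index')  # the save step the old code left commented out
```

Mean pooling over `last_hidden_state` is one common sentence-embedding choice here, not the only one; the removed code used the same pooling.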