@@ -2,44 +2,26 @@
pip install faiss-cpu
"""
import re
import faiss
from transformers import AutoTokenizer, AutoModel
from Util.SplitDocxUtil import SplitDocxUtil

# Load the pretrained model and tokenizer (a locally stored BERT checkpoint)
tokenizer = AutoTokenizer.from_pretrained("d:/Model/google-bert/bert-base-uncased", local_files_only=True)
model = AutoModel.from_pretrained("d:/Model/google-bert/bert-base-uncased", local_files_only=True)


def split_into_blocks(text):
    """Use a regular expression to match question (问题N) and topic (话题N) headings together with their content."""
    pattern = r'(问题\d+|话题\d+)([\s\S]+?)(?=问题\d+|话题\d+|$)'
    blocks = re.findall(pattern, text, re.DOTALL)
    return [(i + 1, title + content) for i, (title, content) in enumerate(blocks)]
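
# For illustration, on input like "问题1 aaa 话题1 bbb" the function returns
# numbered (id, text) pairs such as [(1, '问题1 aaa '), (2, '话题1 bbb')].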

def encode_text(text):
    """Encode one text block into a single fixed-size vector."""
    inputs = tokenizer(text, return_tensors='pt', max_length=512, truncation=True, padding='max_length')
    outputs = model(**inputs)
    # Mean-pool the last hidden states into a (1, 768) float32 array for FAISS
    embeddings = outputs.last_hidden_state.mean(dim=1).detach().numpy()
    return embeddings

# Create the vector database
dimension = 768  # hidden size of the BERT model, i.e. the length of each embedding
index = faiss.IndexFlatL2(dimension)

# Source file and output directory
input_file = '../Txt/小学数学(史校长).docx'
output_dir = '../Txt/processed_chunks'

# Main document-processing flow: read the .docx and split it into question/topic blocks
text = SplitDocxUtil.read_docx(input_file)
blocks = split_into_blocks(text)

# Encode each block and add it to the vector database
for block in blocks:
    embedding = encode_text(block[1])   # block is a (number, text) tuple; encode the text part
    index.add(embedding)
    # Print non-trivial blocks (more than 10 characters) as a quick check
    if len(block[1]) > 10:
        print(block[1])
        print("\r\n")

# Save the vector database
#faiss.write_index(index, 'math_blocks.index')
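
# A minimal sketch of how the populated index could be queried with the same
# encoder; the query text and k below are illustrative assumptions.
query_vec = encode_text("分数")                 # hypothetical query: "fractions"
distances, ids = index.search(query_vec, 3)      # L2 distances and row ids of the 3 nearest blocks
for dist, row_id in zip(distances[0], ids[0]):
    if row_id != -1:                             # FAISS pads missing neighbours with -1
        print(row_id, dist, blocks[row_id][1][:50])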