main
HuangHai 4 weeks ago
parent 3a41a4e6c1
commit 6441d596cf

@@ -0,0 +1,47 @@
"""
pip install faiss-cpu
"""
import re
import faiss
from transformers import AutoTokenizer, AutoModel
from transformers import LlamaForCausalLM
from Util.SplitDocxUtil import SplitDocxUtil
# Load the pretrained tokenizer and model (BERT, to match the 768-dim index and the last_hidden_state pooling below)
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
model = AutoModel.from_pretrained('bert-base-uncased')
# model = LlamaForCausalLM.from_pretrained("./path/to/local/directory", local_files_only=True)  # a causal LM does not return last_hidden_state by default and its hidden size is not 768
def split_into_blocks(text):
    """Use a regular expression to match the headings and bodies of 问题/话题 (question/topic) sections."""
    pattern = r'(问题\d+|话题\d+)([\s\S]+?)(?=问题\d+|话题\d+|$)'
    blocks = re.findall(pattern, text, re.DOTALL)
    return [(i + 1, title + content) for i, (title, content) in enumerate(blocks)]
def encode_text(text):
    """Mean-pool the last hidden states into a single vector per text."""
    inputs = tokenizer(text, return_tensors='pt', max_length=512, truncation=True, padding='max_length')
    outputs = model(**inputs)
    embeddings = outputs.last_hidden_state.mean(dim=1).detach().numpy()
    return embeddings
# Create the vector index
dimension = 768  # hidden size of bert-base-uncased
index = faiss.IndexFlatL2(dimension)
# Source document and output directory
input_file = '../Txt/小学数学(史校长).docx'
output_dir = '../Txt/processed_chunks'
# Main document-processing flow
text = SplitDocxUtil.read_docx(input_file)
blocks = split_into_blocks(text)
# Encode each block and add it to the vector index
for block_id, block_text in blocks:
    embedding = encode_text(block_text)
    index.add(embedding)
# Persist the index so the query script can read it back
faiss.write_index(index, 'math_blocks.index')
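The query script later in this commit prints the text of matching blocks, but only the FAISS index is written here. A minimal sketch, assuming a hypothetical pickle file math_blocks.pkl (not part of the original code), of persisting the block texts alongside the index:

import pickle

# Assumption: math_blocks.pkl is a file name chosen for illustration only.
with open('math_blocks.pkl', 'wb') as f:
    pickle.dump(blocks, f)  # list of (block_id, block_text) tuples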

@@ -0,0 +1,26 @@
import re
import faiss
from transformers import AutoTokenizer, AutoModel
# Load the pretrained tokenizer and model
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
model = AutoModel.from_pretrained('bert-base-uncased')
def encode_text(text):
    inputs = tokenizer(text, return_tensors='pt', max_length=512, truncation=True, padding='max_length')
    outputs = model(**inputs)
    embeddings = outputs.last_hidden_state.mean(dim=1).detach().numpy()
    return embeddings
def search_similar_blocks(query, index, k=5):
    query_embedding = encode_text(query)
    D, I = index.search(query_embedding, k)
    return I[0]
# Example query
query = "小学数学中有哪些模型"
index = faiss.read_index('math_blocks.index')
similar_indices = search_similar_blocks(query, index)
# Print the most similar blocks; the indexing script does not persist the block
# texts, so only the block ids are available here (see the sketch below for one
# way to save and reload them)
for idx in similar_indices:
    print(f"Similar Block {idx + 1}")

@@ -0,0 +1,27 @@
"""
pip install huggingface_hub
pip install pysocks
pip install hf_xet
开VPN后,使用Python下载模型
"""
import os
from transformers import AutoModel, AutoTokenizer
# Set proxy environment variables
os.environ['HTTP_PROXY'] = 'socks5://127.0.0.1:1080'
os.environ['HTTPS_PROXY'] = 'socks5://127.0.0.1:1080'
# Proxy configuration passed to from_pretrained
proxies = {
    'http': 'socks5://127.0.0.1:1080',
    'https': 'socks5://127.0.0.1:1080'
}
# Download the model and tokenizer through the proxy
model_id = "google-bert/bert-base-uncased"
model = AutoModel.from_pretrained(model_id, proxies=proxies)
tokenizer = AutoTokenizer.from_pretrained(model_id, proxies=proxies)
# Save the model and tokenizer locally
model.save_pretrained("d:/Model/google-bert/bert-base-uncased")
tokenizer.save_pretrained("d:/Model/google-bert/bert-base-uncased")
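After the weights are saved to d:/Model/google-bert/bert-base-uncased, later runs can load them without the proxy. A minimal sketch of offline loading from that local directory:

from transformers import AutoModel, AutoTokenizer

# Load the previously saved weights from disk only, without contacting the Hub.
local_dir = "d:/Model/google-bert/bert-base-uncased"
model = AutoModel.from_pretrained(local_dir, local_files_only=True)
tokenizer = AutoTokenizer.from_pretrained(local_dir, local_files_only=True)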

@@ -100,9 +100,11 @@ def search_related_data(query):
    context = ""
    for hit in vector_results['hits']['hits']:
        context += f"向量相似度结果(score={hit['_score']}):\n{hit['_source']['text']}\n\n"
    print(context)
    for hit in text_results['hits']['hits']:
        context += f"文本精确匹配结果(score={hit['_score']}):\n{hit['_source']['text']}\n\n"
    print(context)
    return context
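For reference, the two loops above rely on the standard Elasticsearch response layout; a minimal illustration (values invented purely for illustration) of the shape they expect:

# Illustration only: the minimal Elasticsearch response shape the loops consume.
example_results = {
    'hits': {
        'hits': [
            {'_score': 1.23, '_source': {'text': 'example block text'}},
        ]
    }
}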
