parent 3a41a4e6c1
commit 6441d596cf
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,47 @@
"""
pip install faiss-cpu
"""
import re

import faiss
import torch
from transformers import AutoTokenizer, AutoModel
#from transformers import LlamaForCausalLM
from Util.SplitDocxUtil import SplitDocxUtil


# Load the pretrained model and tokenizer
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
model = AutoModel.from_pretrained('bert-base-uncased')
# A causal LM such as LlamaForCausalLM returns logits rather than
# last_hidden_state, so encode_text() below needs a base encoder like BERT:
#model = LlamaForCausalLM.from_pretrained("./path/to/local/directory", local_files_only=True)


def split_into_blocks(text):
    """Use a regex to match the headings and bodies of questions (问题N) and topics (话题N)."""
    pattern = r'(问题\d+|话题\d+)([\s\S]+?)(?=问题\d+|话题\d+|$)'
    blocks = re.findall(pattern, text, re.DOTALL)
    return [(i + 1, title + content) for i, (title, content) in enumerate(blocks)]


def encode_text(text):
    inputs = tokenizer(text, return_tensors='pt', max_length=512, truncation=True, padding='max_length')
    with torch.no_grad():
        outputs = model(**inputs)
    # Mean-pool the token embeddings into one 768-dim vector per text
    embeddings = outputs.last_hidden_state.mean(dim=1).numpy()
    return embeddings


# Create the vector database
dimension = 768  # hidden size of bert-base-uncased
index = faiss.IndexFlatL2(dimension)

# Source file
input_file = '../Txt/小学数学(史校长).docx'
output_dir = '../Txt/processed_chunks'

# Main document-processing flow
text = SplitDocxUtil.read_docx(input_file)

blocks = split_into_blocks(text)

# Encode each block and add it to the vector database
for block_id, block_text in blocks:
    embedding = encode_text(block_text)
    index.add(embedding)

# Save the vector database so the search script can read it
faiss.write_index(index, 'math_blocks.index')
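The search script in the next file looks up matched blocks by position, but a FAISS index stores only vectors. A minimal sketch of persisting the block texts alongside the index, assuming a hypothetical sidecar file math_blocks.json (not in the original code):

import json

# Save the block texts in the same order they were added to the index,
# so a result position from FAISS maps back to its original text.
with open('math_blocks.json', 'w', encoding='utf-8') as f:
    json.dump([block_text for _, block_text in blocks], f, ensure_ascii=False)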
@@ -0,0 +1,26 @@
import faiss
import torch
from transformers import AutoTokenizer, AutoModel

# Load the same pretrained model and tokenizer used to build the index
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
model = AutoModel.from_pretrained('bert-base-uncased')


def encode_text(text):
    inputs = tokenizer(text, return_tensors='pt', max_length=512, truncation=True, padding='max_length')
    with torch.no_grad():
        outputs = model(**inputs)
    embeddings = outputs.last_hidden_state.mean(dim=1).numpy()
    return embeddings


def search_similar_blocks(query, index, k=5):
    query_embedding = encode_text(query)
    D, I = index.search(query_embedding, k)
    return I[0]


# Example query: "小学数学中有哪些模型" ("What models are there in primary-school mathematics?")
query = "小学数学中有哪些模型"
index = faiss.read_index('math_blocks.index')
similar_indices = search_similar_blocks(query, index)

# Print the most similar blocks. The FAISS index stores only vectors, so the
# block texts must be persisted separately (see the sketch below); here we
# print the matching positions.
for idx in similar_indices:
    print(f"Similar Block {idx + 1}")
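With the hypothetical math_blocks.json sidecar from the indexing sketch above, the search script could print the matched text itself rather than just its position:

import json

# Load the block texts saved when the index was built
with open('math_blocks.json', 'r', encoding='utf-8') as f:
    blocks = json.load(f)

for idx in similar_indices:
    print(f"Similar Block {idx + 1}:\n{blocks[idx]}\n")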
@@ -0,0 +1,27 @@
"""
pip install huggingface_hub
pip install pysocks
pip install hf_xet

After enabling a VPN, download the model with Python.
"""
import os

from transformers import AutoModel, AutoTokenizer


# Set proxy environment variables
os.environ['HTTP_PROXY'] = 'socks5://127.0.0.1:1080'
os.environ['HTTPS_PROXY'] = 'socks5://127.0.0.1:1080'

# Configure the proxy passed to the download calls
proxies = {
    'http': 'socks5://127.0.0.1:1080',
    'https': 'socks5://127.0.0.1:1080'
}

# Download the model
model_id = "google-bert/bert-base-uncased"
model = AutoModel.from_pretrained(model_id, proxies=proxies)
tokenizer = AutoTokenizer.from_pretrained(model_id, proxies=proxies)

# Save the model locally
model.save_pretrained("d:/Model/google-bert/bert-base-uncased")
tokenizer.save_pretrained("d:/Model/google-bert/bert-base-uncased")
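A quick offline check, assuming the save above succeeded: the saved copy should load with local_files_only=True, the same flag the indexing script uses for its local model.

from transformers import AutoModel, AutoTokenizer

# Load the locally saved copy without any network access
local_dir = "d:/Model/google-bert/bert-base-uncased"
model = AutoModel.from_pretrained(local_dir, local_files_only=True)
tokenizer = AutoTokenizer.from_pretrained(local_dir, local_files_only=True)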
Binary file not shown.
Binary file not shown.
Binary file not shown.