main
HuangHai 1 month ago
parent fdbab5f3ae
commit 0335b8c17a

@@ -2,19 +2,15 @@
# Install the vectorization packages  # run the installs after disconnecting the VPN
conda activate rag
pip install text2vec torch torchvision torchaudio
# Install the full version (with extra dependencies)
pip install gensim[complete]
'''
from Util.EmbeddingUtil import *
from text2vec import SentenceModel
sentences = ['如何更换花呗绑定银行卡', '花呗更改绑定银行卡']  # two near-duplicate queries: "How to change the bank card bound to Huabei" / "Huabei: change the bound bank card"
'''
- The pretrained model is downloaded automatically to a cache directory, usually ~/.cache/huggingface/hub
- Subsequent runs use the cached copy directly
If the download is slow, a mirror endpoint can be set:
import os
os.environ['HF_ENDPOINT'] = 'https://hf-mirror.com'
'''
model = SentenceModel('shibing624/text2vec-base-chinese')
embeddings = model.encode(sentences)
print(embeddings)
for sentence in sentences:
    x = text_to_embedding(sentence)
    print(x)
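For context, a minimal sketch of how the two sentence embeddings produced above could be compared; numpy and the cosine_similarity helper are assumptions for illustration, not part of this commit:

import numpy as np

def cosine_similarity(a, b):
    # cosine similarity between two 1-D vectors
    return float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b)))

# embeddings[0] and embeddings[1] come from model.encode(sentences) above;
# the two near-duplicate queries should score close to 1.0
print(cosine_similarity(embeddings[0], embeddings[1]))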

@@ -0,0 +1,26 @@
import logging
import jieba
from gensim.models import KeyedVectors
# Configure logging
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
logger = logging.getLogger(__name__)
# Initialize the Word2Vec model (Tencent AI Lab Chinese embeddings)
model_path = r"D:\Tencent_AILab_ChineseEmbedding\Tencent_AILab_ChineseEmbedding.txt"
model = KeyedVectors.load_word2vec_format(model_path, binary=False, limit=10000)  # load only the first 10,000 vectors to save memory
logger.info(f"Model loaded, vector dimension: {model.vector_size}")
# Convert a piece of text into an embedding vector (mean of its word vectors)
def text_to_embedding(text):
    words = jieba.lcut(text)  # segment the text with jieba
    logger.info(f"Text: {text}, tokens: {words}")
    embeddings = [model[word] for word in words if word in model]
    logger.info(f"Number of in-vocabulary word vectors: {len(embeddings)}")
    if embeddings:
        avg_embedding = sum(embeddings) / len(embeddings)
        logger.info(f"Averaged vector: {avg_embedding[:5]}...")  # print the first 5 dimensions
        return avg_embedding
    else:
        logger.warning("No in-vocabulary words found, returning a zero vector")
        return [0.0] * model.vector_size
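A minimal usage sketch for text_to_embedding, assuming the Tencent embedding file is available at the path above; the sample query reuses one of the sentences from the first script:

if __name__ == "__main__":
    vec = text_to_embedding("花呗更改绑定银行卡")  # "Huabei: change the bound bank card"
    print(len(vec), vec[:5])  # vector length equals model.vector_size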