You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

58 lines
2.4 KiB

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

from WxMini.Milvus.Config.MulvusConfig import *
from WxMini.Milvus.Utils.MilvusCollectionManager import MilvusCollectionManager
from WxMini.Milvus.Utils.MilvusConnectionPool import *
from gensim.models import KeyedVectors
import jieba
# Load the pre-trained Word2Vec model.
# Replace this with the path to your own Word2Vec model file.
model_path = "D:/Tencent_AILab_ChineseEmbedding/Tencent_AILab_ChineseEmbedding.txt"
# The file is read in text (non-binary) word2vec format; `limit` caps how many
# word vectors are read from it.
model = KeyedVectors.load_word2vec_format(
    model_path,
    binary=False,
    limit=10000,
)
print(f"模型加载成功,词向量维度: {model.vector_size}")
# Convert a piece of text into an embedding vector.
def text_to_embedding(text):
    """Return an averaged word-vector embedding for ``text``.

    The text is segmented with ``jieba.lcut``; every token that is present in
    the module-level ``model`` contributes its vector, and the element-wise
    mean of those vectors is returned. Diagnostic messages are printed along
    the way.

    Args:
        text: The text to embed.

    Returns:
        The mean of the word vectors of all tokens found in ``model``
        (presumably a numpy array, as produced by the model lookup). If no
        token is found in ``model``, a plain list of ``model.vector_size``
        zeros (``0.0``) is returned instead.
    """
    tokens = jieba.lcut(text)  # segment the text with jieba
    print(f"文本: {text}, 分词结果: {tokens}")

    # Keep only the vectors of tokens the model knows about.
    vectors = []
    for token in tokens:
        if token in model:
            vectors.append(model[token])
    print(f"有效词向量数量: {len(vectors)}")

    # No known token: fall back to an all-zero vector.
    if not vectors:
        print("未找到有效词,返回零向量")
        return [0.0] * model.vector_size

    mean_vector = sum(vectors) / len(vectors)
    print(f"生成的平均向量: {mean_vector[:5]}...")  # show the first 5 dimensions
    return mean_vector
# Insert a few sample texts and their embeddings into the Milvus collection.
# The work is wrapped in try/finally so that the connection is handed back and
# the pool is closed even if embedding generation or the insert raises.

# 1. Manage Milvus connections through a connection pool.
milvus_pool = MilvusConnectionPool(host=MS_HOST, port=MS_PORT, max_connections=MS_MAX_CONNECTIONS)
try:
    # 2. Take one connection from the pool.
    connection = milvus_pool.get_connection()
    try:
        # 3. Initialise the collection manager.
        collection_name = MS_COLLECTION_NAME
        collection_manager = MilvusCollectionManager(collection_name)

        # 4. Data to insert.
        texts = [
            "我今天心情不太好,因为工作压力很大。",  # first conversation text
            "我最近在学习 Python感觉很有趣。",  # second conversation text
            "我打算周末去爬山,放松一下。",  # third conversation text
            "吉林省广告产业园是东师理想的办公地点。"  # fourth conversation text
        ]
        # Generate one embedding per text with the word-vector model.
        embeddings = [text_to_embedding(text) for text in texts]

        # Print the generated vectors (first 5 dimensions only).
        for text, embedding in zip(texts, embeddings):
            print(f"文本: {text}, 向量: {embedding[:5]}...")

        # 5. Insert the data; the field order must match the collection
        #    definition: the first list is the text field, the second list is
        #    the embedding field.
        entities = [texts, embeddings]
        collection_manager.insert_data(entities)
        print("数据插入成功。")
    finally:
        # 6. Release the connection, whether or not the insert succeeded.
        milvus_pool.release_connection(connection)
finally:
    # 7. Close the connection pool, whether or not an error occurred.
    milvus_pool.close()