From b4535e3faa17b5d0f763cfea840be73b1c92fe6f Mon Sep 17 00:00:00 2001
From: HuangHai <10402852@qq.com>
Date: Mon, 24 Mar 2025 10:27:46 +0800
Subject: [PATCH] 'commit'

---
Reviewer notes (text in this section is ignored by `git am`):
* The line structure of this patch was rebuilt from a whitespace-collapsed
  copy. Indentation, inline-comment spacing and the position of blank
  context lines were inferred so that they agree with the hunk headers
  (-1,24 +1,29 and -45,7 +50,7) and the diffstat; verify them against the
  target file before applying.
* text_to_embedding() returns a plain list in the zero-vector branch and the
  averaged model vectors otherwise (presumably a NumPy array - confirm);
  callers may want a single, consistent return type.
* The added print() calls run at module load and on every call to
  text_to_embedding(); they look like temporary debugging output.

 AI/WxMini/Milvus/T5_search_near_word.py | 21 +++++++++++++--------
 1 file changed, 13 insertions(+), 8 deletions(-)

diff --git a/AI/WxMini/Milvus/T5_search_near_word.py b/AI/WxMini/Milvus/T5_search_near_word.py
index 0f129b4b..9a23e515 100644
--- a/AI/WxMini/Milvus/T5_search_near_word.py
+++ b/AI/WxMini/Milvus/T5_search_near_word.py
@@ -1,24 +1,29 @@
-# pip install gensim
+# pip install gensim jieba
 import time
+import jieba  # 导入 jieba 分词库
 from WxMini.Milvus.Utils.MilvusCollectionManager import MilvusCollectionManager
 from WxMini.Milvus.Utils.MilvusConnectionPool import *
 from WxMini.Milvus.Config.MulvusConfig import *
 from gensim.models import KeyedVectors
+
 # 加载预训练的 Word2Vec 模型
 model_path = "D:/Tencent_AILab_ChineseEmbedding/Tencent_AILab_ChineseEmbedding.txt"  # 替换为你的 Word2Vec 模型路径
-# 参考文档:使用gensim之KeyedVectors操作词向量模型
-# https://www.cnblogs.com/bill-h/p/14655224.html
-# 读取词向量模型(限定前10000个词)
 model = KeyedVectors.load_word2vec_format(model_path, binary=False, limit=10000)
+print(f"模型加载成功,词向量维度: {model.vector_size}")
 
 # 将文本转换为嵌入向量
 def text_to_embedding(text):
-    words = text.split()
+    words = jieba.lcut(text)  # 使用 jieba 分词
+    print(f"文本: {text}, 分词结果: {words}")
     embeddings = [model[word] for word in words if word in model]
+    print(f"有效词向量数量: {len(embeddings)}")
     if embeddings:
-        return sum(embeddings) / len(embeddings)  # 取词向量的平均值
+        avg_embedding = sum(embeddings) / len(embeddings)
+        print(f"生成的平均向量: {avg_embedding[:5]}...")  # 打印前 5 维
+        return avg_embedding
     else:
-        return [0.0] * model.vector_size  # 如果文本中没有有效词,返回零向量
+        print("未找到有效词,返回零向量")
+        return [0.0] * model.vector_size
 
 # 1. 使用连接池管理 Milvus 连接
 milvus_pool = MilvusConnectionPool(host=MS_HOST, port=MS_PORT, max_connections=MS_MAX_CONNECTIONS)
@@ -45,7 +50,7 @@ search_params = {
     "params": {"nprobe": 100}  # 设置 IVF_FLAT 的 nprobe 参数
 }
 start_time = time.time()
-results = collection_manager.search(current_embedding, search_params, limit=5)  # 返回 5 条结果
+results = collection_manager.search(current_embedding, search_params, limit=2)  # 返回 2 条结果
 end_time = time.time()
 
 # 8. 输出查询结果