'commit'

4 months ago · b4535e3faa
parent d1adc7dff1
commit b4535e3faa
1 changed file with 13 additions and 8 deletions
--- a/AI/WxMini/Milvus/T5_search_near_word.py
+++ b/AI/WxMini/Milvus/T5_search_near_word.py
@@ -1,24 +1,29 @@
-# pip install gensim
+# pip install gensim jieba
 import time
+import jieba  # 导入 jieba 分词库
 from WxMini.Milvus.Utils.MilvusCollectionManager import MilvusCollectionManager
 from WxMini.Milvus.Utils.MilvusConnectionPool import *
 from WxMini.Milvus.Config.MulvusConfig import *
 from gensim.models import KeyedVectors
+
 # 加载预训练的 Word2Vec 模型
 model_path = "D:/Tencent_AILab_ChineseEmbedding/Tencent_AILab_ChineseEmbedding.txt"  # 替换为你的 Word2Vec 模型路径
-# 参考文档：使用gensim之KeyedVectors操作词向量模型
-# https://www.cnblogs.com/bill-h/p/14655224.html
-# 读取词向量模型（限定前10000个词）
 model = KeyedVectors.load_word2vec_format(model_path, binary=False, limit=10000)
+print(f"模型加载成功，词向量维度: {model.vector_size}")

 # 将文本转换为嵌入向量
 def text_to_embedding(text):
-    words = text.split()
+    words = jieba.lcut(text)  # 使用 jieba 分词
+    print(f"文本: {text}, 分词结果: {words}")
    embeddings = [model[word] for word in words if word in model]
+    print(f"有效词向量数量: {len(embeddings)}")
    if embeddings:
-        return sum(embeddings) / len(embeddings)  # 取词向量的平均值
+        avg_embedding = sum(embeddings) / len(embeddings)
+        print(f"生成的平均向量: {avg_embedding[:5]}...")  # 打印前 5 维
+        return avg_embedding
    else:
-        return [0.0] * model.vector_size  # 如果文本中没有有效词，返回零向量
+        print("未找到有效词，返回零向量")
+        return [0.0] * model.vector_size

 # 1. 使用连接池管理 Milvus 连接
 milvus_pool = MilvusConnectionPool(host=MS_HOST, port=MS_PORT, max_connections=MS_MAX_CONNECTIONS)
@@ -45,7 +50,7 @@ search_params = {
    "params": {"nprobe": 100}  # 设置 IVF_FLAT 的 nprobe 参数
 }
 start_time = time.time()
-results = collection_manager.search(current_embedding, search_params, limit=5)  # 返回 5 条结果
+results = collection_manager.search(current_embedding, search_params, limit=2)  # 返回 2 条结果
 end_time = time.time()

 # 8. 输出查询结果