From d1adc7dff1494f532dc58aaa1058b1c3ca084ae9 Mon Sep 17 00:00:00 2001 From: HuangHai <10402852@qq.com> Date: Mon, 24 Mar 2025 10:25:55 +0800 Subject: [PATCH] 'commit' --- AI/WxMini/Milvus/T3_insert_data.py | 17 ++++++++++-- AI/WxMini/Milvus/T4_select_all_data.py | 38 ++++++++++++++------------ 2 files changed, 35 insertions(+), 20 deletions(-) diff --git a/AI/WxMini/Milvus/T3_insert_data.py b/AI/WxMini/Milvus/T3_insert_data.py index 16b844d9..ad181a6a 100644 --- a/AI/WxMini/Milvus/T3_insert_data.py +++ b/AI/WxMini/Milvus/T3_insert_data.py @@ -2,19 +2,26 @@ from WxMini.Milvus.Config.MulvusConfig import * from WxMini.Milvus.Utils.MilvusCollectionManager import MilvusCollectionManager from WxMini.Milvus.Utils.MilvusConnectionPool import * from gensim.models import KeyedVectors +import jieba # 导入 jieba 分词库 # 加载预训练的 Word2Vec 模型 model_path = "D:/Tencent_AILab_ChineseEmbedding/Tencent_AILab_ChineseEmbedding.txt" # 替换为你的 Word2Vec 模型路径 model = KeyedVectors.load_word2vec_format(model_path, binary=False, limit=10000) +print(f"模型加载成功,词向量维度: {model.vector_size}") # 将文本转换为嵌入向量 def text_to_embedding(text): - words = text.split() + words = jieba.lcut(text) # 使用 jieba 分词 + print(f"文本: {text}, 分词结果: {words}") embeddings = [model[word] for word in words if word in model] + print(f"有效词向量数量: {len(embeddings)}") if embeddings: - return sum(embeddings) / len(embeddings) # 取词向量的平均值 + avg_embedding = sum(embeddings) / len(embeddings) + print(f"生成的平均向量: {avg_embedding[:5]}...") # 打印前 5 维 + return avg_embedding else: - return [0.0] * model.vector_size # 如果文本中没有有效词,返回零向量 + print("未找到有效词,返回零向量") + return [0.0] * model.vector_size # 1. 使用连接池管理 Milvus 连接 milvus_pool = MilvusConnectionPool(host=MS_HOST, port=MS_PORT, max_connections=MS_MAX_CONNECTIONS) @@ -35,6 +42,10 @@ texts = [ ] embeddings = [text_to_embedding(text) for text in texts] # 使用文本模型生成向量 +# 打印生成的向量值 +for text, embedding in zip(texts, embeddings): + print(f"文本: {text}, 向量: {embedding[:5]}...") # 打印前 5 维 + # 5. 插入数据,确保字段顺序与集合定义一致 entities = [texts, embeddings] # 第一个列表是 text 字段,第二个列表是 embedding 字段 collection_manager.insert_data(entities) diff --git a/AI/WxMini/Milvus/T4_select_all_data.py b/AI/WxMini/Milvus/T4_select_all_data.py index a174ba40..ce7228c1 100644 --- a/AI/WxMini/Milvus/T4_select_all_data.py +++ b/AI/WxMini/Milvus/T4_select_all_data.py @@ -1,5 +1,3 @@ -import random - from WxMini.Milvus.Utils.MilvusCollectionManager import MilvusCollectionManager from WxMini.Milvus.Utils.MilvusConnectionPool import * from WxMini.Milvus.Config.MulvusConfig import * @@ -18,21 +16,27 @@ collection_manager = MilvusCollectionManager(collection_name) collection_manager.load_collection() print(f"集合 '{collection_name}' 已加载到内存。") -# 5. 查询数据 -query_vector = [random.random() for _ in range(128)] # 随机生成一个查询向量 -search_params = { - "metric_type": "L2", # 使用 L2 距离度量方式 - "params": {"nprobe": 10} # 设置 IVF_FLAT 的 nprobe 参数 -} -results = collection_manager.search(query_vector, search_params, limit=200) -print("查询结果:") -if results: - for hits in results: - for hit in hits: - text = collection_manager.query_text_by_id(hit.id) # 获取 text 字段 - print(f"ID: {hit.id}, Text: {text}, Distance: {hit.distance}") -else: - print("未找到相关数据,请检查查询参数或数据。") +# 5. 查询所有数据 +try: + # 使用 Milvus 的 query 方法查询所有数据 + results = collection_manager.collection.query( + expr="", # 空表达式表示查询所有数据 + output_fields=["id", "text", "embedding"], # 指定返回的字段 + limit=1000 # 设置最大返回记录数 + ) + print("查询结果:") + if results: + for result in results: + try: + text = result["text"] # 获取 text 字段 + embedding = result["embedding"] # 获取 embedding 字段 + print(f"ID: {result['id']}, Text: {text}, Embedding: {embedding[:5]}...") # 只打印前 5 维向量 + except Exception as e: + print(f"查询失败: {e}") + else: + print("未找到相关数据,请检查查询参数或数据。") +except Exception as e: + print(f"查询失败: {e}") # 6. 释放连接 milvus_pool.release_connection(connection)