From 1c7a99ab595074c2e16473cde1c80dec2dc994c2 Mon Sep 17 00:00:00 2001
From: HuangHai <10402852@qq.com>
Date: Mon, 24 Mar 2025 10:20:16 +0800
Subject: [PATCH] 'commit'

---
 AI/WxMini/Milvus/T1_create_collection.py      |  3 +-
 AI/WxMini/Milvus/T2_create_index.py           |  1 -
 AI/WxMini/Milvus/T3_insert_data.py            | 52 ++++++++-----------
 ...4_search_data.py => T4_select_all_data.py} |  2 +-
 AI/WxMini/Milvus/T5_search_near_word.py       | 39 ++++++++++----
 AI/WxMini/安装.txt                          |  1 +
 6 files changed, 54 insertions(+), 44 deletions(-)
 rename AI/WxMini/Milvus/{T4_search_data.py => T4_select_all_data.py} (95%)
 create mode 100644 AI/WxMini/安装.txt

diff --git a/AI/WxMini/Milvus/T1_create_collection.py b/AI/WxMini/Milvus/T1_create_collection.py
index be3b4013..3e3fdce8 100644
--- a/AI/WxMini/Milvus/T1_create_collection.py
+++ b/AI/WxMini/Milvus/T1_create_collection.py
@@ -1,3 +1,4 @@
+# pip install sentence-transformers
 from pymilvus import FieldSchema, DataType, utility
 
 from WxMini.Milvus.Config.MulvusConfig import *
@@ -24,7 +25,7 @@ if utility.has_collection(collection_name):
 fields = [
     FieldSchema(name="id", dtype=DataType.INT64, is_primary=True, auto_id=True),  # 主键字段，自动生成 ID
     FieldSchema(name="text", dtype=DataType.VARCHAR, max_length=500),  # 存储对话文本
-    FieldSchema(name="embedding", dtype=DataType.FLOAT_VECTOR, dim=128)  # 向量字段，维度为 128
+    FieldSchema(name="embedding", dtype=DataType.FLOAT_VECTOR, dim=200)  # 向量字段，维度为 200
 ]
 schema_description = "Simple demo collection"
 
diff --git a/AI/WxMini/Milvus/T2_create_index.py b/AI/WxMini/Milvus/T2_create_index.py
index 57417ec9..bfc820c3 100644
--- a/AI/WxMini/Milvus/T2_create_index.py
+++ b/AI/WxMini/Milvus/T2_create_index.py
@@ -20,7 +20,6 @@ index_params = {
     "params": {"nlist": 128}   # 设置 IVF_FLAT 的 nlist 参数
 }
 collection_manager.create_index("embedding", index_params)
-print("索引创建成功。")
 
 # 5. 释放连接
 milvus_pool.release_connection(connection)
diff --git a/AI/WxMini/Milvus/T3_insert_data.py b/AI/WxMini/Milvus/T3_insert_data.py
index 67a157f5..16b844d9 100644
--- a/AI/WxMini/Milvus/T3_insert_data.py
+++ b/AI/WxMini/Milvus/T3_insert_data.py
@@ -1,8 +1,20 @@
-import random
-
 from WxMini.Milvus.Config.MulvusConfig import *
 from WxMini.Milvus.Utils.MilvusCollectionManager import MilvusCollectionManager
 from WxMini.Milvus.Utils.MilvusConnectionPool import *
+from gensim.models import KeyedVectors
+
+# 加载预训练的 Word2Vec 模型
+model_path = "D:/Tencent_AILab_ChineseEmbedding/Tencent_AILab_ChineseEmbedding.txt"  # 替换为你的 Word2Vec 模型路径
+model = KeyedVectors.load_word2vec_format(model_path, binary=False, limit=10000)
+
+# 将文本转换为嵌入向量
+def text_to_embedding(text):
+    """Return the mean vector (a list of floats) of the whitespace-separated tokens of `text` found in `model`; all zeros if none match."""
+    # NOTE(review): str.split() only splits on whitespace, so an unsegmented Chinese sentence stays a single token - a word segmenter is probably needed; confirm.
+    vectors = [model[token] for token in text.split() if token in model]
+    if not vectors:
+        return [0.0] * model.vector_size  # no known token: zero vector of the model's dimensionality
+    return (sum(vectors) / len(vectors)).tolist()  # plain list, same type as the zero-vector branch
 
 # 1. 使用连接池管理 Milvus 连接
 milvus_pool = MilvusConnectionPool(host=MS_HOST, port=MS_PORT, max_connections=MS_MAX_CONNECTIONS)
@@ -17,41 +29,19 @@ collection_manager = MilvusCollectionManager(collection_name)
 # 4. 插入数据
 texts = [
     "我今天心情不太好，因为工作压力很大。",  # 第一个对话文本
-    "我最近在学习 Python，感觉很有趣。",   # 第二个对话文本
-    "我打算周末去爬山，放松一下。"         # 第三个对话文本
-]
-embeddings = [
-    [random.random() for _ in range(128)],  # 第一个 128 维向量
-    [random.random() for _ in range(128)],  # 第二个 128 维向量
-    [random.random() for _ in range(128)]   # 第三个 128 维向量
+    "我最近在学习 Python，感觉很有趣。",  # 第二个对话文本
+    "我打算周末去爬山，放松一下。",  # 第三个对话文本
+    "吉林省广告产业园是东师理想的办公地点。"  # 第四个对话文本
 ]
+embeddings = [text_to_embedding(text) for text in texts]  # 使用文本模型生成向量
 
-# 插入数据，确保字段顺序与集合定义一致
+# 5. 插入数据，确保字段顺序与集合定义一致
 entities = [texts, embeddings]  # 第一个列表是 text 字段，第二个列表是 embedding 字段
 collection_manager.insert_data(entities)
 print("数据插入成功。")
 
-# 5. 加载集合到内存
-collection_manager.load_collection()
-
-# 6. 查询数据，验证插入是否成功
-query_vector = [random.random() for _ in range(128)]  # 随机生成一个查询向量
-search_params = {
-    "metric_type": "L2",       # 使用 L2 距离度量方式
-    "params": {"nprobe": 10}   # 设置 IVF_FLAT 的 nprobe 参数
-}
-results = collection_manager.search(query_vector, search_params, limit=2)
-print("查询结果：")
-if results:
-    for hits in results:
-        for hit in hits:
-            text = collection_manager.query_text_by_id(hit.id)
-            print(f"ID: {hit.id}, Text: {text}, Distance: {hit.distance}")
-else:
-    print("未找到相关数据，请检查查询参数或数据。")
-
-# 7. 释放连接
+# 6. 释放连接
 milvus_pool.release_connection(connection)
 
-# 8. 关闭连接池
+# 7. 关闭连接池
 milvus_pool.close()
\ No newline at end of file
diff --git a/AI/WxMini/Milvus/T4_search_data.py b/AI/WxMini/Milvus/T4_select_all_data.py
similarity index 95%
rename from AI/WxMini/Milvus/T4_search_data.py
rename to AI/WxMini/Milvus/T4_select_all_data.py
index 3f84f22a..a174ba40 100644
--- a/AI/WxMini/Milvus/T4_search_data.py
+++ b/AI/WxMini/Milvus/T4_select_all_data.py
@@ -24,7 +24,7 @@ search_params = {
     "metric_type": "L2",       # 使用 L2 距离度量方式
     "params": {"nprobe": 10}   # 设置 IVF_FLAT 的 nprobe 参数
 }
-results = collection_manager.search(query_vector, search_params, limit=20)
+results = collection_manager.search(query_vector, search_params, limit=200)
 print("查询结果：")
 if results:
     for hits in results:
diff --git a/AI/WxMini/Milvus/T5_search_near_word.py b/AI/WxMini/Milvus/T5_search_near_word.py
index 354f9afd..0f129b4b 100644
--- a/AI/WxMini/Milvus/T5_search_near_word.py
+++ b/AI/WxMini/Milvus/T5_search_near_word.py
@@ -1,8 +1,25 @@
-import numpy as np
+# pip install gensim
 import time
 from WxMini.Milvus.Utils.MilvusCollectionManager import MilvusCollectionManager
 from WxMini.Milvus.Utils.MilvusConnectionPool import *
 from WxMini.Milvus.Config.MulvusConfig import *
+from gensim.models import KeyedVectors
+# 加载预训练的 Word2Vec 模型
+model_path = "D:/Tencent_AILab_ChineseEmbedding/Tencent_AILab_ChineseEmbedding.txt"  # 替换为你的 Word2Vec 模型路径
+# 参考文档：使用gensim之KeyedVectors操作词向量模型
+# https://www.cnblogs.com/bill-h/p/14655224.html
+# 读取词向量模型（限定前10000个词）
+model = KeyedVectors.load_word2vec_format(model_path, binary=False, limit=10000)
+
+# 将文本转换为嵌入向量
+def text_to_embedding(text):
+    """Return the mean vector (a list of floats) of the whitespace-separated tokens of `text` found in `model`; all zeros if none match."""
+    # NOTE(review): str.split() only splits on whitespace, so an unsegmented Chinese sentence stays a single token - a word segmenter is probably needed; confirm.
+    vectors = [model[token] for token in text.split() if token in model]
+    if not vectors:
+        return [0.0] * model.vector_size  # no known token: zero vector of the model's dimensionality
+    return (sum(vectors) / len(vectors)).tolist()  # plain list, same type as the zero-vector branch
+
 # 1. 使用连接池管理 Milvus 连接
 milvus_pool = MilvusConnectionPool(host=MS_HOST, port=MS_PORT, max_connections=MS_MAX_CONNECTIONS)
 
@@ -16,20 +33,22 @@ collection_manager = MilvusCollectionManager(collection_name)
 # 4. 加载集合到内存
 collection_manager.load_collection()
 
-# 5. 模拟当前对话的嵌入向量
-current_embedding = np.random.random(128).tolist()  # 随机生成一个 128 维向量
+# 5. 输入一句话
+input_text = input("请输入一句话：")  # 例如：“我今天心情不太好”
+
+# 6. 将文本转换为嵌入向量
+current_embedding = text_to_embedding(input_text)
 
-# 6. 查询与当前对话最相关的历史对话
+# 7. 查询与当前对话最相关的历史对话
 search_params = {
     "metric_type": "L2",       # 使用 L2 距离度量方式
     "params": {"nprobe": 100}  # 设置 IVF_FLAT 的 nprobe 参数
 }
 start_time = time.time()
-results = collection_manager.search(current_embedding, search_params, limit=2)
+results = collection_manager.search(current_embedding, search_params, limit=5)  # 返回 5 条结果
 end_time = time.time()
 
-# 7. 输出查询结果
-#print("当前对话的嵌入向量:", current_embedding)
+# 8. 输出查询结果
 print("最相关的历史对话:")
 if results:
     for hits in results:
@@ -42,11 +61,11 @@ if results:
 else:
     print("未找到相关历史对话，请检查查询参数或数据。")
 
-# 8. 输出查询耗时
+# 9. 输出查询耗时
 print(f"查询耗时: {end_time - start_time:.4f} 秒")
 
-# 9. 释放连接
+# 10. 释放连接
 milvus_pool.release_connection(connection)
 
-# 10. 关闭连接池
+# 11. 关闭连接池
 milvus_pool.close()
\ No newline at end of file
diff --git a/AI/WxMini/安装.txt b/AI/WxMini/安装.txt
new file mode 100644
index 00000000..fbd1afab
--- /dev/null
+++ b/AI/WxMini/安装.txt
@@ -0,0 +1 @@
+# 腾讯 AI Lab 的中文词向量
\ No newline at end of file