'commit'

4 months ago · 1c7a99ab59
parent 94d7fb817d
commit 1c7a99ab59
6 changed files with 54 additions and 44 deletions
--- a/AI/WxMini/Milvus/T1_create_collection.py
+++ b/AI/WxMini/Milvus/T1_create_collection.py
@@ -1,3 +1,4 @@
+# pip install sentence-transformers
 from pymilvus import FieldSchema, DataType, utility

 from WxMini.Milvus.Config.MulvusConfig import *
@@ -24,7 +25,7 @@ if utility.has_collection(collection_name):
 fields = [
    FieldSchema(name="id", dtype=DataType.INT64, is_primary=True, auto_id=True),  # 主键字段，自动生成 ID
    FieldSchema(name="text", dtype=DataType.VARCHAR, max_length=500),  # 存储对话文本
-    FieldSchema(name="embedding", dtype=DataType.FLOAT_VECTOR, dim=128)  # 向量字段，维度为 128
+    FieldSchema(name="embedding", dtype=DataType.FLOAT_VECTOR, dim=200)  # 向量字段，维度为 200
 ]
 schema_description = "Simple demo collection"

--- a/AI/WxMini/Milvus/T2_create_index.py
+++ b/AI/WxMini/Milvus/T2_create_index.py
@@ -20,7 +20,6 @@ index_params = {
    "params": {"nlist": 128}   # 设置 IVF_FLAT 的 nlist 参数
 }
 collection_manager.create_index("embedding", index_params)
-print("索引创建成功。")

 # 5. 释放连接
 milvus_pool.release_connection(connection)
--- a/AI/WxMini/Milvus/T3_insert_data.py
+++ b/AI/WxMini/Milvus/T3_insert_data.py
@@ -1,8 +1,20 @@
-import random
-
 from WxMini.Milvus.Config.MulvusConfig import *
 from WxMini.Milvus.Utils.MilvusCollectionManager import MilvusCollectionManager
 from WxMini.Milvus.Utils.MilvusConnectionPool import *
+from gensim.models import KeyedVectors
+
+# 加载预训练的 Word2Vec 模型
+model_path = "D:/Tencent_AILab_ChineseEmbedding/Tencent_AILab_ChineseEmbedding.txt"  # 替换为你的 Word2Vec 模型路径
+model = KeyedVectors.load_word2vec_format(model_path, binary=False, limit=10000)
+
+# Convert a text string into one embedding vector by averaging the vectors of its tokens.
+def text_to_embedding(text):
+    words = text.split()  # NOTE(review): split() breaks on whitespace only, so unspaced Chinese text stays a single token - confirm this is intended
+    embeddings = [model[word] for word in words if word in model]  # keep only tokens found in the loaded model
+    if embeddings:
+        return sum(embeddings) / len(embeddings)  # mean of the collected word vectors
+    else:
+        return [0.0] * model.vector_size  # no token was found in the model: return a list of zeros of length model.vector_size

 # 1. 使用连接池管理 Milvus 连接
 milvus_pool = MilvusConnectionPool(host=MS_HOST, port=MS_PORT, max_connections=MS_MAX_CONNECTIONS)
@@ -17,41 +29,19 @@ collection_manager = MilvusCollectionManager(collection_name)
 # 4. 插入数据
 texts = [
    "我今天心情不太好，因为工作压力很大。",  # 第一个对话文本
-    "我最近在学习 Python，感觉很有趣。",   # 第二个对话文本
-    "我打算周末去爬山，放松一下。"         # 第三个对话文本
-]
-embeddings = [
-    [random.random() for _ in range(128)],  # 第一个 128 维向量
-    [random.random() for _ in range(128)],  # 第二个 128 维向量
-    [random.random() for _ in range(128)]   # 第三个 128 维向量
+    "我最近在学习 Python，感觉很有趣。",  # 第二个对话文本
+    "我打算周末去爬山，放松一下。",  # 第三个对话文本
+    "吉林省广告产业园是东师理想的办公地点。"  # 第四个对话文本
 ]
+embeddings = [text_to_embedding(text) for text in texts]  # 使用文本模型生成向量

-# 插入数据，确保字段顺序与集合定义一致
+# 5. 插入数据，确保字段顺序与集合定义一致
 entities = [texts, embeddings]  # 第一个列表是 text 字段，第二个列表是 embedding 字段
 collection_manager.insert_data(entities)
 print("数据插入成功。")

-# 5. 加载集合到内存
-collection_manager.load_collection()
-
-# 6. 查询数据，验证插入是否成功
-query_vector = [random.random() for _ in range(128)]  # 随机生成一个查询向量
-search_params = {
-    "metric_type": "L2",       # 使用 L2 距离度量方式
-    "params": {"nprobe": 10}   # 设置 IVF_FLAT 的 nprobe 参数
-}
-results = collection_manager.search(query_vector, search_params, limit=2)
-print("查询结果：")
-if results:
-    for hits in results:
-        for hit in hits:
-            text = collection_manager.query_text_by_id(hit.id)
-            print(f"ID: {hit.id}, Text: {text}, Distance: {hit.distance}")
-else:
-    print("未找到相关数据，请检查查询参数或数据。")
-
-# 7. 释放连接
+# 6. 释放连接
 milvus_pool.release_connection(connection)

-# 8. 关闭连接池
+# 7. 关闭连接池
 milvus_pool.close()
--- a/AI/WxMini/Milvus/T4_select_all_data.py
+++ b/AI/WxMini/Milvus/T4_select_all_data.py
@@ -24,7 +24,7 @@ search_params = {
    "metric_type": "L2",       # 使用 L2 距离度量方式
    "params": {"nprobe": 10}   # 设置 IVF_FLAT 的 nprobe 参数
 }
-results = collection_manager.search(query_vector, search_params, limit=20)
+results = collection_manager.search(query_vector, search_params, limit=200)
 print("查询结果：")
 if results:
    for hits in results:
--- a/AI/WxMini/Milvus/T5_search_near_word.py
+++ b/AI/WxMini/Milvus/T5_search_near_word.py
@@ -1,8 +1,25 @@
-import numpy as np
+# pip install gensim
 import time
 from WxMini.Milvus.Utils.MilvusCollectionManager import MilvusCollectionManager
 from WxMini.Milvus.Utils.MilvusConnectionPool import *
 from WxMini.Milvus.Config.MulvusConfig import *
+from gensim.models import KeyedVectors
+# 加载预训练的 Word2Vec 模型
+model_path = "D:/Tencent_AILab_ChineseEmbedding/Tencent_AILab_ChineseEmbedding.txt"  # 替换为你的 Word2Vec 模型路径
+# 参考文档：使用gensim之KeyedVectors操作词向量模型
+# https://www.cnblogs.com/bill-h/p/14655224.html
+# 读取词向量模型（限定前10000个词）
+model = KeyedVectors.load_word2vec_format(model_path, binary=False, limit=10000)
+
+# Convert a text string into one embedding vector by averaging the vectors of its tokens.
+def text_to_embedding(text):
+    words = text.split()  # NOTE(review): split() breaks on whitespace only, so an unspaced Chinese sentence stays a single token - confirm this is intended
+    embeddings = [model[word] for word in words if word in model]  # keep only tokens found in the loaded model
+    if embeddings:
+        return sum(embeddings) / len(embeddings)  # mean of the collected word vectors
+    else:
+        return [0.0] * model.vector_size  # no token was found in the model: return a list of zeros of length model.vector_size
+
 # 1. 使用连接池管理 Milvus 连接
 milvus_pool = MilvusConnectionPool(host=MS_HOST, port=MS_PORT, max_connections=MS_MAX_CONNECTIONS)

@@ -16,20 +33,22 @@ collection_manager = MilvusCollectionManager(collection_name)
 # 4. 加载集合到内存
 collection_manager.load_collection()

-# 5. 模拟当前对话的嵌入向量
-current_embedding = np.random.random(128).tolist()  # 随机生成一个 128 维向量
+# 5. 输入一句话
+input_text = input("请输入一句话：")  # 例如：“我今天心情不太好”
+
+# 6. 将文本转换为嵌入向量
+current_embedding = text_to_embedding(input_text)

-# 6. 查询与当前对话最相关的历史对话
+# 7. 查询与当前对话最相关的历史对话
 search_params = {
    "metric_type": "L2",       # 使用 L2 距离度量方式
    "params": {"nprobe": 100}  # 设置 IVF_FLAT 的 nprobe 参数
 }
 start_time = time.time()
-results = collection_manager.search(current_embedding, search_params, limit=2)
+results = collection_manager.search(current_embedding, search_params, limit=5)  # 返回 5 条结果
 end_time = time.time()

-# 7. 输出查询结果
-#print("当前对话的嵌入向量:", current_embedding)
+# 8. 输出查询结果
 print("最相关的历史对话:")
 if results:
    for hits in results:
@@ -42,11 +61,11 @@ if results:
 else:
    print("未找到相关历史对话，请检查查询参数或数据。")

-# 8. 输出查询耗时
+# 9. 输出查询耗时
 print(f"查询耗时: {end_time - start_time:.4f} 秒")

-# 9. 释放连接
+# 10. 释放连接
 milvus_pool.release_connection(connection)

-# 10. 关闭连接池
+# 11. 关闭连接池
 milvus_pool.close()
--- a/AI/WxMini/安装.txt
+++ b/AI/WxMini/安装.txt
@ -0,0 +1 @@
+# 腾讯 AI Lab 的中文词向量