From e0867e724432e9a3505bd0adaebee6035eb66c4d Mon Sep 17 00:00:00 2001 From: HuangHai <10402852@qq.com> Date: Thu, 26 Jun 2025 10:51:06 +0800 Subject: [PATCH] 'commit' --- dsRag/Milvus/X1_create_collection.py | 4 +- dsRag/Milvus/X6_InsertMathData.py | 68 ++++++++++++++++++++++++++++ dsRag/Milvus/requirements.txt | 14 ------ 3 files changed, 70 insertions(+), 16 deletions(-) create mode 100644 dsRag/Milvus/X6_InsertMathData.py delete mode 100644 dsRag/Milvus/requirements.txt diff --git a/dsRag/Milvus/X1_create_collection.py b/dsRag/Milvus/X1_create_collection.py index de527cda..aa58e2c7 100644 --- a/dsRag/Milvus/X1_create_collection.py +++ b/dsRag/Milvus/X1_create_collection.py @@ -28,8 +28,8 @@ if utility.has_collection(collection_name): fields = [ FieldSchema(name="id", dtype=DataType.INT64, is_primary=True, auto_id=True), # 主键字段,自动生成 ID FieldSchema(name="person_id", dtype=DataType.VARCHAR, max_length=64), # 会话 ID - FieldSchema(name="user_input", dtype=DataType.VARCHAR, max_length=2048), # 用户问题 - FieldSchema(name="model_response", dtype=DataType.VARCHAR, max_length=2048), # 大模型反馈结果 + FieldSchema(name="user_input", dtype=DataType.VARCHAR, max_length=65535), # 用户问题 + FieldSchema(name="model_response", dtype=DataType.VARCHAR, max_length=65535), # 大模型反馈结果 FieldSchema(name="timestamp", dtype=DataType.VARCHAR, max_length=32), # 时间 FieldSchema(name="embedding", dtype=DataType.FLOAT_VECTOR, dim=MS_DIMENSION) # 向量字段,维度为 200 ] diff --git a/dsRag/Milvus/X6_InsertMathData.py b/dsRag/Milvus/X6_InsertMathData.py new file mode 100644 index 00000000..68fd0e35 --- /dev/null +++ b/dsRag/Milvus/X6_InsertMathData.py @@ -0,0 +1,68 @@ +from Milvus.Config.MulvusConfig import * +from Milvus.Utils.MilvusCollectionManager import MilvusCollectionManager +from Milvus.Utils.MilvusConnectionPool import * +from gensim.models import KeyedVectors +import jieba +import os +import time + +# 1. 
+# Load the pre-trained Word2Vec model
+model_path = MS_MODEL_PATH
+model = KeyedVectors.load_word2vec_format(model_path, binary=False, limit=MS_MODEL_LIMIT)
+print(f"模型加载成功,词向量维度: {model.vector_size}")
+
+
+def text_to_embedding(text):
+    """Convert text into a fixed-size embedding by averaging its word vectors.
+
+    The text is segmented with jieba; tokens that are not in the Word2Vec
+    vocabulary are ignored.
+
+    Args:
+        text: The raw text to embed.
+
+    Returns:
+        A list of ``model.vector_size`` floats: the mean of the known word
+        vectors, or an all-zero vector when no token is in the vocabulary.
+    """
+    words = jieba.lcut(text)
+    embeddings = [model[word] for word in words if word in model]
+    if not embeddings:
+        return [0.0] * model.vector_size
+    # gensim hands back numpy vectors, so their mean is a numpy array; convert
+    # it so both branches return the same type (a plain list of floats).
+    return (sum(embeddings) / len(embeddings)).tolist()
+
+
+# 2. Manage the Milvus connection through the connection pool
+milvus_pool = MilvusConnectionPool(host=MS_HOST, port=MS_PORT, max_connections=MS_MAX_CONNECTIONS)
+connection = milvus_pool.get_connection()
+
+# 3. Initialise the collection manager
+collection_name = MS_COLLECTION_NAME
+collection_manager = MilvusCollectionManager(collection_name)
+
+# 4. Process every .txt file in the Txt/processed_chunks directory
+txt_dir = os.path.join(os.path.dirname(os.path.dirname(os.path.abspath(__file__))), 'Txt', 'processed_chunks')
+
+# Local import: only the per-file ID below needs it.
+import hashlib
+
+# os.listdir() returns entries in arbitrary order; sort so runs are repeatable.
+for filename in sorted(os.listdir(txt_dir)):
+    if not filename.endswith('.txt'):
+        continue
+
+    filepath = os.path.join(txt_dir, filename)
+    with open(filepath, 'r', encoding='utf-8') as f:
+        content = f.read().strip()
+
+    if not content:
+        print(f"跳过空文件: {filename}")
+        continue
+
+    print(f"正在处理文件: {filename}")
+
+    # 5. Current time and a per-file ID.
+    timestamp = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
+    # The built-in hash() of a str is salted per interpreter process, so it
+    # would give the same file a different ID on every run. Use a stable
+    # digest instead; 10 + 32 characters stays within the 64-character
+    # person_id field.
+    file_digest = hashlib.sha256(filename.encode('utf-8')).hexdigest()[:32]
+    person_id = "MATH_DATA_" + file_digest
+
+    # 6. Convert the text into an embedding vector
+    embedding = text_to_embedding(content)
+
+    # 7. Insert the row (one value per column)
+    entities = [
+        [person_id],  # person_id
+        [content],    # user_input
+        [""],         # model_response (left empty)
+        [timestamp],  # timestamp
+        [embedding],  # embedding
+    ]
+    collection_manager.insert_data(entities)
+    print(f"文件 {filename} 数据插入成功")
+
+# 8. 
释放连接 (移出循环外) +milvus_pool.release_connection(connection) +milvus_pool.close() +print("所有文件处理完成") diff --git a/dsRag/Milvus/requirements.txt b/dsRag/Milvus/requirements.txt deleted file mode 100644 index 13a8f357..00000000 --- a/dsRag/Milvus/requirements.txt +++ /dev/null @@ -1,14 +0,0 @@ -# 基础依赖 -gensim==4.3.3 -jieba==0.42.1 -pymilvus==2.5.6 -aiomysql==0.2.0 -numpy==1.23.5 -alibabacloud_imagerecog20190930==2.0.10 -alibabacloud_tea_openapi==0.0.2 -alibabacloud_sts20150401==1.1.4 -alibabacloud_credentials==2.2.1 -python-jose[cryptography]==2.21 -passlib[bcrypt]== 0.6.1 -alibabacloud_iqs20241111==1.1.5 -