from Config.Config import *
from Backup.Milvus.Utils.MilvusCollectionManager import MilvusCollectionManager
from gensim.models import KeyedVectors
import jieba
import os
import time

# Tags attached to every record inserted by this script.
selectedTags = ["CHINESE_DATA_1", "高中语文文言文"]

# 1. Load the pre-trained Word2Vec model (text format, capped at MS_MODEL_LIMIT entries).
model_path = MS_MODEL_PATH
model = KeyedVectors.load_word2vec_format(model_path, binary=False, limit=MS_MODEL_LIMIT)
print(f"模型加载成功,词向量维度: {model.vector_size}")


def text_to_embedding(text):
    """Convert *text* into a single embedding vector.

    The text is segmented with jieba; every token present in the loaded
    Word2Vec model contributes its vector, and the result is the average
    (sum divided by count) of those vectors.

    Args:
        text: The string to embed.

    Returns:
        The averaged word vector, or a list of ``model.vector_size`` zeros
        when none of the tokens are in the model's vocabulary.
    """
    words = jieba.lcut(text)
    embeddings = [model[word] for word in words if word in model]
    if embeddings:
        return sum(embeddings) / len(embeddings)
    # NOTE(review): this fallback is a plain list while the branch above
    # returns whatever `sum(vectors) / n` yields -- confirm insert_data
    # accepts both forms.
    return [0.0] * model.vector_size


# 2. Manage the Milvus connection through a connection pool.
# NOTE(review): MilvusConnectionPool is not imported explicitly in this file;
# presumably it is provided by the star import above -- confirm.
milvus_pool = MilvusConnectionPool(host=MS_HOST, port=MS_PORT, max_connections=MS_MAX_CONNECTIONS)
try:
    connection = milvus_pool.get_connection()
    try:
        # 3. Initialise the collection manager.
        collection_name = MS_COLLECTION_NAME
        collection_manager = MilvusCollectionManager(collection_name)

        # 4. Process every .txt file in <parent of this file's dir>/Txt/processed_chunks.
        txt_dir = os.path.join(
            os.path.dirname(os.path.dirname(os.path.abspath(__file__))),
            'Txt',
            'processed_chunks',
        )
        for filename in os.listdir(txt_dir):
            if not filename.endswith('.txt'):
                continue

            filepath = os.path.join(txt_dir, filename)
            with open(filepath, 'r', encoding='utf-8') as f:
                # Only the first line is used to compute the vector.
                first_line = f.readline().strip()
                # The full content is stored alongside for later queries.
                full_content = first_line + '\n' + f.read()

            if not first_line:
                print(f"跳过空文件: {filename}")
                continue

            print(f"正在处理文件: {filename}")

            # 5. Capture the current local time and build the tags payload,
            #    which also carries the complete file content.
            timestamp = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
            tags = {"tags": selectedTags, "full_content": full_content}

            # 6. Convert the first line into an embedding vector.
            embedding = text_to_embedding(first_line)

            # 7. Insert the record (one single-element list per field).
            entities = [
                [tags],        # tags
                [first_line],  # user_input
                [timestamp],   # timestamp
                [embedding],   # embedding
            ]
            collection_manager.insert_data(entities)
            print(f"文件 {filename} 数据插入成功")
    finally:
        # 8. Always hand the connection back, even if processing failed midway.
        milvus_pool.release_connection(connection)
finally:
    # Always shut the pool down so a failure does not leave it open.
    milvus_pool.close()

print("所有文件处理完成")