You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

68 lines
2.4 KiB

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

from Milvus.Config.MulvusConfig import *
from Milvus.Utils.MilvusCollectionManager import MilvusCollectionManager
from Milvus.Utils.MilvusConnectionPool import *
from gensim.models import KeyedVectors
import jieba
import os
import time
# 1. Load the pre-trained Word2Vec model.
# The file at MS_MODEL_PATH is read in the text (non-binary) word2vec format,
# and MS_MODEL_LIMIT is passed through as the `limit` argument of the loader.
model_path = MS_MODEL_PATH
model = KeyedVectors.load_word2vec_format(
    model_path,
    binary=False,
    limit=MS_MODEL_LIMIT,
)
print(f"模型加载成功,词向量维度: {model.vector_size}")
# Helper: convert a piece of text into an embedding vector.
def text_to_embedding(text):
    """Convert text into an embedding built from its in-vocabulary word vectors.

    The text is segmented with ``jieba.lcut``; tokens that are not present in
    the module-level ``model`` are ignored.

    Args:
        text: The text to embed.

    Returns:
        The sum of the known tokens' vectors divided by their count, or a
        plain list of ``model.vector_size`` zeros (``0.0``) when none of the
        tokens is found in the model.
    """
    known_vectors = []
    for token in jieba.lcut(text):
        if token in model:
            known_vectors.append(model[token])

    if not known_vectors:
        # No token is in the vocabulary: fall back to an all-zero vector.
        # NOTE(review): this branch returns a Python list, whereas the branch
        # below returns whatever type the model's vectors produce.
        return [0.0] * model.vector_size
    return sum(known_vectors) / len(known_vectors)
# 2. Obtain a Milvus connection from the connection pool.
milvus_pool = MilvusConnectionPool(host=MS_HOST, port=MS_PORT, max_connections=MS_MAX_CONNECTIONS)
connection = milvus_pool.get_connection()
try:
    # 3. Initialise the collection manager for the configured collection.
    collection_name = MS_COLLECTION_NAME
    collection_manager = MilvusCollectionManager(collection_name)

    # 4. Process every .txt file in <parent of this script's directory>/Txt/processed_chunks.
    txt_dir = os.path.join(os.path.dirname(os.path.dirname(os.path.abspath(__file__))), 'Txt', 'processed_chunks')
    # os.listdir returns entries in arbitrary order; sort so that files are
    # ingested in a deterministic, reproducible order.
    for filename in sorted(os.listdir(txt_dir)):
        if not filename.endswith('.txt'):
            continue

        filepath = os.path.join(txt_dir, filename)
        with open(filepath, 'r', encoding='utf-8') as f:
            content = f.read().strip()

        # Files that contain only whitespace carry nothing to embed.
        if not content:
            print(f"跳过空文件: {filename}")
            continue

        print(f"正在处理文件: {filename}")

        # 5. Record the current local time and the tags stored with this chunk.
        timestamp = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
        # A plain Python dict is passed; per the original author's note,
        # Milvus converts it to JSON automatically.
        tags = {"tags": ["MATH_DATA_1", "小学数学"]}

        # 6. Convert the file's text into an embedding vector.
        embedding = text_to_embedding(content)

        # 7. Insert one row, laid out as one single-element list per field.
        entities = [
            [tags],       # tags
            [content],    # user_input
            [timestamp],  # timestamp
            [embedding],  # embedding
        ]
        collection_manager.insert_data(entities)
        print(f"文件 {filename} 数据插入成功")
finally:
    # 8. Release the connection and close the pool once, outside the loop.
    # Doing this in `finally` returns the connection even when reading a file
    # or inserting a row raises, instead of leaking it.
    milvus_pool.release_connection(connection)
    milvus_pool.close()

# Reached only when every file was processed without an exception.
print("所有文件处理完成")