|
|
|
@ -1,11 +1,21 @@
|
|
|
|
|
import logging
|
|
|
|
|
import os
|
|
|
|
|
from logging.handlers import RotatingFileHandler
|
|
|
|
|
|
|
|
|
|
import jieba
|
|
|
|
|
from elasticsearch import Elasticsearch
|
|
|
|
|
from gensim.models import KeyedVectors
|
|
|
|
|
|
|
|
|
|
from Config.Config import MS_MODEL_PATH, MS_MODEL_LIMIT
|
|
|
|
|
|
|
|
|
|
# Set up file logging for this module.
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
# Make sure the log directory exists before the handler opens its file.
os.makedirs('Logs', exist_ok=True)
# Rotate at 1 MiB and keep 5 backups. The encoding is pinned to UTF-8 because
# this module logs non-ASCII messages; relying on the platform's locale
# encoding can garble or drop those records.
handler = RotatingFileHandler(
    'Logs/start.log', maxBytes=1024 * 1024, backupCount=5, encoding='utf-8'
)
handler.setFormatter(logging.Formatter('%(asctime)s - %(levelname)s - %(message)s'))
logger.addHandler(handler)
|
|
|
|
|
|
|
|
|
|
class EsSearchUtil:
|
|
|
|
|
|
|
|
|
|
def __init__(self, es_config):
|
|
|
|
|
"""
|
|
|
|
|
初始化Elasticsearch搜索工具
|
|
|
|
@ -18,15 +28,30 @@ class EsSearchUtil:
|
|
|
|
|
verify_certs=False
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
def text_to_embedding(self, text):
|
|
|
|
|
def __init__(self, es_config):
    """Load the word-vector model and open the Elasticsearch connection.

    Args:
        es_config: Mapping that must provide the keys 'hosts',
            'basic_auth' and 'index_name'.

    Raises:
        KeyError: If one of the required keys is missing from es_config.
    """
    # Load the pretrained vectors exactly once and keep them on the instance
    # so text_to_embedding can reuse them instead of reloading per call.
    # KeyedVectors, MS_MODEL_PATH and MS_MODEL_LIMIT come from the
    # module-level imports.
    self.model = KeyedVectors.load_word2vec_format(
        MS_MODEL_PATH, binary=False, limit=MS_MODEL_LIMIT
    )
    logger.info(f"模型加载成功,词向量维度: {self.model.vector_size}")

    # Open the Elasticsearch connection.
    # NOTE(review): verify_certs=False disables TLS certificate checks;
    # kept as in the original, but confirm this is intended outside of
    # development environments.
    self.es = Elasticsearch(
        hosts=es_config['hosts'],
        basic_auth=es_config['basic_auth'],
        verify_certs=False
    )
    self.index_name = es_config['index_name']
|
|
|
|
|
|
|
|
|
|
def text_to_embedding(self, text):
|
|
|
|
|
# 使用已加载的模型
|
|
|
|
|
# 对文本分词并计算平均向量
|
|
|
|
|
words = jieba.lcut(text)
|
|
|
|
|
vectors = [model[word] for word in words if word in model]
|
|
|
|
|
vectors = [self.model[word] for word in words if word in self.model]
|
|
|
|
|
|
|
|
|
|
if not vectors:
|
|
|
|
|
return [0.0] * model.vector_size
|
|
|
|
|
return [0.0] * self.model.vector_size
|
|
|
|
|
|
|
|
|
|
# 计算平均向量
|
|
|
|
|
avg_vector = [sum(dim)/len(vectors) for dim in zip(*vectors)]
|
|
|
|
|