main
HuangHai 4 weeks ago
parent dc312b0852
commit c9bf55b7fe

@ -2,9 +2,7 @@ import logging
import os
from logging.handlers import RotatingFileHandler
from gensim.models import KeyedVectors
from Config.Config import ES_CONFIG, MS_MODEL_PATH, MS_MODEL_LIMIT
from Config.Config import ES_CONFIG
from ElasticSearch.Utils.ElasticsearchConnectionPool import ElasticsearchConnectionPool
# 初始化日志
@ -16,11 +14,6 @@ handler = RotatingFileHandler('Logs/start.log', maxBytes=1024 * 1024, backupCoun
handler.setFormatter(logging.Formatter('%(asctime)s - %(levelname)s - %(message)s'))
logger.addHandler(handler)
# 1. 加载预训练的 Word2Vec 模型
model = KeyedVectors.load_word2vec_format(MS_MODEL_PATH, binary=False, limit=MS_MODEL_LIMIT)
logger.info(f"模型加载成功,词向量维度: {model.vector_size}")
def init_es_pool():
# 初始化Elasticsearch连接池

@ -1,11 +1,21 @@
import logging
import os
from logging.handlers import RotatingFileHandler
import jieba
from elasticsearch import Elasticsearch
from gensim.models import KeyedVectors
from Config.Config import MS_MODEL_PATH, MS_MODEL_LIMIT
# Logging setup for this module.
# Create the log directory first so the file handler can open its file.
os.makedirs('Logs', exist_ok=True)
handler = RotatingFileHandler(
    filename='Logs/start.log',
    maxBytes=1024 * 1024,  # rotate once the file reaches 1 MiB
    backupCount=5,         # keep up to five rotated files
)
handler.setFormatter(
    logging.Formatter(fmt='%(asctime)s - %(levelname)s - %(message)s')
)
# Module logger: INFO and above go to the rotating file.
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
logger.addHandler(handler)
class EsSearchUtil:
def __init__(self, es_config):
"""
初始化Elasticsearch搜索工具
@ -18,15 +28,30 @@ class EsSearchUtil:
verify_certs=False
)
def text_to_embedding(self, text):
def __init__(self, es_config):
    """
    Load the word-vector model and open the Elasticsearch connection.

    The model is loaded exactly once and stored on the instance as
    ``self.model`` so that text_to_embedding can reuse it.

    :param es_config: mapping that provides the 'hosts', 'basic_auth' and
        'index_name' entries read below.
    """
    # Load the pre-trained vectors once (text format, reading at most
    # MS_MODEL_LIMIT vectors). KeyedVectors, MS_MODEL_PATH and MS_MODEL_LIMIT
    # come from the module-level imports, so no local import is needed.
    self.model = KeyedVectors.load_word2vec_format(MS_MODEL_PATH, binary=False, limit=MS_MODEL_LIMIT)
    logger.info(f"模型加载成功,词向量维度: {self.model.vector_size}")
    # Initialise the Elasticsearch connection.
    # NOTE(review): verify_certs=False turns off TLS certificate
    # verification — confirm this is intended outside development.
    self.es = Elasticsearch(
        hosts=es_config['hosts'],
        basic_auth=es_config['basic_auth'],
        verify_certs=False
    )
    self.index_name = es_config['index_name']
def text_to_embedding(self, text):
# 使用已加载的模型
# 对文本分词并计算平均向量
words = jieba.lcut(text)
vectors = [model[word] for word in words if word in model]
vectors = [self.model[word] for word in words if word in self.model]
if not vectors:
return [0.0] * model.vector_size
return [0.0] * self.model.vector_size
# 计算平均向量
avg_vector = [sum(dim)/len(vectors) for dim in zip(*vectors)]

Loading…
Cancel
Save