def text_to_embedding(self, text):
    """Return the mean word2vec embedding of ``text`` as a list of floats.

    The text is segmented with ``jieba.lcut``; every token present in the
    pretrained model's vocabulary contributes its vector, and the result is
    the element-wise arithmetic mean of those vectors.

    Args:
        text: The string to embed.

    Returns:
        A list of built-in ``float`` values of length ``model.vector_size``.
        When none of the tokens is in the model vocabulary (including empty
        input), a zero vector of that length is returned.
    """
    # Parsing the word2vec text file is expensive, so load it once and keep
    # it on the instance instead of reloading the model on every call.
    model = getattr(self, "_embedding_model", None)
    if model is None:
        model = KeyedVectors.load_word2vec_format(
            MS_MODEL_PATH, binary=False, limit=MS_MODEL_LIMIT
        )
        self._embedding_model = model

    # Segment the text and keep only the vectors of in-vocabulary tokens.
    vectors = [model[word] for word in jieba.lcut(text) if word in model]
    if not vectors:
        return [0.0] * model.vector_size

    # Element-wise mean; float() converts NumPy scalars to plain Python
    # floats so both return paths yield the same element type.
    count = len(vectors)
    return [float(sum(dim) / count) for dim in zip(*vectors)]