From dc312b0852b6d5cb30464a90b4e176528d34d124 Mon Sep 17 00:00:00 2001 From: HuangHai <10402852@qq.com> Date: Fri, 27 Jun 2025 14:45:10 +0800 Subject: [PATCH] 'commit' --- dsRag/Util/EsSearchUtil.py | 22 ++++++++++++++---- .../__pycache__/EsSearchUtil.cpython-310.pyc | Bin 2627 -> 3156 bytes 2 files changed, 17 insertions(+), 5 deletions(-) diff --git a/dsRag/Util/EsSearchUtil.py b/dsRag/Util/EsSearchUtil.py index d2aeece3..6ded7acc 100644 --- a/dsRag/Util/EsSearchUtil.py +++ b/dsRag/Util/EsSearchUtil.py @@ -1,6 +1,9 @@ -from elasticsearch import Elasticsearch import jieba -import numpy as np +from elasticsearch import Elasticsearch +from gensim.models import KeyedVectors + +from Config.Config import MS_MODEL_PATH, MS_MODEL_LIMIT + class EsSearchUtil: def __init__(self, es_config): @@ -16,9 +19,18 @@ class EsSearchUtil: ) def text_to_embedding(self, text): - # 当前实现为随机向量生成,后续可替换为实际模型 - vector = np.random.rand(200).tolist() - return vector + # 加载预训练模型 + model = KeyedVectors.load_word2vec_format(MS_MODEL_PATH, binary=False, limit=MS_MODEL_LIMIT) + # 对文本分词并计算平均向量 + words = jieba.lcut(text) + vectors = [model[word] for word in words if word in model] + + if not vectors: + return [0.0] * model.vector_size + + # 计算平均向量 + avg_vector = [sum(dim)/len(vectors) for dim in zip(*vectors)] + return avg_vector def vector_search(self, query, size=10): query_embedding = self.text_to_embedding(query) diff --git a/dsRag/Util/__pycache__/EsSearchUtil.cpython-310.pyc b/dsRag/Util/__pycache__/EsSearchUtil.cpython-310.pyc index b0a8cb27fee52a7b6c3b705436dac01b565a8512..f57645b76a53df43ffe44389e2a7afbeccdc4c35 100644 GIT binary patch delta 1327 zcmZWp&u`R56rML@d!60&LbppOZA57}RHigQ_;FeyK}ZXNK%=D`R-)U@jGtx-i-ffynL^g zcwU*|_viJmmVc@2)+RdvDLQB*oy4fX*+SLT>|zwleTo8 zG3jjcwkx5<{L()Jd*27`%|+cb3wvqMRlQ|&RPP2GfqWG7vP7jm*WP@qGymY$+`Y~h zHy+k+>)OHm-kte74|}`socis@2jG+tP`UZvaLpyozcS_Jll(f2nD2%Kf@M(&Xzz5X+3j?{1)4W7qA2AX=(Rihpjjb)?&{ABOHpz zN;~iaqo^%06m+QjpCMmoF&uQ&hPIA(2{e5+WRJ^RFhu-y zruYsU!YQ`PpYRs)#{gpxl}}yN);EcgsE{2eGKh-h8ph~yGYZqJm&B_x@8kSm+Gc`n zPS2$+qw1Ggz91T-sb-Y)x=}iVgIDo=3^m_A(PBaXq}W?LGvv1o8M57m503IgiXvT$ zwTOa&&s80hwUA+qbW7_+Ek}op)d^fbgH4U2@m)ow(TynS$_^cJ8dEw;iHf_KBE1xm z(e_FhEOy&g)LYARr9kbZ;bx%4X1MBGs!CB-7Cg1M6T~GY9gp)euY<*_FfR6H#cjCsF?zK{TOj&Jlu{v&Q(RU>_sqA=wu(a4`TTe5 z?CFELDN`Nq6;o4-v-yA4ci`vW+9zkvQNA#k>Ql-$fZ`}1dEdaR&;dIIg{ymi*^goN z0)0!Fz93( ztARTI_6Fueck?S_pLmm$%hfdsn#`lIbE9K4G=jdqLm|ToUryGik~Es%3I87zhUsiHf)W(MtuSO*le X+eCo1D43x)ol>n_R)s1Y=XLQv%6b|k delta 845 zcmZuv&ubGw6yBNL*=#o1*oI;wXqA+L+e)YxFIGJi(SyW;w}^&hXKA+VkMMS4jkHuy z@LcvENDm?&C4WLMg4dqrFR=fG^JYcc>cYHv^UXKk_ulNBwoaBBndiBN`aa!%J^E2S zYFvNTQ7dk65*rEW5rR35X^-mJ>{(9?ZgT6R!L1Lp$2fgq`0NZNd?LNa@gNsbkcR^Z zM&00O@!Gf3=ft+uHn75{_neAKzZ&2cJM;mui_diH&{WX28u^L}!kCeLa!d}Wpu6VG z;wqo9W8=WwHg`Jp=aVGi?e;g?919UR^1zaIKxZvVwkjXu0@lzkhiXWmqy0!AQE3JcNFf zjwb?cDcm=;3#_7owlL3IAbQE6rh6&%{rO4d71>@x8g_fJK16*C>liM{c%$hMBL2J3 z&3RBwNf)jnb4dkN*B@2bK%l&D-MXxm_I)uOhtU34UAOb7XxBEp7M`VW2T84>S6e@< zW2*1>qC