From cdb342172e977b46228abf3512b736f8fefd6b50 Mon Sep 17 00:00:00 2001 From: HuangHai <10402852@qq.com> Date: Mon, 30 Jun 2025 07:51:20 +0800 Subject: [PATCH] 'commit' --- dsRag/Start.py | 2 +- dsRag/Util/EsSearchUtil.py | 124 +++++++++++++++++- dsRag/Util/SearchUtil.py | 124 ------------------ .../__pycache__/EsSearchUtil.cpython-310.pyc | Bin 3967 -> 6631 bytes .../__pycache__/SearchUtil.cpython-310.pyc | Bin 3066 -> 0 bytes 5 files changed, 123 insertions(+), 127 deletions(-) delete mode 100644 dsRag/Util/SearchUtil.py delete mode 100644 dsRag/Util/__pycache__/SearchUtil.cpython-310.pyc diff --git a/dsRag/Start.py b/dsRag/Start.py index 6394f36c..13b8d172 100644 --- a/dsRag/Start.py +++ b/dsRag/Start.py @@ -18,7 +18,7 @@ from starlette.responses import StreamingResponse from starlette.staticfiles import StaticFiles from Config import Config -from Util.SearchUtil import * +from Util.EsSearchUtil import * # 初始化日志 logger = logging.getLogger(__name__) diff --git a/dsRag/Util/EsSearchUtil.py b/dsRag/Util/EsSearchUtil.py index 8968bfb4..21fe14e5 100644 --- a/dsRag/Util/EsSearchUtil.py +++ b/dsRag/Util/EsSearchUtil.py @@ -4,7 +4,7 @@ from logging.handlers import RotatingFileHandler import jieba from gensim.models import KeyedVectors -from Config.Config import MODEL_LIMIT, MODEL_PATH +from Config.Config import MODEL_LIMIT, MODEL_PATH, ES_CONFIG from ElasticSearch.Utils.ElasticsearchConnectionPool import ElasticsearchConnectionPool # 初始化日志 @@ -125,4 +125,124 @@ class EsSearchUtil: elif search_type == 'text': return self.text_search(query, size) else: - return self.hybrid_search(query, size) \ No newline at end of file + return self.hybrid_search(query, size) + + def queryByEs(query, query_tags, logger): + # 获取EsSearchUtil实例 + es_search_util = EsSearchUtil(ES_CONFIG) + + # 执行混合搜索 + es_conn = es_search_util.es_pool.get_connection() + try: + # 向量搜索 + logger.info(f"\n=== 开始执行查询 ===") + logger.info(f"原始查询文本: {query}") + logger.info(f"查询标签: {query_tags}") + + logger.info("\n=== 向量搜索阶段 ===") + logger.info("1. 文本分词和向量化处理中...") + query_embedding = es_search_util.text_to_embedding(query) + logger.info(f"2. 生成的查询向量维度: {len(query_embedding)}") + logger.info(f"3. 前3维向量值: {query_embedding[:3]}") + + logger.info("4. 正在执行Elasticsearch向量搜索...") + vector_results = es_conn.search( + index=ES_CONFIG['index_name'], + body={ + "query": { + "script_score": { + "query": { + "bool": { + "should": [ + { + "terms": { + "tags.tags": query_tags + } + } + ], + "minimum_should_match": 1 + } + }, + "script": { + "source": "double score = cosineSimilarity(params.query_vector, 'embedding'); return score >= 0 ? score : 0", + "params": {"query_vector": query_embedding} + } + } + }, + "size": 3 + } + ) + logger.info(f"5. 向量搜索结果数量: {len(vector_results['hits']['hits'])}") + + # 文本精确搜索 + logger.info("\n=== 文本精确搜索阶段 ===") + logger.info("1. 正在执行Elasticsearch文本精确搜索...") + text_results = es_conn.search( + index=ES_CONFIG['index_name'], + body={ + "query": { + "bool": { + "must": [ + { + "match": { + "user_input": query + } + }, + { + "terms": { + "tags.tags": query_tags + } + } + ] + } + }, + "size": 3 + } + ) + logger.info(f"2. 文本搜索结果数量: {len(text_results['hits']['hits'])}") + + # 合并结果 + logger.info("\n=== 最终搜索结果 ===") + logger.info(f"向量搜索结果: {len(vector_results['hits']['hits'])}条") + for i, hit in enumerate(vector_results['hits']['hits'], 1): + logger.info(f" {i}. 文档ID: {hit['_id']}, 相似度分数: {hit['_score']:.2f}") + logger.info(f" 内容: {hit['_source']['user_input']}") + + logger.info("文本精确搜索结果:") + for i, hit in enumerate(text_results['hits']['hits']): + logger.info(f" {i + 1}. 文档ID: {hit['_id']}, 匹配分数: {hit['_score']:.2f}") + logger.info(f" 内容: {hit['_source']['user_input']}") + + # 去重处理:去除vector_results和text_results中重复的user_input + vector_sources = [hit['_source'] for hit in vector_results['hits']['hits']] + text_sources = [hit['_source'] for hit in text_results['hits']['hits']] + + # 构建去重后的结果 + unique_text_sources = [] + text_user_inputs = set() + + # 先处理text_results,保留所有 + for source in text_sources: + text_user_inputs.add(source['user_input']) + unique_text_sources.append(source) + + # 处理vector_results,只保留不在text_results中的 + unique_vector_sources = [] + for source in vector_sources: + if source['user_input'] not in text_user_inputs: + unique_vector_sources.append(source) + + # 计算优化掉的记录数量和节约的tokens + removed_count = len(vector_sources) - len(unique_vector_sources) + saved_tokens = sum(len(source['user_input']) for source in vector_sources + if source['user_input'] in text_user_inputs) + + logger.info(f"优化掉 {removed_count} 条重复记录,节约约 {saved_tokens} tokens") + + search_results = { + "vector_results": unique_vector_sources, + "text_results": unique_text_sources + } + return search_results + finally: + es_search_util.es_pool.release_connection(es_conn) \ No newline at end of file diff --git a/dsRag/Util/SearchUtil.py b/dsRag/Util/SearchUtil.py deleted file mode 100644 index 1c47d2b2..00000000 --- a/dsRag/Util/SearchUtil.py +++ /dev/null @@ -1,124 +0,0 @@ -from Config.Config import ES_CONFIG -from Util.EsSearchUtil import EsSearchUtil - - -def queryByEs(query, query_tags,logger): - # 获取EsSearchUtil实例 - es_search_util = EsSearchUtil(ES_CONFIG) - - # 执行混合搜索 - es_conn = es_search_util.es_pool.get_connection() - try: - # 向量搜索 - logger.info(f"\n=== 开始执行查询 ===") - logger.info(f"原始查询文本: {query}") - logger.info(f"查询标签: {query_tags}") - - logger.info("\n=== 向量搜索阶段 ===") - logger.info("1. 文本分词和向量化处理中...") - query_embedding = es_search_util.text_to_embedding(query) - logger.info(f"2. 生成的查询向量维度: {len(query_embedding)}") - logger.info(f"3. 前3维向量值: {query_embedding[:3]}") - - logger.info("4. 正在执行Elasticsearch向量搜索...") - vector_results = es_conn.search( - index=ES_CONFIG['index_name'], - body={ - "query": { - "script_score": { - "query": { - "bool": { - "should": [ - { - "terms": { - "tags.tags": query_tags - } - } - ], - "minimum_should_match": 1 - } - }, - "script": { - "source": "double score = cosineSimilarity(params.query_vector, 'embedding'); return score >= 0 ? score : 0", - "params": {"query_vector": query_embedding} - } - } - }, - "size": 3 - } - ) - logger.info(f"5. 向量搜索结果数量: {len(vector_results['hits']['hits'])}") - - # 文本精确搜索 - logger.info("\n=== 文本精确搜索阶段 ===") - logger.info("1. 正在执行Elasticsearch文本精确搜索...") - text_results = es_conn.search( - index=ES_CONFIG['index_name'], - body={ - "query": { - "bool": { - "must": [ - { - "match": { - "user_input": query - } - }, - { - "terms": { - "tags.tags": query_tags - } - } - ] - } - }, - "size": 3 - } - ) - logger.info(f"2. 文本搜索结果数量: {len(text_results['hits']['hits'])}") - - # 合并结果 - logger.info("\n=== 最终搜索结果 ===") - logger.info(f"向量搜索结果: {len(vector_results['hits']['hits'])}条") - for i, hit in enumerate(vector_results['hits']['hits'], 1): - logger.info(f" {i}. 文档ID: {hit['_id']}, 相似度分数: {hit['_score']:.2f}") - logger.info(f" 内容: {hit['_source']['user_input']}") - - logger.info("文本精确搜索结果:") - for i, hit in enumerate(text_results['hits']['hits']): - logger.info(f" {i + 1}. 文档ID: {hit['_id']}, 匹配分数: {hit['_score']:.2f}") - logger.info(f" 内容: {hit['_source']['user_input']}") - - # 去重处理:去除vector_results和text_results中重复的user_input - vector_sources = [hit['_source'] for hit in vector_results['hits']['hits']] - text_sources = [hit['_source'] for hit in text_results['hits']['hits']] - - # 构建去重后的结果 - unique_text_sources = [] - text_user_inputs = set() - - # 先处理text_results,保留所有 - for source in text_sources: - text_user_inputs.add(source['user_input']) - unique_text_sources.append(source) - - # 处理vector_results,只保留不在text_results中的 - unique_vector_sources = [] - for source in vector_sources: - if source['user_input'] not in text_user_inputs: - unique_vector_sources.append(source) - - # 计算优化掉的记录数量和节约的tokens - removed_count = len(vector_sources) - len(unique_vector_sources) - saved_tokens = sum(len(source['user_input']) for source in vector_sources - if source['user_input'] in text_user_inputs) - - logger.info(f"优化掉 {removed_count} 条重复记录,节约约 {saved_tokens} tokens") - - search_results = { - "vector_results": unique_vector_sources, - "text_results": unique_text_sources - } - return search_results - finally: - es_search_util.es_pool.release_connection(es_conn) - diff --git a/dsRag/Util/__pycache__/EsSearchUtil.cpython-310.pyc b/dsRag/Util/__pycache__/EsSearchUtil.cpython-310.pyc index 8d10dd8681357ae8e44821f8f98470b8fb12cf1c..5b37225ac2c0070aeda27e896b2c3ed68961e042 100644 GIT binary patch literal 6631 zcmb7JS#TW3d7hq`y#~7r5C9KRlsvW$a4Bjl!7naVB<1_N z_XMQGS#Wx~d;adf|GWR6)l|yT@XP;e{=};fYubO%p!*lW;2AvL4K!Titfb}Co8=hB zdPy%EIfJP=GiPGXC`HQATvX1PrC2$hilPVi+4vxj@)(cL>KGY3!7bj$le0!{$Q@c3#;Wx^wa6rF|H1}$hfUd9Of zzI9~Z{zJ~8gWo#%QfR5c3*UI@`7p8nsI<R6Ss!>W7tm-3!pEO>5S6i)1|Rw~$_ zSgpKJt(HRbP<70E2~ctOOwse5>dAVVrXKVByzsN7>R3_J2DBC8Yeo;_<@}jvCw&pfm>?_txjoAK;N?T5CO+|zEUl5da8uIJ^)@U=iBcK-|vp26cC zLgQ*Aq@2zf*UxD=gB#q$+vJkL=U6U+UYsY;i=qcMqZi{tJjK)K#d$v;0DOY413QPq zLNau~g83p!FlHF+6+mcpsn%SfDwY&jQBTJs5GF+6A3Xu&DD0kItHaFi6ZL zvmBX%r~~~=pcAtOV55OHj>n0$-hO;M;alAF`?w_`l74C-&HL~@Io^L+yP}g~^zO<{ z-r9}vB+jHc13U%l(zRdX*p|KO*OjluOKa#s7sp2Jp0z9IXO};^(YXG%jq^7@zI~~2>3n1E?@v@c-`j52JXesEx7!nW z&--3g@a=Z7!re0t%`RWRxCU7mkxN7CSl%lZoP5nc5hhQ&qIi7LDY(M-4y^11T^UoP z@e=*i44?`7p?P$`rt*4%5f}?oJ%`O0tJXnhqsNMsyqF9lrDD10*Wfj@rC%;K-uhwV z^@Y11eY7+)*LeLZD7*Ws*~Z+DS1$Z$`POe5zk6@g4y_I?hcoNKM4Lr|bSE^%TtAFq zasrl=OS#=pwJ5Rd>=Xh0=^S&VK5+<#Pl&jn=g;5yZ zm@7iFSUFw|`v3})cf4ZV6&nBs+c!P8bX;ttg@o#wCccW-C)s_YFLUp^RdEup7esX& zE75x~KlU;i!OPtik)4u6Eb3mXQLT|y}Xi~0y1o; zoz{d-?*JD5WBrtVRPijs#1_CpQ}pbP&?MPlik3^3im#y;TDwYckA-S^V$ZV}dxXl? z>N|U=_uVtT<5wNGe9Yw>?mN4yR4wF7-X3&X%g3-%{?;6wv_Kx6jw6q5O)($Gon}4= zm}d9(99s_!uT~C?l3U5>;z6tl4PGq6CSpooyw3RrX+5!C9()Tp7e>a5?y-E%EEQ_L zWPtdJ+@fBbfc4qbH;JGyLLBjOR{r#uqc)cN0w)29ZPZ8}ty@F?9(?nXXe?upnR=RS zFicjz|9@xN3Ptz=DNE}|jG<>Q9*_16v<0vQ3S^M-6tyBFKOGo&N9J`No!5(O3jTVW zjq7OPwc$CHBoTGsM;Ab5eb10@{H2ruNh%PdL!{ypdZ!`R{$|RhJm=3W}+&*OOKF=DA5q8 z*hOFa70&i4b1nMiywVf$A4-s^mcNm~^P8CWHlu+@(4+D(Kf~YWQ7aA~0sru6eRGdU zdzSY|lBC*}q>TZj11X_J%)bZMGj0T|h(E~6l(LE(mh{6hl88c!LJnNk1eA6Z>*4T; ze6gDv@`jkOso$o3HF14XOBJ_X0i;Lj7G-hi@IcC@9#Xo1dk7&S$kj-dB<)u!8R&~T ze3SHVvJa&UOwQ)vY%+T74*+b3#HEYx-(9@4^x=n%xfvCpKfXPqB9IzBvh?PKrTKqo zed`3%X2h1V8@1KNi7 zj@l}^-?{nD^82%DUj@JXcjwd%puGfYnr_NjxL&Q~dtp*Ct2u&u_&TT+Asfu;m>X_NPR(fxxZnk{pD}9xj-_qtmv3FYbMtJI^KZP{_|e<#PycV>NF#In z$qC4)$@5>|!u15&Frqj-0tZ999kG{Y-n2&)ks}}9UZW9zzM#8>+dg_Os_=9MRq;xis{72d;W zeO${`#2?oej74qAoMZLx2l@i+N?n-IgPZSMpfQFU=^2}C$cXG23z0xW0Ex(bqJA`p zO!uyc1rg+i@$*OodUi_4_2ziYXWW9zLZ(OQhjLqEj8YH^KYSe@3?g_B&Fg$v&cyk8 z^f%1vQ&EX^Bd|vKChUP{7501D=iR&tUy|R;xA6M{1U>BD!3>-jcW}fcj{AWl_9ZyJ z(#B!0J_-No6@$L|@y-b;Q{AV%^1#dbRD3EScmG-tzpU{Gf&_n%xWpeq?_tn0a@o8R zZSTGnvl-0Z*Wu);KgjjlnmD>n2n-K(VR$4!Qn!)z?qJ#eXJh#jf0Sp>X&oBm-O=KY z$=NnmIomg;O(p%rw02Po`f!T=snm?NUjNCNp2!2sGPuP8lD z2TA_aygs!qSod8mFkk_xz@#(4XLXIfr)B=)aE&6NwR`t&yK(z0vcP75k%7U+bdB!# zjW@2+r>&YECU&Sib&yOm+5>Ig#=gD?k6|r5pbw_Zq>H$FsHyIE|UtD0}+J zB^ADqem34Yw>qOccNM=@!W73iBP##cGTFB_YFbGsCa@J3LWAv<(kgr0#WX1Oc4$~S-TJ2r$9m% z$GXEUGD`DA(f4W;?xdU)_m=zuLS66^;NiWTxUB1=4il}pUv}O%SSV(eNy0Ta|P=lz^EnlHoeEZ-&kZL$ZUK_&f zc6<5q%{#YmBd3O*0PU!qi>ixv-N^1dF0~I^;MaKVeB;J%u{h?aOL6TU!Xd9g-CRw^035~hM=}AKT16=<4e3DV@S}U7^uvUjP z%A@{`HlhS@SgUXA31{*~_Uz<-?~4+@FFllafiV9yHS!|#V!J1Ybi0)KTS%Rk(0t)e zq@>BC{l-QQk^W$EnoHuE_f?GH8@=j6COIG(R!nR=6O)mKzjq$<2Fi*G`#1m3dhz zlOhiy$lh~l)OVVfp&BYIDjlIDwR6c0Sy7PofLUerY9E>RXr+Zn=8e(rSgZ++ zJh2HKZN5K|D+kPv0lC34?q{3X!+0!x6WfOO24e#=QF)3RX_iJWjuyL}9l-o1LtXw& zbBkWz@Wt9idb|TvK#!lpOZ6=(a8WJNaYD;+P+Y5(sGoG4Q?SzQ#b5^_@FFb+F7L^b=+Jb3uP5vuc_ba_#reAzrIDkiRfK$a^JNy9{^ vYr%Y)|3{&$vBM_Wlj;`VrFg%G0i8z$E@GI<3Wjxf4kO9ZC=fyO*r5JD;i-N$ delta 1034 zcmZ8fzi-n}5cae4(}|tbr6DQ(VHag^OG8T$q99QUm5`u92s$7$L}}h5brmP!*#X+6 z16=@hC~snG$F>uSSef}7EIK2mZme+UphB_a^WAs%eV^~?eV_X^rnfasmEbvkR$W^j z*wshK;qKJp7Llk#<4x7eI2j@_X)nvPed6d$-^(#+J-<%?CF=t;L6bDKBSWRoG*xMa zW_J|FV8-4cu+C9nMYH3w;iCJ<;mc%zQGMz`SZQ{gfo~CFTvIJz{2w9R2P>t%4rDzTmjo!5Ug=934W402;eAax<3&=4b97x>kjmGCl*~C8K7w{v8a%Cor zy=rp`&^SOG+uRb}#D_UdVMOt}UZ^x#%a-{?fI@}39_*NW$?8Fy`+q3ND~lV+F$1Of zSTwomcI+4*7wgGU!vq8*A{QG+Eb%enu!kYP{z8fW%*P8Q*|L zox7cY<7Nj+7)6djidE*VGD_W6 z!^!rw#iMjGW8zp8EU}sv_t7tmd$pjx7OGvJ@v7VEbOU}3a9fG&_&jp(lmUn~JN4=- znb@~_3+e+xZ1FxbF>@1e)5-J~ zuLc_(7A^a)2u{t5BW%+0S)2ZnCU*Y@3(G8oL3n+< zn<*(pQj@Bt8fsF{MLqkx_*5*HCZX$S4c78qui~{SYx-OPEEFfjqR5u66cSja8!T9E pHyVudyHN8Hu~J$dLXUrZO)M?(xnvr7IZIR}OLSQ$d7=_S{tXsA+a>@2 diff --git a/dsRag/Util/__pycache__/SearchUtil.cpython-310.pyc b/dsRag/Util/__pycache__/SearchUtil.cpython-310.pyc deleted file mode 100644 index 006f496a706bf6e5625f7b1bf16d1c4062ec5e4c..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 3066 zcma)8-ESMm5#QZAp2!nPQMN2gj_h2%Y%_5fevL&}ts-#U2)KV=asZ#^769j3I#A;sqKiq3gQbHf{ke472c4zk}lCoo< zC*AJO?9BXTc4miXClV@xXKW-l`D`CT|KUXELxA`hyv~y_P(*PNm9UO6Uy~v!NxB4U ziOO?GmuX}irIjs^L}?tvPLF3_dS&bvFaHvV_-SX{$g%w7tFBqhOI)P+?1Gv5z*8NMQstDA*fU3{|umSIV=z}eP_uYpVr;pGfy5C0}uLBt{fo@}T35)}P z(e)6FM_U-${hrXrRwcgcM%y0Jo>_7Voq1w{RHN0Hu>QCoy^81)evCfJPl-MStAh~F zp{w$$(z1RSxM|?-Z=dAUzlHwOO^gl?0pY0*glGIH?b*w1w^5G#-zeYHXK7{*wPQF1 z=Ui3kbAsDK72NIus>a>eEV_dHZt$X~npi+IubV^;mh2I@g3A}QTiui-Xr|#jEszEwT+qD$A73_`FL}Ev3~7u zo1cE50o#KMjcRXQ<6DARe{a6N^2xB~C1InlJPl-zXSBwjuhkcp z8-JM#p+ecl?JsIyUkAWFAao+5)vhd^;Jnb#%*L>`#XEub3=Gc$-pvndD}N6oJ6+5< zu9S@KHRkj|EJZho@z8)|!Ji-smlX(L+Rc1+6{H%n$Q$4qzna5=|v zB_}iG8EiUx!N|KdJE9#jN@op9O{;Jy{i4PU*JD=G#OR22RQqX@7}k#J@lMq=2^7cn zSl$Sf&@yl$LIO5KRB6Jx!Sg_N%!R;ofqIxJ95~#{iph0SL*M623Z}} z0(sJO9q%!a6O|L@v+?;ijrCg(zdI zh}O9J<(-WUs6Z%TIFn&bSHd!>NSWj3#Kiy;@YF6}s@?hul)JKFLcBqaA#PCTafJ~K zR|o*71VnA|tF22*+xGLk%(fr?YbwJ#4)1sTb{3G2&L~yvV~PgR+hmH2qZ^pAVJ31Uf_t~ha;zkC#dtf&EA0NSIj;SH3zF-=L!@2w{@cZGn*+-2kOb9>9~2j zR34q+pxlE#RM{^|>gUs^oy^IiozE4WQCMtCKGbK%FD>ie_Js|!R>D8tL)Z`uXZV8G z;*VN70UOR?7~bU3_iRTG{S-0cqVOzc@oNa9DMB%PQxL|#62{;=LMj>-kUSSG2Ez@k zr(ENrn{C&b6CM_KCWxIZ7?yFd%tp&VI=nK-jDq4=y{BOLUS;UV&S6`{xbG5Z-JWHFjjV{M$rM@C z_cV8!ht}K+63i&s7YquXdzK4*J;#agjDuv;$>y1cS$~wf6sEqF+6eHR>o7!#D#=(S zDV%~)#VYQ@1F({@f{8>3fn9P>{B_CS!<>L3xhsDMt3LU^%H5%Ts-)tkG$x=YWR>Rp zgP>CKw;?w9(pEGalDC2