From 79c6cc992ca6aafff145ab2c15f6ad65343a9ffa Mon Sep 17 00:00:00 2001 From: HuangHai <10402852@qq.com> Date: Tue, 19 Aug 2025 09:45:41 +0800 Subject: [PATCH] 'commit' --- .../ElasticSearch/T6_SelectByVector.py | 173 ++---------------- .../ElasticSearch/Utils/EsSearchUtil.py | 164 ++++++++++++++++- .../__pycache__/EsSearchUtil.cpython-310.pyc | Bin 6528 -> 10747 bytes 3 files changed, 176 insertions(+), 161 deletions(-) diff --git a/dsSchoolBuddy/ElasticSearch/T6_SelectByVector.py b/dsSchoolBuddy/ElasticSearch/T6_SelectByVector.py index 9c4ca8ed..9a047397 100644 --- a/dsSchoolBuddy/ElasticSearch/T6_SelectByVector.py +++ b/dsSchoolBuddy/ElasticSearch/T6_SelectByVector.py @@ -1,175 +1,28 @@ -import json -import requests -from langchain_openai import OpenAIEmbeddings -from pydantic import SecretStr - from Config import Config from ElasticSearch.Utils.EsSearchUtil import EsSearchUtil -def get_query_embedding(query: str) -> list: - """ - 将查询文本转换为向量 - - 参数: - query: 查询文本 - - 返回: - list: 向量表示 - """ - # 创建嵌入模型 - embeddings = OpenAIEmbeddings( - model=Config.EMBED_MODEL_NAME, - base_url=Config.EMBED_BASE_URL, - api_key=SecretStr(Config.EMBED_API_KEY) - ) - - # 生成查询向量 - query_embedding = embeddings.embed_query(query) - return query_embedding - - -def search_by_vector(search_util: EsSearchUtil, query_embedding: list, k: int = 10) -> list: - """ - 在Elasticsearch中按向量搜索 - - 参数: - search_util: EsSearchUtil实例 - query_embedding: 查询向量 - k: 返回结果数量 - - 返回: - list: 搜索结果 - """ - # 从连接池获取连接 - conn = search_util.es_pool.get_connection() - - try: - # 构建向量查询DSL - query = { - "query": { - "script_score": { - "query": {"match_all": {}}, - "script": { - "source": "cosineSimilarity(params.query_vector, 'embedding') + 1.0", - "params": { - "query_vector": query_embedding - } - } - } - }, - "size": k - } - - # 执行查询 - response = conn.search(index=search_util.es_config['index_name'], body=query) - return response['hits']['hits'] - except Exception as e: - print(f"向量查询失败: {e}") - return [] - finally: - # 释放连接回连接池 - search_util.es_pool.release_connection(conn) - - -def rerank_results(query: str, results: list) -> list: - """ - 使用重排模型对结果进行排序 - - 参数: - query: 查询文本 - results: 初始搜索结果 - - 返回: - list: 重排后的结果 - """ - if len(results) <= 1: - # 结果太少,无需重排 - return [(result, 1.0) for result in results] - - # 准备重排请求数据 - rerank_data = { - "model": Config.RERANK_MODEL, - "query": query, - "documents": [result['_source']['user_input'] for result in results], - "top_n": len(results) - } - - # 调用重排API - headers = { - "Content-Type": "application/json", - "Authorization": f"Bearer {Config.RERANK_BINDING_API_KEY}" - } - - try: - response = requests.post(Config.RERANK_BASE_URL, headers=headers, data=json.dumps(rerank_data)) - response.raise_for_status() - rerank_result = response.json() - - # 处理重排结果 - reranked_results = [] - if "results" in rerank_result: - for item in rerank_result["results"]: - doc_idx = item.get("index") - score = item.get("relevance_score", 0.0) - if 0 <= doc_idx < len(results): - reranked_results.append((results[doc_idx], score)) - else: - print("警告: 无法识别重排API响应格式") - reranked_results = [(result, 0.0) for result in results] - - return reranked_results - except Exception as e: - print(f"重排模型调用失败: {e}") - return [(result, 0.0) for result in results] - - -def display_results(results: list) -> None: - """ - 展示查询结果 - - 参数: - results: 查询结果列表,每个元素是(结果对象, 分数)的元组 - """ - if not results: - print("未找到相关数据。") - return - - print(f"找到 {len(results)} 条相关数据:") - for i, (result, score) in enumerate(results, 1): - source = result['_source'] - print(f"{i}. ID: {result['_id']}") - print(f" 相似度分数: {score:.4f}") - print(f" 内容: {source.get('user_input', '')}") - print(f" 标签: {source['tags']['tags'] if 'tags' in source and 'tags' in source['tags'] else '无'}") - print(f" 时间: {source['timestamp'] if 'timestamp' in source else '无'}") - print("-" * 50) - def main(): - # 创建EsSearchUtil实例(已封装连接池) + # 初始化搜索工具 search_util = EsSearchUtil(Config.ES_CONFIG) - # 获取用户输入 - query_text = input("请输入查询关键词(例如: 高性能的混凝土): ") - if not query_text: - query_text = "高性能的混凝土" - print(f"未输入查询关键词,使用默认值: {query_text}") + # 输入查询文本 + query = "混凝土" + print(f"查询文本: {query}") - # 生成查询向量 - print("正在生成查询向量...") - query_embedding = get_query_embedding(query_text) + # 获取查询向量 + query_embedding = search_util.get_query_embedding(query) + print(f"查询向量维度: {len(query_embedding)}") - # 执行向量搜索 - print("正在执行向量搜索...") - search_results = search_by_vector(search_util, query_embedding, k=10) + # 向量搜索 + search_results = search_util.search_by_vector(query_embedding, k=10) print(f"向量搜索结果数量: {len(search_results)}") - # 重排结果 - print("正在重排结果...") - reranked_results = rerank_results(query_text, search_results) + # 结果重排 + reranked_results = search_util.rerank_results(query, search_results) - # 展示结果 - display_results(reranked_results) + # 显示结果 + search_util.display_results(reranked_results) if __name__ == "__main__": diff --git a/dsSchoolBuddy/ElasticSearch/Utils/EsSearchUtil.py b/dsSchoolBuddy/ElasticSearch/Utils/EsSearchUtil.py index 34e43d60..c8a7473c 100644 --- a/dsSchoolBuddy/ElasticSearch/Utils/EsSearchUtil.py +++ b/dsSchoolBuddy/ElasticSearch/Utils/EsSearchUtil.py @@ -1,7 +1,11 @@ +import json import logging import warnings import hashlib import time + +import requests + from Config.Config import ES_CONFIG from ElasticSearch.Utils.ElasticsearchConnectionPool import ElasticsearchConnectionPool from langchain_core.documents import Document @@ -258,4 +262,162 @@ class EsSearchUtil: finally: # 确保释放连接回连接池 if 'conn' in locals() and 'search_util' in locals(): - search_util.es_pool.release_connection(conn) \ No newline at end of file + search_util.es_pool.release_connection(conn) + + def get_query_embedding(self, query: str) -> list: + """ + 将查询文本转换为向量 + + 参数: + query: 查询文本 + + 返回: + list: 向量表示 + """ + # 创建嵌入模型 + embeddings = OpenAIEmbeddings( + model=Config.EMBED_MODEL_NAME, + base_url=Config.EMBED_BASE_URL, + api_key=SecretStr(Config.EMBED_API_KEY) + ) + + # 生成查询向量 + query_embedding = embeddings.embed_query(query) + return query_embedding + + def search_by_vector(self, query_embedding: list, k: int = 10) -> list: + """ + 在Elasticsearch中按向量搜索 + + 参数: + query_embedding: 查询向量 + k: 返回结果数量 + + 返回: + list: 搜索结果 + """ + # 从连接池获取连接 + conn = self.es_pool.get_connection() + + try: + # 构建向量查询DSL + query = { + "query": { + "script_score": { + "query": {"match_all": {}}, + "script": { + "source": "cosineSimilarity(params.query_vector, 'embedding') + 1.0", + "params": { + "query_vector": query_embedding + } + } + } + }, + "size": k + } + + # 执行查询 + response = conn.search(index=self.es_config['index_name'], body=query) + return response['hits']['hits'] + except Exception as e: + logger.error(f"向量查询失败: {e}") + print(f"向量查询失败: {e}") + return [] + finally: + # 释放连接回连接池 + self.es_pool.release_connection(conn) + + def rerank_results(self, query: str, results: list) -> list: + """ + 使用重排模型对结果进行排序 + + 参数: + query: 查询文本 + results: 初始搜索结果 + + 返回: + list: 重排后的结果 + """ + if len(results) <= 1: + # 结果太少,无需重排 + return [(result, 1.0) for result in results] + + # 准备重排请求数据 + rerank_data = { + "model": Config.RERANK_MODEL, + "query": query, + "documents": [result['_source']['user_input'] for result in results], + "top_n": len(results) + } + + # 调用重排API + headers = { + "Content-Type": "application/json", + "Authorization": f"Bearer {Config.RERANK_BINDING_API_KEY}" + } + + try: + response = requests.post(Config.RERANK_BASE_URL, headers=headers, data=json.dumps(rerank_data)) + response.raise_for_status() + rerank_result = response.json() + + # 检查响应结构 + if 'results' not in rerank_result: + logger.error(f"重排API响应结构不正确,缺少'results'字段: {rerank_result}") + print(f"重排API响应结构不正确,缺少'results'字段") + return [(result, 1.0) for result in results] + + # 构建重排后的结果列表 + reranked_pairs = [] + for item in rerank_result['results']: + # 尝试获取文档索引,优先使用'index'字段,其次是'document'字段 + doc_idx = item.get('index', item.get('document', -1)) + if doc_idx == -1: + logger.error(f"重排结果项缺少有效索引字段: {item}") + print(f"重排结果项结构不正确") + continue + + # 尝试获取分数,优先使用'relevance_score'字段,其次是'score'字段 + score = item.get('relevance_score', item.get('score', 1.0)) + + # 检查索引是否有效 + if 0 <= doc_idx < len(results): + reranked_pairs.append((results[doc_idx], score)) + else: + logger.error(f"文档索引{doc_idx}超出范围") + print(f"文档索引超出范围") + + # 如果没有有效的重排结果,返回原始结果 + if not reranked_pairs: + logger.warning("没有有效的重排结果,返回原始结果") + return [(result, 1.0) for result in results] + + # 按分数降序排序 + reranked_pairs.sort(key=lambda x: x[1], reverse=True) + return reranked_pairs + except Exception as e: + logger.error(f"重排失败: {str(e)}") + print(f"重排失败: {e}") + # 重排失败时返回原始结果 + return [(result, 1.0) for result in results] + + def display_results(self, results: list, show_score: bool = True) -> None: + """ + 展示搜索结果 + + 参数: + results: 搜索结果列表 + show_score: 是否显示分数 + """ + if not results: + print("没有找到匹配的结果。") + return + + print(f"找到 {len(results)} 条结果:\n") + for i, (result, score) in enumerate(results, 1): + print(f"结果 {i}:") + print(f"内容: {result['_source']['user_input']}") + if show_score: + print(f"分数: {score:.4f}") + print("---") + diff --git a/dsSchoolBuddy/ElasticSearch/Utils/__pycache__/EsSearchUtil.cpython-310.pyc b/dsSchoolBuddy/ElasticSearch/Utils/__pycache__/EsSearchUtil.cpython-310.pyc index 8db14693b524dc2650cee536308fc27a7ab2c291..6dd2f28bda0ca8b1762c741fc2c5e8870e5ba12a 100644 GIT binary patch delta 4596 zcma)9Yj7LY72bPywR+3)Gj?KvC^SZd1lj^aO(rD}LqmdTDTvgHC*s}F-?yeH7 zuA&s1Jcbxzln|OY!Qz=v9-#>>aRZt1qr<=c=}c$TKb>hL+rQcw`YXJ8&RxlgOw+PE zd+yzP&%O8DyWe-t*}Xsbyf2XU`#lm|_x$zM!Tau;40MvWCs(e>lZ2uvOkyPA&>T?G zgqn0}P9i~FPPme8%`NIm!jtrBUQu@>d`Z9N7j-oeNH%Bt)+jKItq)3SLu*BTVGcKvjGe))c$v^$3_aYdlOB!L4Mj2srT+(DF zGX+Y8tzas1juSAkOAI8M%5G(T-N^#Fi#5O#)ZIH=tdTX1J2Vdq7cP+{{q4|??F#E) zOUD(>5A!V3-K?{!ud*B1a?lSj)VvXTP+Mgob`xkfu-io2&AOKbSu?cX!oCaIQ16q% zE6@FnC^Bg~_je_%klW9B+#eAVJ@={S4dN@glTkA^7>OnlX9`(go*3t@_ySbzg)Q>O z3W0_+*<9Guu#%iEyw$L~*4!1`NM1N+1Rqq$nZoZ|jHV}1eIDkw27Ut&tuu|B`RVcU% z#SRn$D4Zw;L1Z&Kj>%BgE;JR%CQkYj zs94gFG);<9PR?%0h7sNQxD+P{Z;L?bn4(ObqH$tT=A5EA#gd0843e3vL$Va+o}f90 zldr0jB2opv&>rJj&ZQtUTfwYM>N8-USjOxh+f zUryblE}qJ6-A#7aEwNh)`!j#UeN|x)Do>s6Nkk1Z9y7!VUHR))T} z`cBPvtwxygpTcw{)gB%0r2<_g)n1W50-ZdF;to_(2IHoYX@P2W?}8lV*WRBy|9W?* zH|*xEsQY~sJs=80o~8MFpaM7-&_x9CT`1O|5J=vLwY4Y~p*LtBc4Yf##NJN`k-Hi+ zJwAm}MI|68{aX1-`HFr`z3RVPf&mh62Cf82fWRZW_R=f)fSSFb4%&;SEjhVvWD=%P z2W%g%1CIdqUtncZvM9QZVZ!~4GI<={*GprhPq-F~Fy(4nRo5}4a!v;8rV|Y4O>>f3(GkaWS#7WafvNwH%`D3mJ@de)NdS}2kL_l1M1_M zgv=((J4UYALdf#sn>d}Dn7p<2oxl^IBlu3vyT{vCEs^@+n=_x{w+32@>+(mJW{;en z-#<}#?uE+fV)@jw<@YYwl$^VCVy<`y+RHP~Bjf#oOEsPsip|Z-s!-|!~ksz7lt(qXMON|hSVDXXVtIf2p~?{K_}@>+9#bM zv(N+AzkVcGbcXUR@WvlPQBN!Wvph6zg(W`2savG%9d+yg4--udN2>c+y(W>4MH9wa zsKgC3mQIeWy{>r%a$K0_vdse*PC6vp#4s&6BSt#IV|kt5zF-#`^9>83tsl9L2DVEO z_kNCA=q)riOUE)vJ!Kl2!%UAvQrTd1WF!%fMa_6Rb;ppAPGy}BfS1;JC`_}y2h%AN zv{!8z9nnkvbs2Loo#*ka&_M?bd-#J!!_k-?G2EEb5q< zsjCN|HE=zl?~0~kx*e5D4m&vQ!PUVGDTDqK)28C9_Uxiz?sw0W_vdHk4vm#hoX;R! zu2m7ewaz=S^S4Db(6;gT_|+|(nu1SYN6_+fF9XkP8hyJHgyZAaJHmstBdnVWH-uP3 z4jP!NSpwLYF!l<@itiF8CGylFESFqs64BF~M?CJ}&k<)!SZZ%H|O=oSE~Jgkpm zpxOms)iapB*{4wpzIFDZtu>$VP4M9U7yASl@Ir0tz2RoQ6&gfVTJmk~*}SfIW8~4t z*7s~GwN>99*wDLvL+`^8U`S+R&$g0_>);Cw6NogDHq25(O>bRaPvr5~~14N1|~gf85lQniE`0B+j1V`*7fY%Sfu@ zYR>Ty{G^*f+2;*VMWMt;RHA-C-5#QTMIG&sTf;2}K?dOm%12r#*77bwNsHVmw~!|4 zfesHeHpwa^EiLdYrI5CKkzMvbAhM<8F$UASZR*u!E%-OF@Z<9|yja9=nSfRi4k1!N zJujOOJTPW5<)n-qCWT<)7=v`Czw1ewli^Wv4p3Ds$Xb}QswFWOCg^rzUW>67<=vp^ zg_PZQLb2m6y!%C*U&3I?a3FI4ELjM!P38BG!k_vX3#u7(V^t4w@Z<7|?vJlk0Zn?-}egx?QCq4+Ttls^iEaMxkd>j;xA;l{#+C3oGu37e5&B{`ll zOA3a5z75}%?df!a8`z&gkwx($ieDE3?VUZC{|ob65+ijhX za5waaJNRKJO3p-jU;zJG8*%&)iWrKCLVx=wZ=b2~Td3pbK_AdFU=eb0#y`zu) z_ZOyq6FyAa7k4#gBqlY1AVHlK5{p{7H<+C`)XBZc+@wg0xi7F%;!!X67W0!bE$7~5 zK@!jaNKE!CaX157jjQ`7V)D2)10XjEI_ORay00}_N9d5K_(x7N{W)7lsWG8|glY(= zO$gOdM?bY4oS4+b)8acc%&P3)%vLl* zhcH_;BLi9)cfJB_W=q9GC{ZWb8t?SGTZ~2GS7{g0Y}@m-!fiPUzayTPQ@AUBm)me4 z!r;Ym{5DvFH`$M1#ehgG)%