From a377d0271961abefc6bb9ef3d9c62dd632716084 Mon Sep 17 00:00:00 2001 From: HuangHai <10402852@qq.com> Date: Mon, 30 Jun 2025 07:49:41 +0800 Subject: [PATCH 1/8] 'commit' --- dsRag/Start.py | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/dsRag/Start.py b/dsRag/Start.py index 478b45dc..6394f36c 100644 --- a/dsRag/Start.py +++ b/dsRag/Start.py @@ -42,6 +42,11 @@ console_handler.setFormatter(logging.Formatter( logger.addHandler(file_handler) logger.addHandler(console_handler) +# 初始化异步 OpenAI 客户端 +client = AsyncOpenAI( + api_key=Config.MODEL_API_KEY, + base_url="https://dashscope.aliyuncs.com/compatible-mode/v1", +) async def lifespan(app: FastAPI): # 抑制HTTPS相关警告 @@ -135,18 +140,13 @@ async def rag(request: fastapi.Request): 1. 严格保持原文中图片与上下文的顺序关系,确保语义相关性 2. 图片引用使用Markdown格式: ![图片描述](图片路径) 3. 使用Markdown格式返回,包含适当的标题、列表和代码块 - 4. 对于提供Latex公式的内容,尽量保留Latex公式 - 5. 直接返回Markdown内容,不要包含额外解释或说明 - 6. 依托给定的资料,快速准确地回答问题,可以添加一些额外的信息,但请勿重复内容 - 7. 如果未提供相关信息,请不要回答 - 8. 如果发现相关信息与原来的问题契合度低,也不要回答 - 9. 确保内容结构清晰,便于前端展示 + 4. 直接返回Markdown内容,不要包含额外解释或说明 + 5. 依托给定的资料,快速准确地回答问题,可以添加一些额外的信息,但请勿重复内容 + 6. 如果未提供相关信息,请不要回答 + 7. 如果发现相关信息与原来的问题契合度低,也不要回答 + 8. 确保内容结构清晰,便于前端展示 """ - # 初始化异步 OpenAI 客户端 - client = AsyncOpenAI( - api_key=Config.MODEL_API_KEY, - base_url="https://dashscope.aliyuncs.com/compatible-mode/v1", - ) + async def generate_response_stream(): try: # 流式调用大模型 From cdb342172e977b46228abf3512b736f8fefd6b50 Mon Sep 17 00:00:00 2001 From: HuangHai <10402852@qq.com> Date: Mon, 30 Jun 2025 07:51:20 +0800 Subject: [PATCH 2/8] 'commit' --- dsRag/Start.py | 2 +- dsRag/Util/EsSearchUtil.py | 124 +++++++++++++++++- dsRag/Util/SearchUtil.py | 124 ------------------ .../__pycache__/EsSearchUtil.cpython-310.pyc | Bin 3967 -> 6631 bytes .../__pycache__/SearchUtil.cpython-310.pyc | Bin 3066 -> 0 bytes 5 files changed, 123 insertions(+), 127 deletions(-) delete mode 100644 dsRag/Util/SearchUtil.py delete mode 100644 dsRag/Util/__pycache__/SearchUtil.cpython-310.pyc diff --git a/dsRag/Start.py b/dsRag/Start.py index 6394f36c..13b8d172 100644 --- a/dsRag/Start.py +++ b/dsRag/Start.py @@ -18,7 +18,7 @@ from starlette.responses import StreamingResponse from starlette.staticfiles import StaticFiles from Config import Config -from Util.SearchUtil import * +from Util.EsSearchUtil import * # 初始化日志 logger = logging.getLogger(__name__) diff --git a/dsRag/Util/EsSearchUtil.py b/dsRag/Util/EsSearchUtil.py index 8968bfb4..21fe14e5 100644 --- a/dsRag/Util/EsSearchUtil.py +++ b/dsRag/Util/EsSearchUtil.py @@ -4,7 +4,7 @@ from logging.handlers import RotatingFileHandler import jieba from gensim.models import KeyedVectors -from Config.Config import MODEL_LIMIT, MODEL_PATH +from Config.Config import MODEL_LIMIT, MODEL_PATH, ES_CONFIG from ElasticSearch.Utils.ElasticsearchConnectionPool import ElasticsearchConnectionPool # 初始化日志 @@ -125,4 +125,124 @@ class EsSearchUtil: elif search_type == 'text': return self.text_search(query, size) else: - return self.hybrid_search(query, size) \ No newline at end of file + return self.hybrid_search(query, size) + + def queryByEs(query, query_tags, logger): + # 获取EsSearchUtil实例 + es_search_util = EsSearchUtil(ES_CONFIG) + + # 执行混合搜索 + es_conn = es_search_util.es_pool.get_connection() + try: + # 向量搜索 + logger.info(f"\n=== 开始执行查询 ===") + logger.info(f"原始查询文本: {query}") + logger.info(f"查询标签: {query_tags}") + + logger.info("\n=== 向量搜索阶段 ===") + logger.info("1. 文本分词和向量化处理中...") + query_embedding = es_search_util.text_to_embedding(query) + logger.info(f"2. 生成的查询向量维度: {len(query_embedding)}") + logger.info(f"3. 前3维向量值: {query_embedding[:3]}") + + logger.info("4. 正在执行Elasticsearch向量搜索...") + vector_results = es_conn.search( + index=ES_CONFIG['index_name'], + body={ + "query": { + "script_score": { + "query": { + "bool": { + "should": [ + { + "terms": { + "tags.tags": query_tags + } + } + ], + "minimum_should_match": 1 + } + }, + "script": { + "source": "double score = cosineSimilarity(params.query_vector, 'embedding'); return score >= 0 ? score : 0", + "params": {"query_vector": query_embedding} + } + } + }, + "size": 3 + } + ) + logger.info(f"5. 向量搜索结果数量: {len(vector_results['hits']['hits'])}") + + # 文本精确搜索 + logger.info("\n=== 文本精确搜索阶段 ===") + logger.info("1. 正在执行Elasticsearch文本精确搜索...") + text_results = es_conn.search( + index=ES_CONFIG['index_name'], + body={ + "query": { + "bool": { + "must": [ + { + "match": { + "user_input": query + } + }, + { + "terms": { + "tags.tags": query_tags + } + } + ] + } + }, + "size": 3 + } + ) + logger.info(f"2. 文本搜索结果数量: {len(text_results['hits']['hits'])}") + + # 合并结果 + logger.info("\n=== 最终搜索结果 ===") + logger.info(f"向量搜索结果: {len(vector_results['hits']['hits'])}条") + for i, hit in enumerate(vector_results['hits']['hits'], 1): + logger.info(f" {i}. 文档ID: {hit['_id']}, 相似度分数: {hit['_score']:.2f}") + logger.info(f" 内容: {hit['_source']['user_input']}") + + logger.info("文本精确搜索结果:") + for i, hit in enumerate(text_results['hits']['hits']): + logger.info(f" {i + 1}. 文档ID: {hit['_id']}, 匹配分数: {hit['_score']:.2f}") + logger.info(f" 内容: {hit['_source']['user_input']}") + + # 去重处理:去除vector_results和text_results中重复的user_input + vector_sources = [hit['_source'] for hit in vector_results['hits']['hits']] + text_sources = [hit['_source'] for hit in text_results['hits']['hits']] + + # 构建去重后的结果 + unique_text_sources = [] + text_user_inputs = set() + + # 先处理text_results,保留所有 + for source in text_sources: + text_user_inputs.add(source['user_input']) + unique_text_sources.append(source) + + # 处理vector_results,只保留不在text_results中的 + unique_vector_sources = [] + for source in vector_sources: + if source['user_input'] not in text_user_inputs: + unique_vector_sources.append(source) + + # 计算优化掉的记录数量和节约的tokens + removed_count = len(vector_sources) - len(unique_vector_sources) + saved_tokens = sum(len(source['user_input']) for source in vector_sources + if source['user_input'] in text_user_inputs) + + logger.info(f"优化掉 {removed_count} 条重复记录,节约约 {saved_tokens} tokens") + + search_results = { + "vector_results": unique_vector_sources, + "text_results": unique_text_sources + } + return search_results + finally: + es_search_util.es_pool.release_connection(es_conn) \ No newline at end of file diff --git a/dsRag/Util/SearchUtil.py b/dsRag/Util/SearchUtil.py deleted file mode 100644 index 1c47d2b2..00000000 --- a/dsRag/Util/SearchUtil.py +++ /dev/null @@ -1,124 +0,0 @@ -from Config.Config import ES_CONFIG -from Util.EsSearchUtil import EsSearchUtil - - -def queryByEs(query, query_tags,logger): - # 获取EsSearchUtil实例 - es_search_util = EsSearchUtil(ES_CONFIG) - - # 执行混合搜索 - es_conn = es_search_util.es_pool.get_connection() - try: - # 向量搜索 - logger.info(f"\n=== 开始执行查询 ===") - logger.info(f"原始查询文本: {query}") - logger.info(f"查询标签: {query_tags}") - - logger.info("\n=== 向量搜索阶段 ===") - logger.info("1. 文本分词和向量化处理中...") - query_embedding = es_search_util.text_to_embedding(query) - logger.info(f"2. 生成的查询向量维度: {len(query_embedding)}") - logger.info(f"3. 前3维向量值: {query_embedding[:3]}") - - logger.info("4. 正在执行Elasticsearch向量搜索...") - vector_results = es_conn.search( - index=ES_CONFIG['index_name'], - body={ - "query": { - "script_score": { - "query": { - "bool": { - "should": [ - { - "terms": { - "tags.tags": query_tags - } - } - ], - "minimum_should_match": 1 - } - }, - "script": { - "source": "double score = cosineSimilarity(params.query_vector, 'embedding'); return score >= 0 ? score : 0", - "params": {"query_vector": query_embedding} - } - } - }, - "size": 3 - } - ) - logger.info(f"5. 向量搜索结果数量: {len(vector_results['hits']['hits'])}") - - # 文本精确搜索 - logger.info("\n=== 文本精确搜索阶段 ===") - logger.info("1. 正在执行Elasticsearch文本精确搜索...") - text_results = es_conn.search( - index=ES_CONFIG['index_name'], - body={ - "query": { - "bool": { - "must": [ - { - "match": { - "user_input": query - } - }, - { - "terms": { - "tags.tags": query_tags - } - } - ] - } - }, - "size": 3 - } - ) - logger.info(f"2. 文本搜索结果数量: {len(text_results['hits']['hits'])}") - - # 合并结果 - logger.info("\n=== 最终搜索结果 ===") - logger.info(f"向量搜索结果: {len(vector_results['hits']['hits'])}条") - for i, hit in enumerate(vector_results['hits']['hits'], 1): - logger.info(f" {i}. 文档ID: {hit['_id']}, 相似度分数: {hit['_score']:.2f}") - logger.info(f" 内容: {hit['_source']['user_input']}") - - logger.info("文本精确搜索结果:") - for i, hit in enumerate(text_results['hits']['hits']): - logger.info(f" {i + 1}. 文档ID: {hit['_id']}, 匹配分数: {hit['_score']:.2f}") - logger.info(f" 内容: {hit['_source']['user_input']}") - - # 去重处理:去除vector_results和text_results中重复的user_input - vector_sources = [hit['_source'] for hit in vector_results['hits']['hits']] - text_sources = [hit['_source'] for hit in text_results['hits']['hits']] - - # 构建去重后的结果 - unique_text_sources = [] - text_user_inputs = set() - - # 先处理text_results,保留所有 - for source in text_sources: - text_user_inputs.add(source['user_input']) - unique_text_sources.append(source) - - # 处理vector_results,只保留不在text_results中的 - unique_vector_sources = [] - for source in vector_sources: - if source['user_input'] not in text_user_inputs: - unique_vector_sources.append(source) - - # 计算优化掉的记录数量和节约的tokens - removed_count = len(vector_sources) - len(unique_vector_sources) - saved_tokens = sum(len(source['user_input']) for source in vector_sources - if source['user_input'] in text_user_inputs) - - logger.info(f"优化掉 {removed_count} 条重复记录,节约约 {saved_tokens} tokens") - - search_results = { - "vector_results": unique_vector_sources, - "text_results": unique_text_sources - } - return search_results - finally: - es_search_util.es_pool.release_connection(es_conn) - diff --git a/dsRag/Util/__pycache__/EsSearchUtil.cpython-310.pyc b/dsRag/Util/__pycache__/EsSearchUtil.cpython-310.pyc index 8d10dd8681357ae8e44821f8f98470b8fb12cf1c..5b37225ac2c0070aeda27e896b2c3ed68961e042 100644 GIT binary patch literal 6631 zcmb7JS#TW3d7hq`y#~7r5C9KRlsvW$a4Bjl!7naVB<1_N z_XMQGS#Wx~d;adf|GWR6)l|yT@XP;e{=};fYubO%p!*lW;2AvL4K!Titfb}Co8=hB zdPy%EIfJP=GiPGXC`HQATvX1PrC2$hilPVi+4vxj@)(cL>KGY3!7bj$le0!{$Q@c3#;Wx^wa6rF|H1}$hfUd9Of zzI9~Z{zJ~8gWo#%QfR5c3*UI@`7p8nsI<R6Ss!>W7tm-3!pEO>5S6i)1|Rw~$_ zSgpKJt(HRbP<70E2~ctOOwse5>dAVVrXKVByzsN7>R3_J2DBC8Yeo;_<@}jvCw&pfm>?_txjoAK;N?T5CO+|zEUl5da8uIJ^)@U=iBcK-|vp26cC zLgQ*Aq@2zf*UxD=gB#q$+vJkL=U6U+UYsY;i=qcMqZi{tJjK)K#d$v;0DOY413QPq zLNau~g83p!FlHF+6+mcpsn%SfDwY&jQBTJs5GF+6A3Xu&DD0kItHaFi6ZL zvmBX%r~~~=pcAtOV55OHj>n0$-hO;M;alAF`?w_`l74C-&HL~@Io^L+yP}g~^zO<{ z-r9}vB+jHc13U%l(zRdX*p|KO*OjluOKa#s7sp2Jp0z9IXO};^(YXG%jq^7@zI~~2>3n1E?@v@c-`j52JXesEx7!nW z&--3g@a=Z7!re0t%`RWRxCU7mkxN7CSl%lZoP5nc5hhQ&qIi7LDY(M-4y^11T^UoP z@e=*i44?`7p?P$`rt*4%5f}?oJ%`O0tJXnhqsNMsyqF9lrDD10*Wfj@rC%;K-uhwV z^@Y11eY7+)*LeLZD7*Ws*~Z+DS1$Z$`POe5zk6@g4y_I?hcoNKM4Lr|bSE^%TtAFq zasrl=OS#=pwJ5Rd>=Xh0=^S&VK5+<#Pl&jn=g;5yZ zm@7iFSUFw|`v3})cf4ZV6&nBs+c!P8bX;ttg@o#wCccW-C)s_YFLUp^RdEup7esX& zE75x~KlU;i!OPtik)4u6Eb3mXQLT|y}Xi~0y1o; zoz{d-?*JD5WBrtVRPijs#1_CpQ}pbP&?MPlik3^3im#y;TDwYckA-S^V$ZV}dxXl? z>N|U=_uVtT<5wNGe9Yw>?mN4yR4wF7-X3&X%g3-%{?;6wv_Kx6jw6q5O)($Gon}4= zm}d9(99s_!uT~C?l3U5>;z6tl4PGq6CSpooyw3RrX+5!C9()Tp7e>a5?y-E%EEQ_L zWPtdJ+@fBbfc4qbH;JGyLLBjOR{r#uqc)cN0w)29ZPZ8}ty@F?9(?nXXe?upnR=RS zFicjz|9@xN3Ptz=DNE}|jG<>Q9*_16v<0vQ3S^M-6tyBFKOGo&N9J`No!5(O3jTVW zjq7OPwc$CHBoTGsM;Ab5eb10@{H2ruNh%PdL!{ypdZ!`R{$|RhJm=3W}+&*OOKF=DA5q8 z*hOFa70&i4b1nMiywVf$A4-s^mcNm~^P8CWHlu+@(4+D(Kf~YWQ7aA~0sru6eRGdU zdzSY|lBC*}q>TZj11X_J%)bZMGj0T|h(E~6l(LE(mh{6hl88c!LJnNk1eA6Z>*4T; ze6gDv@`jkOso$o3HF14XOBJ_X0i;Lj7G-hi@IcC@9#Xo1dk7&S$kj-dB<)u!8R&~T ze3SHVvJa&UOwQ)vY%+T74*+b3#HEYx-(9@4^x=n%xfvCpKfXPqB9IzBvh?PKrTKqo zed`3%X2h1V8@1KNi7 zj@l}^-?{nD^82%DUj@JXcjwd%puGfYnr_NjxL&Q~dtp*Ct2u&u_&TT+Asfu;m>X_NPR(fxxZnk{pD}9xj-_qtmv3FYbMtJI^KZP{_|e<#PycV>NF#In z$qC4)$@5>|!u15&Frqj-0tZ999kG{Y-n2&)ks}}9UZW9zzM#8>+dg_Os_=9MRq;xis{72d;W zeO${`#2?oej74qAoMZLx2l@i+N?n-IgPZSMpfQFU=^2}C$cXG23z0xW0Ex(bqJA`p zO!uyc1rg+i@$*OodUi_4_2ziYXWW9zLZ(OQhjLqEj8YH^KYSe@3?g_B&Fg$v&cyk8 z^f%1vQ&EX^Bd|vKChUP{7501D=iR&tUy|R;xA6M{1U>BD!3>-jcW}fcj{AWl_9ZyJ z(#B!0J_-No6@$L|@y-b;Q{AV%^1#dbRD3EScmG-tzpU{Gf&_n%xWpeq?_tn0a@o8R zZSTGnvl-0Z*Wu);KgjjlnmD>n2n-K(VR$4!Qn!)z?qJ#eXJh#jf0Sp>X&oBm-O=KY z$=NnmIomg;O(p%rw02Po`f!T=snm?NUjNCNp2!2sGPuP8lD z2TA_aygs!qSod8mFkk_xz@#(4XLXIfr)B=)aE&6NwR`t&yK(z0vcP75k%7U+bdB!# zjW@2+r>&YECU&Sib&yOm+5>Ig#=gD?k6|r5pbw_Zq>H$FsHyIE|UtD0}+J zB^ADqem34Yw>qOccNM=@!W73iBP##cGTFB_YFbGsCa@J3LWAv<(kgr0#WX1Oc4$~S-TJ2r$9m% z$GXEUGD`DA(f4W;?xdU)_m=zuLS66^;NiWTxUB1=4il}pUv}O%SSV(eNy0Ta|P=lz^EnlHoeEZ-&kZL$ZUK_&f zc6<5q%{#YmBd3O*0PU!qi>ixv-N^1dF0~I^;MaKVeB;J%u{h?aOL6TU!Xd9g-CRw^035~hM=}AKT16=<4e3DV@S}U7^uvUjP z%A@{`HlhS@SgUXA31{*~_Uz<-?~4+@FFllafiV9yHS!|#V!J1Ybi0)KTS%Rk(0t)e zq@>BC{l-QQk^W$EnoHuE_f?GH8@=j6COIG(R!nR=6O)mKzjq$<2Fi*G`#1m3dhz zlOhiy$lh~l)OVVfp&BYIDjlIDwR6c0Sy7PofLUerY9E>RXr+Zn=8e(rSgZ++ zJh2HKZN5K|D+kPv0lC34?q{3X!+0!x6WfOO24e#=QF)3RX_iJWjuyL}9l-o1LtXw& zbBkWz@Wt9idb|TvK#!lpOZ6=(a8WJNaYD;+P+Y5(sGoG4Q?SzQ#b5^_@FFb+F7L^b=+Jb3uP5vuc_ba_#reAzrIDkiRfK$a^JNy9{^ vYr%Y)|3{&$vBM_Wlj;`VrFg%G0i8z$E@GI<3Wjxf4kO9ZC=fyO*r5JD;i-N$ delta 1034 zcmZ8fzi-n}5cae4(}|tbr6DQ(VHag^OG8T$q99QUm5`u92s$7$L}}h5brmP!*#X+6 z16=@hC~snG$F>uSSef}7EIK2mZme+UphB_a^WAs%eV^~?eV_X^rnfasmEbvkR$W^j z*wshK;qKJp7Llk#<4x7eI2j@_X)nvPed6d$-^(#+J-<%?CF=t;L6bDKBSWRoG*xMa zW_J|FV8-4cu+C9nMYH3w;iCJ<;mc%zQGMz`SZQ{gfo~CFTvIJz{2w9R2P>t%4rDzTmjo!5Ug=934W402;eAax<3&=4b97x>kjmGCl*~C8K7w{v8a%Cor zy=rp`&^SOG+uRb}#D_UdVMOt}UZ^x#%a-{?fI@}39_*NW$?8Fy`+q3ND~lV+F$1Of zSTwomcI+4*7wgGU!vq8*A{QG+Eb%enu!kYP{z8fW%*P8Q*|L zox7cY<7Nj+7)6djidE*VGD_W6 z!^!rw#iMjGW8zp8EU}sv_t7tmd$pjx7OGvJ@v7VEbOU}3a9fG&_&jp(lmUn~JN4=- znb@~_3+e+xZ1FxbF>@1e)5-J~ zuLc_(7A^a)2u{t5BW%+0S)2ZnCU*Y@3(G8oL3n+< zn<*(pQj@Bt8fsF{MLqkx_*5*HCZX$S4c78qui~{SYx-OPEEFfjqR5u66cSja8!T9E pHyVudyHN8Hu~J$dLXUrZO)M?(xnvr7IZIR}OLSQ$d7=_S{tXsA+a>@2 diff --git a/dsRag/Util/__pycache__/SearchUtil.cpython-310.pyc b/dsRag/Util/__pycache__/SearchUtil.cpython-310.pyc deleted file mode 100644 index 006f496a706bf6e5625f7b1bf16d1c4062ec5e4c..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 3066 zcma)8-ESMm5#QZAp2!nPQMN2gj_h2%Y%_5fevL&}ts-#U2)KV=asZ#^769j3I#A;sqKiq3gQbHf{ke472c4zk}lCoo< zC*AJO?9BXTc4miXClV@xXKW-l`D`CT|KUXELxA`hyv~y_P(*PNm9UO6Uy~v!NxB4U ziOO?GmuX}irIjs^L}?tvPLF3_dS&bvFaHvV_-SX{$g%w7tFBqhOI)P+?1Gv5z*8NMQstDA*fU3{|umSIV=z}eP_uYpVr;pGfy5C0}uLBt{fo@}T35)}P z(e)6FM_U-${hrXrRwcgcM%y0Jo>_7Voq1w{RHN0Hu>QCoy^81)evCfJPl-MStAh~F zp{w$$(z1RSxM|?-Z=dAUzlHwOO^gl?0pY0*glGIH?b*w1w^5G#-zeYHXK7{*wPQF1 z=Ui3kbAsDK72NIus>a>eEV_dHZt$X~npi+IubV^;mh2I@g3A}QTiui-Xr|#jEszEwT+qD$A73_`FL}Ev3~7u zo1cE50o#KMjcRXQ<6DARe{a6N^2xB~C1InlJPl-zXSBwjuhkcp z8-JM#p+ecl?JsIyUkAWFAao+5)vhd^;Jnb#%*L>`#XEub3=Gc$-pvndD}N6oJ6+5< zu9S@KHRkj|EJZho@z8)|!Ji-smlX(L+Rc1+6{H%n$Q$4qzna5=|v zB_}iG8EiUx!N|KdJE9#jN@op9O{;Jy{i4PU*JD=G#OR22RQqX@7}k#J@lMq=2^7cn zSl$Sf&@yl$LIO5KRB6Jx!Sg_N%!R;ofqIxJ95~#{iph0SL*M623Z}} z0(sJO9q%!a6O|L@v+?;ijrCg(zdI zh}O9J<(-WUs6Z%TIFn&bSHd!>NSWj3#Kiy;@YF6}s@?hul)JKFLcBqaA#PCTafJ~K zR|o*71VnA|tF22*+xGLk%(fr?YbwJ#4)1sTb{3G2&L~yvV~PgR+hmH2qZ^pAVJ31Uf_t~ha;zkC#dtf&EA0NSIj;SH3zF-=L!@2w{@cZGn*+-2kOb9>9~2j zR34q+pxlE#RM{^|>gUs^oy^IiozE4WQCMtCKGbK%FD>ie_Js|!R>D8tL)Z`uXZV8G z;*VN70UOR?7~bU3_iRTG{S-0cqVOzc@oNa9DMB%PQxL|#62{;=LMj>-kUSSG2Ez@k zr(ENrn{C&b6CM_KCWxIZ7?yFd%tp&VI=nK-jDq4=y{BOLUS;UV&S6`{xbG5Z-JWHFjjV{M$rM@C z_cV8!ht}K+63i&s7YquXdzK4*J;#agjDuv;$>y1cS$~wf6sEqF+6eHR>o7!#D#=(S zDV%~)#VYQ@1F({@f{8>3fn9P>{B_CS!<>L3xhsDMt3LU^%H5%Ts-)tkG$x=YWR>Rp zgP>CKw;?w9(pEGalDC2 Date: Mon, 30 Jun 2025 07:51:39 +0800 Subject: [PATCH 3/8] 'commit' --- dsRag/Start.py | 3 --- dsRag/Util/ModelUtil.py | 4 ---- 2 files changed, 7 deletions(-) delete mode 100644 dsRag/Util/ModelUtil.py diff --git a/dsRag/Start.py b/dsRag/Start.py index 13b8d172..587a86f4 100644 --- a/dsRag/Start.py +++ b/dsRag/Start.py @@ -1,13 +1,10 @@ import json -import logging -import os import subprocess import tempfile import urllib.parse import uuid import warnings from io import BytesIO -from logging.handlers import RotatingFileHandler import fastapi import uvicorn diff --git a/dsRag/Util/ModelUtil.py b/dsRag/Util/ModelUtil.py deleted file mode 100644 index 2d7907ef..00000000 --- a/dsRag/Util/ModelUtil.py +++ /dev/null @@ -1,4 +0,0 @@ -from typing import List - -from pydantic import BaseModel, Field - From 4578435c4099df4a8cac54856f6fc14b37553757 Mon Sep 17 00:00:00 2001 From: HuangHai <10402852@qq.com> Date: Mon, 30 Jun 2025 07:52:11 +0800 Subject: [PATCH 4/8] 'commit' --- dsRag/Util/SplitDocxUtil.py | 13 ------------- .../__pycache__/SplitDocxUtil.cpython-310.pyc | Bin 854 -> 0 bytes 2 files changed, 13 deletions(-) delete mode 100644 dsRag/Util/SplitDocxUtil.py delete mode 100644 dsRag/Util/__pycache__/SplitDocxUtil.cpython-310.pyc diff --git a/dsRag/Util/SplitDocxUtil.py b/dsRag/Util/SplitDocxUtil.py deleted file mode 100644 index 5774b3fc..00000000 --- a/dsRag/Util/SplitDocxUtil.py +++ /dev/null @@ -1,13 +0,0 @@ -import docx - - -class SplitDocxUtil: - @staticmethod - def read_docx(file_path): - """读取docx文件内容""" - try: - doc = docx.Document(file_path) - return "\n".join([para.text for para in doc.paragraphs if para.text]) - except Exception as e: - print(f"读取docx文件出错: {str(e)}") - return "" \ No newline at end of file diff --git a/dsRag/Util/__pycache__/SplitDocxUtil.cpython-310.pyc b/dsRag/Util/__pycache__/SplitDocxUtil.cpython-310.pyc deleted file mode 100644 index 9836806cb859d22219a444a378e839496cbf5e9c..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 854 zcmZ`%&x_MQ6n-;FyJ^_U+JhEPUW2%LUP=+Hcq?L%VyJ|WiEV1yq%%{tEv*P$-J=My zAR_D@z35HQ{ZHnq75Csj;6Z%TU2BW_;C*?Mc|X4QC26r}16u!kvppgJpY>tY3_RRG z7ncwiFp_`=w}DY+>;W@o)FX_z;Fu?P5A{&vsx9lw_%;H}29HJtBaH6BjCdw9bhRqF zSrW@y8V>KtI0=!;(sT<=@+P`yB8<;VI3yD|B2yfGNKeU3NXMvQ%OlbN7fg`m=qiCJ zB{1HSIcCcIgwDuoc!Nlw0r!FBrt~podG{>ZCA;8k3@+e^Z|{%i&kq>V`FgPb<@m$= z+0*&EkBZo9%qV7Rb}qWm3$|ba53DIcF2JeA7qm}Zcnc0uri_Ntp^9!__slHd0XMN^ z0Y9$QcA88yQvL`ZcX--~LW$Acpw-lRG}qMG&PK|vB(acT+Rdt38=??4)tevo-k^53;sU zg Date: Mon, 30 Jun 2025 07:52:25 +0800 Subject: [PATCH 5/8] 'commit' --- dsRag/Util/PdfUtil.py | 34 ------------------ .../Util/__pycache__/PdfUtil.cpython-310.pyc | Bin 892 -> 0 bytes 2 files changed, 34 deletions(-) delete mode 100644 dsRag/Util/PdfUtil.py delete mode 100644 dsRag/Util/__pycache__/PdfUtil.cpython-310.pyc diff --git a/dsRag/Util/PdfUtil.py b/dsRag/Util/PdfUtil.py deleted file mode 100644 index bdf85000..00000000 --- a/dsRag/Util/PdfUtil.py +++ /dev/null @@ -1,34 +0,0 @@ -import PyPDF2 -import os - - -def read_pdf_file(file_path): - """ - 读取PDF文件内容 - :param file_path: PDF文件路径 - :return: 文档文本内容 - """ - try: - # 检查文件是否存在 - if not os.path.exists(file_path): - raise FileNotFoundError(f"文件 {file_path} 不存在") - - # 检查文件是否为PDF - if not file_path.lower().endswith('.pdf'): - raise ValueError("仅支持.pdf格式的文件") - - text = "" - - # 以二进制模式打开PDF文件 - with open(file_path, 'rb') as file: - reader = PyPDF2.PdfReader(file) - - # 逐页读取内容 - for page in reader.pages: - text += page.extract_text() + "\n" - - return text.strip() - - except Exception as e: - print(f"读取PDF文件时出错: {str(e)}") - return None \ No newline at end of file diff --git a/dsRag/Util/__pycache__/PdfUtil.cpython-310.pyc b/dsRag/Util/__pycache__/PdfUtil.cpython-310.pyc deleted file mode 100644 index d539d23446d721a333a31b8c857515f6832f706c..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 892 zcmZ8g%}*0S6n`_j+iuskAO|tXp$9MZK%y6u#;A!vJP`uX#s))bXMkequCuc#kgZV* za)EFm_yHtsylA3w!w-mmfPcbVr4{}KW7Id45aQdsk9qI+-oE!H<0cXgAg#>hrj`wW zANp{bC?0Mi`@JX_Fj9hCTaz4Nlo<<Ao0#}q-W>)%55zoVGHEn%9} zX$TGCcSwVg5H^6(Eu%)Bp0kkF5%+K-vkoo-T!Su)W({T_5frY^^Wt z@9s38FE&4ZIf`VwyvWa_CW<9K?&alFCUt^3__W>pw$MfiE(1}{q%gnLc%$X!`%_hd zDE_HkI~C}CruILtG(Ws-ZoZv2`#d%g=-b)fU2Lswx0au3PpiJy+*|$mdZC@BKvD5P z5oajqLFj)Cw_fZtm%bjZy~?E0or=1?GIg^m!e@)V^wrrx?0?9WgKkh}146h$StYl^ zg|c~>`IRDKoxAx`z}rF7^>|rDhUakDE4kP(n;7AF#zoHZ@{`=p#rdoh`GOpmXei5< zqUfpEz-)ngvgnqT<%wchDgy)QNG^7=J!KvxnA)KtqFobPo#cw}cOcH;Xus0^nK9;P zUGWgruyCh&L83L1pB%d*i={E_M(aLrP9=1SaopCpu4rMle-QgGcF5C{JMS_zJ3xm*d%WJ E18r6ibN~PV From 87e02601c8d56029ed57257184b4be30f0d7e3a9 Mon Sep 17 00:00:00 2001 From: HuangHai <10402852@qq.com> Date: Mon, 30 Jun 2025 07:53:08 +0800 Subject: [PATCH 6/8] 'commit' --- dsRag/ElasticSearch/T2_SplitTxt.py | 9 +-- dsRag/Util/WordImageUtil.py | 72 ------------------ .../__pycache__/WordImageUtil.cpython-310.pyc | Bin 1766 -> 0 bytes 3 files changed, 3 insertions(+), 78 deletions(-) delete mode 100644 dsRag/Util/WordImageUtil.py delete mode 100644 dsRag/Util/__pycache__/WordImageUtil.cpython-310.pyc diff --git a/dsRag/ElasticSearch/T2_SplitTxt.py b/dsRag/ElasticSearch/T2_SplitTxt.py index 2d1c224f..2ecc8660 100644 --- a/dsRag/ElasticSearch/T2_SplitTxt.py +++ b/dsRag/ElasticSearch/T2_SplitTxt.py @@ -1,13 +1,10 @@ -import re -import warnings - -import docx - import os +import re import shutil -import uuid +import warnings import zipfile +import docx from docx import Document from docx.oxml.ns import nsmap diff --git a/dsRag/Util/WordImageUtil.py b/dsRag/Util/WordImageUtil.py deleted file mode 100644 index a887f482..00000000 --- a/dsRag/Util/WordImageUtil.py +++ /dev/null @@ -1,72 +0,0 @@ -import os -import shutil -import uuid -import zipfile - -from docx import Document -from docx.oxml.ns import nsmap - - -def extract_images_from_docx(docx_path, output_folder): - """ - 从docx提取图片并记录位置 - :param docx_path: Word文档路径 - :param output_folder: 图片输出文件夹 - :return: 包含图片路径和位置的列表 - """ - # 创建一个List 记录每个图片的名称和序号 - image_data = [] - # 创建临时解压目录 - temp_dir = os.path.join(output_folder, "temp_docx") - os.makedirs(temp_dir, exist_ok=True) - - # 解压docx文件 - with zipfile.ZipFile(docx_path, 'r') as zip_ref: - zip_ref.extractall(temp_dir) - - # 读取主文档关系 - with open(os.path.join(temp_dir, 'word', '_rels', 'document.xml.rels'), 'r') as rels_file: - rels_content = rels_file.read() - - # 加载主文档 - doc = Document(docx_path) - img_counter = 1 - - # 遍历所有段落 - for para_idx, paragraph in enumerate(doc.paragraphs): - for run_idx, run in enumerate(paragraph.runs): - # 检查运行中的图形 - for element in run._element: - if element.tag.endswith('drawing'): - # 提取图片关系ID - blip = element.find('.//a:blip', namespaces=nsmap) - if blip is not None: - embed_id = blip.get('{%s}embed' % nsmap['r']) - - # 从关系文件中获取图片文件名 - rel_entry = f'`xYNL#B)WLls zFvU5^G>|l*Q9`wA1S78ZO9C#-$xbcPh2AdAK?e}=%S+y@?Z#}!A%#CT5V_zcP#g*+-6fYpR3i-wrEOy*EF z`CrCjvOhr^bRsAHNjjP1T+614-6=7p;wt!Q#nURBoaQtA*&KD|vKcY6i^TLUx=FHG zLHhGzR$+bM8G}T!IsQ`rkeCv4IaEg&wE8y6j0@`*oC_br&dbB<2m(zp)%^piY%aif zaQAB(x3(VcTp#S*80_AA^wpKYo!k4jejn_8_h9eZqrF>&*-GLvH(XGrPU7;7m4&l0 zqYrOfd6@sW|L1RmdzYR!i@QAOa%VjbsK-_ohHm%oeLJ}F7aVzT_x9kYJ41Ws@h*$N z*@DR<)Y60?(1--MuK z$zfiQ_Qt`c==bJU7&MjYF?2PO70TRAKWg`?&E;ixJ&`T4y<)yvq=dY%j z7oPKIuW|IW7r5MyqjbYh7EaJ(i%;-k&p6|e&ET9?I*_-V|rVhxh?Jnfz;!~i=@sxmooJ%Wz~xydggMl zQ{cCmn{0q@){RnWI9}kXw=6a8wgH8RrklRHbls0A)06`Z0+bczB>^NQ%R$PSpU83n z7Nv)Vp0DKvVRH)Hb{=S#}c$hRSlf(d9nG){2vhvdluxJa1`UR-d(jU3;O_ zw&{m%+jFSPUE3HI%4fF99BQ&|S5*}p1%hoAY%7j9iMK= z>a*_OF?0xN)n#bMFU>erBd07N(Hq>k&Fg%ABRubV&(UB?ZrflXrJ zIx%nqTbhAQJcccphc%0g5w&OHI<|;G$QR^3`P*F7F?$OfE>+nmEbMhfUQuKMh}2ab uhuvl=wi&CJ5~W4|6bFkGpCfOoW5&@irrp4MM?I&s4#YqPR?~tIEbU)ev?d4u From 7df4b5bae956ad8a43f2311c4ddafd504574d745 Mon Sep 17 00:00:00 2001 From: HuangHai <10402852@qq.com> Date: Mon, 30 Jun 2025 08:02:20 +0800 Subject: [PATCH 7/8] 'commit' --- dsRag/ElasticSearch/T4_SelectAllData.py | 2 +- dsRag/ElasticSearch/T6_XiangLiangQuery.py | 19 +++++++------------ 2 files changed, 8 insertions(+), 13 deletions(-) diff --git a/dsRag/ElasticSearch/T4_SelectAllData.py b/dsRag/ElasticSearch/T4_SelectAllData.py index b389c5de..99418857 100644 --- a/dsRag/ElasticSearch/T4_SelectAllData.py +++ b/dsRag/ElasticSearch/T4_SelectAllData.py @@ -1,8 +1,8 @@ import warnings from elasticsearch import Elasticsearch + from Config import Config -import urllib3 # 抑制HTTPS相关警告 warnings.filterwarnings('ignore', message='Connecting to .* using TLS with verify_certs=False is insecure') diff --git a/dsRag/ElasticSearch/T6_XiangLiangQuery.py b/dsRag/ElasticSearch/T6_XiangLiangQuery.py index 2c0a9a87..91a9b3d0 100644 --- a/dsRag/ElasticSearch/T6_XiangLiangQuery.py +++ b/dsRag/ElasticSearch/T6_XiangLiangQuery.py @@ -14,9 +14,7 @@ esClient = EsSearchUtil(ES_CONFIG) warnings.filterwarnings('ignore', message='Connecting to .* using TLS with verify_certs=False is insecure') warnings.filterwarnings('ignore', message='Unverified HTTPS request is being made to host') - - -def main(): +if __name__ == "__main__": # 测试查询 query = "小学数学中有哪些模型" query_tags = ["MATH_1"] # 默认搜索标签,可修改 @@ -32,7 +30,7 @@ def main(): query_embedding = esClient.text_to_embedding(query) print(f"2. 生成的查询向量维度: {len(query_embedding)}") print(f"3. 前3维向量值: {query_embedding[:3]}") - + print("4. 正在执行Elasticsearch向量搜索...") vector_results = es_conn.search( index=ES_CONFIG['index_name'], @@ -61,7 +59,7 @@ def main(): } ) print(f"5. 向量搜索结果数量: {len(vector_results['hits']['hits'])}") - + # 文本精确搜索 print("\n=== 文本精确搜索阶段 ===") print("1. 正在执行Elasticsearch文本精确搜索...") @@ -88,7 +86,7 @@ def main(): } ) print(f"2. 文本搜索结果数量: {len(text_results['hits']['hits'])}") - + # 打印详细结果 print("\n=== 最终搜索结果 ===") print(f" 向量搜索结果: {len(vector_results['hits']['hits'])}条") @@ -96,15 +94,12 @@ def main(): print(f" {i}. 文档ID: {hit['_id']}, 相似度分数: {hit['_score']:.2f}") print(f" 内容: {hit['_source']['user_input']}") # print(f" 详细: {hit['_source']['tags']['full_content']}") - + print("\n文本精确搜索结果:") for i, hit in enumerate(text_results['hits']['hits']): - print(f" {i+1}. 文档ID: {hit['_id']}, 匹配分数: {hit['_score']:.2f}") + print(f" {i + 1}. 文档ID: {hit['_id']}, 匹配分数: {hit['_score']:.2f}") print(f" 内容: {hit['_source']['user_input']}") # print(f" 详细: {hit['_source']['tags']['full_content']}") - + finally: esClient.es_pool.release_connection(es_conn) - -if __name__ == "__main__": - main() From 398ceb887bd3ba99aff009b10babfbabdb0cbdc0 Mon Sep 17 00:00:00 2001 From: HuangHai <10402852@qq.com> Date: Mon, 30 Jun 2025 08:37:13 +0800 Subject: [PATCH 8/8] 'commit' --- dsRag/Start.py | 4 +- dsRag/Util/EsSearchUtil.py | 54 +++++------------- .../__pycache__/EsSearchUtil.cpython-310.pyc | Bin 6631 -> 5791 bytes 3 files changed, 17 insertions(+), 41 deletions(-) diff --git a/dsRag/Start.py b/dsRag/Start.py index 587a86f4..ca0b5094 100644 --- a/dsRag/Start.py +++ b/dsRag/Start.py @@ -114,11 +114,11 @@ async def rag(request: fastapi.Request): query = data.get('query', '') query_tags = data.get('tags', []) # 调用es进行混合搜索 - search_results = queryByEs(query, query_tags, logger) + search_results = EsSearchUtil.queryByEs(query, query_tags, logger) # 构建提示词 context = "\n".join([ f"结果{i + 1}: {res['tags']['full_content']}" - for i, res in enumerate(search_results['vector_results'] + search_results['text_results']) + for i, res in enumerate(search_results['text_results']) ]) # 添加图片识别提示 prompt = f""" diff --git a/dsRag/Util/EsSearchUtil.py b/dsRag/Util/EsSearchUtil.py index 21fe14e5..09d1c569 100644 --- a/dsRag/Util/EsSearchUtil.py +++ b/dsRag/Util/EsSearchUtil.py @@ -201,47 +201,23 @@ class EsSearchUtil: ) logger.info(f"2. 文本搜索结果数量: {len(text_results['hits']['hits'])}") - # 合并结果 - logger.info("\n=== 最终搜索结果 ===") - logger.info(f"向量搜索结果: {len(vector_results['hits']['hits'])}条") - for i, hit in enumerate(vector_results['hits']['hits'], 1): - logger.info(f" {i}. 文档ID: {hit['_id']}, 相似度分数: {hit['_score']:.2f}") - logger.info(f" 内容: {hit['_source']['user_input']}") - - logger.info("文本精确搜索结果:") - for i, hit in enumerate(text_results['hits']['hits']): - logger.info(f" {i + 1}. 文档ID: {hit['_id']}, 匹配分数: {hit['_score']:.2f}") - logger.info(f" 内容: {hit['_source']['user_input']}") - - # 去重处理:去除vector_results和text_results中重复的user_input - vector_sources = [hit['_source'] for hit in vector_results['hits']['hits']] - text_sources = [hit['_source'] for hit in text_results['hits']['hits']] - - # 构建去重后的结果 - unique_text_sources = [] - text_user_inputs = set() - - # 先处理text_results,保留所有 - for source in text_sources: - text_user_inputs.add(source['user_input']) - unique_text_sources.append(source) - - # 处理vector_results,只保留不在text_results中的 - unique_vector_sources = [] - for source in vector_sources: - if source['user_input'] not in text_user_inputs: - unique_vector_sources.append(source) - - # 计算优化掉的记录数量和节约的tokens - removed_count = len(vector_sources) - len(unique_vector_sources) - saved_tokens = sum(len(source['user_input']) for source in vector_sources - if source['user_input'] in text_user_inputs) - - logger.info(f"优化掉 {removed_count} 条重复记录,节约约 {saved_tokens} tokens") + # 合并vector和text结果 + all_sources = [hit['_source'] for hit in vector_results['hits']['hits']] + \ + [hit['_source'] for hit in text_results['hits']['hits']] + + # 去重处理 + unique_sources = [] + seen_user_inputs = set() + + for source in all_sources: + if source['user_input'] not in seen_user_inputs: + seen_user_inputs.add(source['user_input']) + unique_sources.append(source) + + logger.info(f"合并后去重结果数量: {len(unique_sources)}条") search_results = { - "vector_results": unique_vector_sources, - "text_results": unique_text_sources + "text_results": unique_sources } return search_results finally: diff --git a/dsRag/Util/__pycache__/EsSearchUtil.cpython-310.pyc b/dsRag/Util/__pycache__/EsSearchUtil.cpython-310.pyc index 5b37225ac2c0070aeda27e896b2c3ed68961e042..b8660cd06a341531f537f6061a2ff2d6bcf4a41c 100644 GIT binary patch delta 588 zcmY+B&ubGw6vyZ7EZgjEvKU>H4YV{-my&}AQ4xw2q^UQxsGtU9T{AHP?T8| zB3ArS3xgnNHMx3H0)q7B!JGX9g4d)3JbKsJR73~fcjo(f@4=gy-Q(|$8qG{bLt>fV z-PIq(j|S7B%%AED%A$aNrN5)U2Uy~FGi!E65LHBy34Q`XAqt@bF=lv8>f#=X<$L44 zW}JchWR?^kBBl_lfRd=74v^A>P5sOAC{C@QSPfGh6l*M1K{w1;XF8cFAg0|0f55b^ zy61ZUh^bwT>B*`6l}TNh910btcHlK4hqrLT$q}a3ZHUt>eHBdx)0vzkF8}B5pt9Yc z&YqHZRyoK|o3rX*$_bj0U!(k{X~Q@E$Xq^fK6!%+iIfsq7Q)-6Ilq&*OG2UV|2&n7VRvn)PQA5DVHA?(!-cp~4d`i-=#Bb$SrcG>u{7aSPw=@Ur6K{?-la=ne_#(^NWR7zX8mqpt%45 delta 1412 zcma)6U2NM_6u#H?+{8(oG)+SbjJe&KE@Lg-x^_}oCc(O)y=;R5lOjeI)n3V9PIGOS zHFA`0>DCTf2w0mYHDL{)N<*j`ln1sIUU@^}ae3ioPF5tuD^HAbozkB-toZnx?|%1u z_nc$rPEWmGO#6HSf-Cu3^1bts??lss?^+*wV*D_G1tfQrS-Psb zHgVZsV207T-S03tSI+J1>oi@!-EF$%J?#j*F7P6drF(%Z^BdXs0vZK$C*_nG!1E~c zi<4mAx=7QeI7oDDk$BN`$^K_(@=clj|5FBK{+4^>-f6U1#4wbX6Y}y)4!j}jz@9Ob z^Xl#bx`9j&9K)aU&7wuY^wE<$)p!>@rjL3>oOv18qA4_Z4bCi@z(q{Xnc^1SY3Qv` zz+lA!ZbLc0?wvV{E?~J&J}~W~2?}&f0jD1aO+QrHZw9~~D(`SaMbj$}l$czz*?bl` z!}OUPT^ZRHeTbriD}VS8GkN1V`YOa?F{xIXsz02q6-)IWzN|mAYG0HjfQ=BW+r`h; zWkQFZ}p;b=7{bWY1o& z-Mlv>#rcFbnkGt??HxII!4QE3IpDIdU$d9*!|tYpmNv+!Vzg5c$Xfcg{{7GO%JQ}| zWEoHnI2!ic{k3cJ4FPLcaBv4X4Y>7gunRx3rh{kt&Ok_Br>29NH#R&c?4)4?OqDus zg0Y(U)_gxW+FmeMkV!|3O)R5!1fxTQ(NQBR;I|P*8Aiq)lX5RjK{>kz^`=^_Kt`Q9e+TnvxhztEw`p zr_yRPXcb!y