From fa674aa9d38cf3c44114f3b0a854729158f85907 Mon Sep 17 00:00:00 2001
From: HuangHai <10402852@qq.com>
Date: Sun, 29 Jun 2025 19:05:57 +0800
Subject: [PATCH] Extract the ES search and LLM helpers from Start.py into
 Util/SearchUtil.py

---
 dsRag/Start.py                                | 198 +-----------------
 dsRag/Util/SearchUtil.py                      | 189 +++++++++++++++++
 .../__pycache__/SearchUtil.cpython-310.pyc    | Bin 0 -> 5466 bytes
 3 files changed, 190 insertions(+), 197 deletions(-)
 create mode 100644 dsRag/Util/SearchUtil.py
 create mode 100644 dsRag/Util/__pycache__/SearchUtil.cpython-310.pyc

diff --git a/dsRag/Start.py b/dsRag/Start.py
index 416cb578..1ac2e0ff 100644
--- a/dsRag/Start.py
+++ b/dsRag/Start.py
@@ -1,24 +1,6 @@
-import logging
-import os
-import subprocess
-import tempfile
 import urllib.parse
-import uuid
-from io import BytesIO
-from logging.handlers import RotatingFileHandler
-from typing import List
-
-import uvicorn
-from fastapi import FastAPI, Request, HTTPException
-from fastapi.staticfiles import StaticFiles
-from pydantic import BaseModel, Field
-from starlette.responses import StreamingResponse
-
-from Config.Config import ES_CONFIG
-import warnings
-from Util.ALiYunUtil import ALiYunUtil
-from Util.EsSearchUtil import EsSearchUtil
+from Util.SearchUtil import *
 
 # Initialize logging
 logger = logging.getLogger(__name__)
 logger.setLevel(logging.INFO)
@@ -42,10 +24,6 @@
 logger.addHandler(file_handler)
 logger.addHandler(console_handler)
 
-# Convert an HTML file to a Word document
-def html_to_word_pandoc(html_file, output_file):
-    subprocess.run(['pandoc', html_file, '-o', output_file])
-
 
 async def lifespan(app: FastAPI):
     # Initialize the Aliyun LLM client
@@ -63,15 +41,6 @@
 app = FastAPI(lifespan=lifespan)
 app.mount("/static", StaticFiles(directory="Static"), name="static")
 
-class QueryRequest(BaseModel):
-    query: str = Field(..., description="The user's question")
-    documents: List[str] = Field(..., description="Documents uploaded by the user")
-
-
-class SaveWordRequest(BaseModel):
-    html: str = Field(..., description="HTML content to save as a Word document")
-
-
 @app.post("/api/save-word")
 async def save_to_word(request: Request):
     output_file = None
@@ -122,171 +91,6 @@
         logger.warning(f"Failed to clean up temp files: {str(e)}")
 
 
-def queryByEs(query, query_tags):
-    # Get an EsSearchUtil instance
-    es_search_util = EsSearchUtil(ES_CONFIG)
-
-    # Perform the hybrid search
-    es_conn = es_search_util.es_pool.get_connection()
-    try:
-        # Vector search
-        logger.info("\n=== Query execution started ===")
-        logger.info(f"Original query text: {query}")
-        logger.info(f"Query tags: {query_tags}")
-
-        logger.info("\n=== Vector search phase ===")
-        logger.info("1. Tokenizing and embedding the query text...")
-        query_embedding = es_search_util.text_to_embedding(query)
-        logger.info(f"2. Generated query vector dimension: {len(query_embedding)}")
-        logger.info(f"3. First 3 vector values: {query_embedding[:3]}")
-
-        logger.info("4. Running the Elasticsearch vector search...")
-        vector_results = es_conn.search(
-            index=ES_CONFIG['index_name'],
-            body={
-                "query": {
-                    "script_score": {
-                        "query": {
-                            "bool": {
-                                "should": [
-                                    {
-                                        "terms": {
-                                            "tags.tags": query_tags
-                                        }
-                                    }
-                                ],
-                                "minimum_should_match": 1
-                            }
-                        },
-                        "script": {
-                            "source": "double score = cosineSimilarity(params.query_vector, 'embedding'); return score >= 0 ? score : 0",
-                            "params": {"query_vector": query_embedding}
-                        }
-                    }
-                },
-                "size": 3
-            }
-        )
-        logger.info(f"5. Vector search hit count: {len(vector_results['hits']['hits'])}")
-
-        # Exact text search
-        logger.info("\n=== Exact text search phase ===")
-        logger.info("1. Running the Elasticsearch exact text search...")
-        text_results = es_conn.search(
-            index=ES_CONFIG['index_name'],
-            body={
-                "query": {
-                    "bool": {
-                        "must": [
-                            {
-                                "match": {
-                                    "user_input": query
-                                }
-                            },
-                            {
-                                "terms": {
-                                    "tags.tags": query_tags
-                                }
-                            }
-                        ]
-                    }
-                },
-                "size": 3
-            }
-        )
-        logger.info(f"2. Text search hit count: {len(text_results['hits']['hits'])}")
-
-        # Merge the results
-        logger.info("\n=== Final search results ===")
-        logger.info(f"Vector search results: {len(vector_results['hits']['hits'])} hits")
-        for i, hit in enumerate(vector_results['hits']['hits'], 1):
-            logger.info(f"  {i}. Document ID: {hit['_id']}, similarity score: {hit['_score']:.2f}")
-            logger.info(f"     Content: {hit['_source']['user_input']}")
-
-        logger.info("Exact text search results:")
-        for i, hit in enumerate(text_results['hits']['hits']):
-            logger.info(f"  {i + 1}. Document ID: {hit['_id']}, match score: {hit['_score']:.2f}")
-            logger.info(f"     Content: {hit['_source']['user_input']}")
-
-        # Deduplication: drop user_input values duplicated between vector_results and text_results
-        vector_sources = [hit['_source'] for hit in vector_results['hits']['hits']]
-        text_sources = [hit['_source'] for hit in text_results['hits']['hits']]
-
-        # Build the deduplicated result sets
-        unique_text_sources = []
-        text_user_inputs = set()
-
-        # Process text_results first, keeping all of them
-        for source in text_sources:
-            text_user_inputs.add(source['user_input'])
-            unique_text_sources.append(source)
-
-        # Then keep only the vector_results not already present in text_results
-        unique_vector_sources = []
-        for source in vector_sources:
-            if source['user_input'] not in text_user_inputs:
-                unique_vector_sources.append(source)
-
-        # Count the records removed and estimate the tokens saved
-        removed_count = len(vector_sources) - len(unique_vector_sources)
-        saved_tokens = sum(len(source['user_input']) for source in vector_sources
-                           if source['user_input'] in text_user_inputs)
-
-        logger.info(f"Removed {removed_count} duplicate records, saving roughly {saved_tokens} tokens")
-
-        search_results = {
-            "vector_results": unique_vector_sources,
-            "text_results": unique_text_sources
-        }
-        return search_results
-    finally:
-        es_search_util.es_pool.release_connection(es_conn)
-
-
-def callLLM(request, query, search_results):
-    # Use the Aliyun LLM to consolidate the results
-    aliyun_util = request.app.state.aliyun_util
-
-    # Build the context for the prompt
-    context = "\n".join([
-        f"Result {i + 1}: {res['tags']['full_content']}"
-        for i, res in enumerate(search_results['vector_results'] + search_results['text_results'])
-    ])
-
-    # Add image-handling instructions to the prompt
-    prompt = f"""
-    Information retrieval and answering assistant
-    Based on the following information about '{query}':
-
-    Basic information
-    - Language: Chinese
-    - Description: retrieve information from the provided material and answer the question
-    - Traits: extract key information quickly and accurately, and answer clearly and concisely
-
-    Related information
-    {context}
-
-    Answer requirements
-    1. Strictly preserve the original ordering between images and their surrounding text to keep them semantically related
-    2. Reference images in Markdown format: ![image description](image path)
-    3. Return Markdown, with appropriate headings, lists, and code blocks
-    4. Preserve LaTeX formulas where the source material provides them
-    5. Return the Markdown content directly, without extra explanations or commentary
-    6. Answer quickly and accurately from the given material; extra information may be added, but do not repeat content
-    7. If no relevant information is provided, do not answer
-    8. If the relevant information matches the original question poorly, do not answer either
-    9. Keep the content well structured for easy front-end rendering
-    """
-
-    # Call the Aliyun LLM
-    if len(context) > 0:
-        # Generate the answer with the LLM
-        logger.info("Calling the Aliyun LLM to generate an answer...")
-        markdown_content = aliyun_util.chat(prompt)
-        logger.info("Aliyun LLM answer generated successfully")
-        return markdown_content
-    return None
-
 @app.post("/api/rag")
 async def rag(request: Request):
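Note on the change above: Start.py now keeps only "import urllib.parse" plus the
wildcard "from Util.SearchUtil import *", so it relies on SearchUtil.py
re-exporting logging, FastAPI, uvicorn, and the other names it still uses; the
seemingly unused imports in the new module are therefore load-bearing. A more
explicit sketch of the alternative (the exact set of names Start.py needs is an
assumption, not confirmed by the diff):

    import logging
    import urllib.parse

    from fastapi import FastAPI, Request, HTTPException

    from Util.SearchUtil import queryByEs, callLLM

Explicit imports would keep Start.py's dependencies visible and let
SearchUtil.py drop the framework imports it does not use itself.
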
diff --git a/dsRag/Util/SearchUtil.py b/dsRag/Util/SearchUtil.py
new file mode 100644
index 00000000..97821452
--- /dev/null
+++ b/dsRag/Util/SearchUtil.py
@@ -0,0 +1,189 @@
+import logging
+import os
+import subprocess
+import tempfile
+import urllib.parse
+import uuid
+from io import BytesIO
+from logging.handlers import RotatingFileHandler
+from typing import List
+
+import uvicorn
+from fastapi import FastAPI, Request, HTTPException
+from fastapi.staticfiles import StaticFiles
+from pydantic import BaseModel, Field
+from starlette.responses import StreamingResponse
+
+from Config.Config import ES_CONFIG
+import warnings
+from Util.ALiYunUtil import ALiYunUtil
+from Util.EsSearchUtil import EsSearchUtil
+
+# Initialize logging
+logger = logging.getLogger(__name__)
+logger.setLevel(logging.INFO)
+
+def queryByEs(query, query_tags):
+    # Get an EsSearchUtil instance
+    es_search_util = EsSearchUtil(ES_CONFIG)
+
+    # Perform the hybrid search
+    es_conn = es_search_util.es_pool.get_connection()
+    try:
+        # Vector search
+        logger.info("\n=== Query execution started ===")
+        logger.info(f"Original query text: {query}")
+        logger.info(f"Query tags: {query_tags}")
+
+        logger.info("\n=== Vector search phase ===")
+        logger.info("1. Tokenizing and embedding the query text...")
+        query_embedding = es_search_util.text_to_embedding(query)
+        logger.info(f"2. Generated query vector dimension: {len(query_embedding)}")
+        logger.info(f"3. First 3 vector values: {query_embedding[:3]}")
+
+        logger.info("4. Running the Elasticsearch vector search...")
+        vector_results = es_conn.search(
+            index=ES_CONFIG['index_name'],
+            body={
+                "query": {
+                    "script_score": {
+                        "query": {
+                            "bool": {
+                                "should": [
+                                    {
+                                        "terms": {
+                                            "tags.tags": query_tags
+                                        }
+                                    }
+                                ],
+                                "minimum_should_match": 1
+                            }
+                        },
+                        "script": {
+                            "source": "double score = cosineSimilarity(params.query_vector, 'embedding'); return score >= 0 ? score : 0",
+                            "params": {"query_vector": query_embedding}
+                        }
+                    }
+                },
+                "size": 3
+            }
+        )
+        logger.info(f"5. Vector search hit count: {len(vector_results['hits']['hits'])}")
+
+        # Exact text search
+        logger.info("\n=== Exact text search phase ===")
+        logger.info("1. Running the Elasticsearch exact text search...")
+        text_results = es_conn.search(
+            index=ES_CONFIG['index_name'],
+            body={
+                "query": {
+                    "bool": {
+                        "must": [
+                            {
+                                "match": {
+                                    "user_input": query
+                                }
+                            },
+                            {
+                                "terms": {
+                                    "tags.tags": query_tags
+                                }
+                            }
+                        ]
+                    }
+                },
+                "size": 3
+            }
+        )
+        logger.info(f"2. Text search hit count: {len(text_results['hits']['hits'])}")
+
+        # Merge the results
+        logger.info("\n=== Final search results ===")
+        logger.info(f"Vector search results: {len(vector_results['hits']['hits'])} hits")
+        for i, hit in enumerate(vector_results['hits']['hits'], 1):
+            logger.info(f"  {i}. Document ID: {hit['_id']}, similarity score: {hit['_score']:.2f}")
+            logger.info(f"     Content: {hit['_source']['user_input']}")
+
+        logger.info("Exact text search results:")
+        for i, hit in enumerate(text_results['hits']['hits']):
+            logger.info(f"  {i + 1}. Document ID: {hit['_id']}, match score: {hit['_score']:.2f}")
+            logger.info(f"     Content: {hit['_source']['user_input']}")
+
+        # Deduplication: drop user_input values duplicated between vector_results and text_results
+        vector_sources = [hit['_source'] for hit in vector_results['hits']['hits']]
+        text_sources = [hit['_source'] for hit in text_results['hits']['hits']]
+
+        # Build the deduplicated result sets
+        unique_text_sources = []
+        text_user_inputs = set()
+
+        # Process text_results first, keeping all of them
+        for source in text_sources:
+            text_user_inputs.add(source['user_input'])
+            unique_text_sources.append(source)
+
+        # Then keep only the vector_results not already present in text_results
+        unique_vector_sources = []
+        for source in vector_sources:
+            if source['user_input'] not in text_user_inputs:
+                unique_vector_sources.append(source)
+
+        # Count the records removed and estimate the tokens saved
+        removed_count = len(vector_sources) - len(unique_vector_sources)
+        saved_tokens = sum(len(source['user_input']) for source in vector_sources
+                           if source['user_input'] in text_user_inputs)
+
+        logger.info(f"Removed {removed_count} duplicate records, saving roughly {saved_tokens} tokens")
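+        # Note: saved_tokens is computed from len(user_input), i.e. a character
+        # count rather than a true model token count, so the figure logged
+        # above is only a rough estimate of the prompt budget saved.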
+
+        search_results = {
+            "vector_results": unique_vector_sources,
+            "text_results": unique_text_sources
+        }
+        return search_results
+    finally:
+        es_search_util.es_pool.release_connection(es_conn)
+
+
+def callLLM(request, query, search_results):
+    # Use the Aliyun LLM to consolidate the results
+    aliyun_util = request.app.state.aliyun_util
+
+    # Build the context for the prompt
+    context = "\n".join([
+        f"Result {i + 1}: {res['tags']['full_content']}"
+        for i, res in enumerate(search_results['vector_results'] + search_results['text_results'])
+    ])
+
+    # Add image-handling instructions to the prompt
+    prompt = f"""
+    Information retrieval and answering assistant
+    Based on the following information about '{query}':
+
+    Basic information
+    - Language: Chinese
+    - Description: retrieve information from the provided material and answer the question
+    - Traits: extract key information quickly and accurately, and answer clearly and concisely
+
+    Related information
+    {context}
+
+    Answer requirements
+    1. Strictly preserve the original ordering between images and their surrounding text to keep them semantically related
+    2. Reference images in Markdown format: ![image description](image path)
+    3. Return Markdown, with appropriate headings, lists, and code blocks
+    4. Preserve LaTeX formulas where the source material provides them
+    5. Return the Markdown content directly, without extra explanations or commentary
+    6. Answer quickly and accurately from the given material; extra information may be added, but do not repeat content
+    7. If no relevant information is provided, do not answer
+    8. If the relevant information matches the original question poorly, do not answer either
+    9. Keep the content well structured for easy front-end rendering
+    """
+
+    # Call the Aliyun LLM
+    if len(context) > 0:
+        # Generate the answer with the LLM
+        logger.info("Calling the Aliyun LLM to generate an answer...")
+        markdown_content = aliyun_util.chat(prompt)
+        logger.info("Aliyun LLM answer generated successfully")
+        return markdown_content
+    return None
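
The patch leaves the body of the /api/rag endpoint in Start.py unchanged (only
its decorator and signature appear as diff context), so the exact wiring is not
shown. A minimal sketch of how the two extracted helpers fit together, assuming
hypothetical request fields "query" and "tags" and an assumed response shape:

    @app.post("/api/rag")
    async def rag(request: Request):
        # Hypothetical wiring; field names and response shape are assumptions.
        data = await request.json()
        query = data.get("query", "")
        query_tags = data.get("tags", [])

        # Retrieve deduplicated ES hits, then let the LLM consolidate them.
        search_results = queryByEs(query, query_tags)
        markdown_content = callLLM(request, query, search_results)

        if markdown_content is None:
            raise HTTPException(status_code=404, detail="No relevant information found")
        return {"data": markdown_content}
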
diff --git a/dsRag/Util/__pycache__/SearchUtil.cpython-310.pyc b/dsRag/Util/__pycache__/SearchUtil.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..da6c9a626fd7d8d0a78ca006e5a063a6d08c3b5c
GIT binary patch
literal 5466
[base85-encoded compiled bytecode omitted]
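
A closing note on the script_score query used in queryByEs: Elasticsearch
rejects negative scores from script_score, which is why the Painless script
clamps cosineSimilarity to zero. The clamp makes every document with negative
similarity tie at score 0; a common alternative that keeps those documents
ordered is to shift the cosine into a non-negative range, e.g. (a sketch, not
what this patch does):

    "script": {
        "source": "cosineSimilarity(params.query_vector, 'embedding') + 1.0",
        "params": {"query_vector": query_embedding}
    }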