From aa852ce369bf6b941f0da6fca37ee54ed0a06421 Mon Sep 17 00:00:00 2001
From: HuangHai <10402852@qq.com>
Date: Tue, 19 Aug 2025 14:03:33 +0800
Subject: [PATCH] 'commit'

---
 dsSchoolBuddy/Config/Config.py                |  27 -
 dsSchoolBuddy/Config/__init__.py              |   0
 .../Config/__pycache__/Config.cpython-310.pyc | Bin 852 -> 0 bytes
 .../__pycache__/__init__.cpython-310.pyc      | Bin 137 -> 0 bytes
 .../ElasticSearch/T1_RebuildMapping.py        |  32 -
 dsSchoolBuddy/ElasticSearch/T2_Vector.py      |  38 -
 dsSchoolBuddy/ElasticSearch/T3_InsertData.py  |  28 -
 .../ElasticSearch/T4_SelectAllData.py         |  28 -
 .../ElasticSearch/T5_SelectByKeyWord.py       |  28 -
 .../ElasticSearch/T6_SelectByVector.py        |  29 -
 .../ElasticSearch/T7_XiangLiangQuery.py       |  66 --
 .../Utils/ElasticsearchCollectionManager.py   | 110 ---
 .../Utils/ElasticsearchConnectionPool.py      |  65 --
 .../ElasticSearch/Utils/EsSearchUtil.py       | 664 ------------------
 .../ElasticSearch/Utils/VectorDBUtil.py       | 125 ----
 dsSchoolBuddy/ElasticSearch/Utils/__init__.py |   0
 ...ticsearchCollectionManager.cpython-310.pyc | Bin 3488 -> 0 bytes
 ...lasticsearchConnectionPool.cpython-310.pyc | Bin 2754 -> 0 bytes
 .../__pycache__/EsSearchUtil.cpython-310.pyc  | Bin 16161 -> 0 bytes
 .../__pycache__/VectorDBUtil.cpython-310.pyc  | Bin 4136 -> 0 bytes
 .../__pycache__/__init__.cpython-310.pyc      | Bin 158 -> 0 bytes
 dsSchoolBuddy/ElasticSearch/__init__.py       |   0
 .../__pycache__/__init__.cpython-310.pyc      | Bin 144 -> 0 bytes
 dsSchoolBuddy/Start.py                        | 212 ------
 24 files changed, 1452 deletions(-)
 delete mode 100644 dsSchoolBuddy/Config/Config.py
 delete mode 100644 dsSchoolBuddy/Config/__init__.py
 delete mode 100644 dsSchoolBuddy/Config/__pycache__/Config.cpython-310.pyc
 delete mode 100644 dsSchoolBuddy/Config/__pycache__/__init__.cpython-310.pyc
 delete mode 100644 dsSchoolBuddy/ElasticSearch/T1_RebuildMapping.py
 delete mode 100644 dsSchoolBuddy/ElasticSearch/T2_Vector.py
 delete mode 100644 dsSchoolBuddy/ElasticSearch/T3_InsertData.py
 delete mode 100644 dsSchoolBuddy/ElasticSearch/T4_SelectAllData.py
 delete mode 100644 dsSchoolBuddy/ElasticSearch/T5_SelectByKeyWord.py
 delete mode 100644 dsSchoolBuddy/ElasticSearch/T6_SelectByVector.py
 delete mode 100644 dsSchoolBuddy/ElasticSearch/T7_XiangLiangQuery.py
 delete mode 100644 dsSchoolBuddy/ElasticSearch/Utils/ElasticsearchCollectionManager.py
 delete mode 100644 dsSchoolBuddy/ElasticSearch/Utils/ElasticsearchConnectionPool.py
 delete mode 100644 dsSchoolBuddy/ElasticSearch/Utils/EsSearchUtil.py
 delete mode 100644 dsSchoolBuddy/ElasticSearch/Utils/VectorDBUtil.py
 delete mode 100644 dsSchoolBuddy/ElasticSearch/Utils/__init__.py
 delete mode 100644 dsSchoolBuddy/ElasticSearch/Utils/__pycache__/ElasticsearchCollectionManager.cpython-310.pyc
 delete mode 100644 dsSchoolBuddy/ElasticSearch/Utils/__pycache__/ElasticsearchConnectionPool.cpython-310.pyc
 delete mode 100644 dsSchoolBuddy/ElasticSearch/Utils/__pycache__/EsSearchUtil.cpython-310.pyc
 delete mode 100644 dsSchoolBuddy/ElasticSearch/Utils/__pycache__/VectorDBUtil.cpython-310.pyc
 delete mode 100644 dsSchoolBuddy/ElasticSearch/Utils/__pycache__/__init__.cpython-310.pyc
 delete mode 100644 dsSchoolBuddy/ElasticSearch/__init__.py
 delete mode 100644 dsSchoolBuddy/ElasticSearch/__pycache__/__init__.cpython-310.pyc
 delete mode 100644 dsSchoolBuddy/Start.py

diff --git a/dsSchoolBuddy/Config/Config.py b/dsSchoolBuddy/Config/Config.py
deleted file mode 100644
index 7e96bc26..00000000
--- a/dsSchoolBuddy/Config/Config.py
+++ /dev/null
@@ -1,27 +0,0 @@
-# Elasticsearch configuration
-ES_CONFIG = {
-    'hosts': ['https://localhost:9200'],
-    'basic_auth': ('elastic', 'jv9h8uwRrRxmDi1dq6u8'),
-    'verify_certs': False,
-    'index_name': 'ds_db',  # default index name
-    'student_info_index': 'student_info'  # index name for student profile records
-}
-
-# Embedding model
-EMBED_MODEL_NAME = "BAAI/bge-m3"
-EMBED_API_KEY = "sk-pbqibyjwhrgmnlsmdygplahextfaclgnedetybccknxojlyl"
-EMBED_BASE_URL = "https://api.siliconflow.cn/v1"
-EMBED_DIM = 1024
-EMBED_MAX_TOKEN_SIZE = 8192
-
-# Rerank model
-RERANK_MODEL = 'BAAI/bge-reranker-v2-m3'
-RERANK_BASE_URL = 'https://api.siliconflow.cn/v1/rerank'
-RERANK_BINDING_API_KEY = 'sk-pbqibyjwhrgmnlsmdygplahextfaclgnedetybccknxojlyl'
-
-# Aliyun API credentials [HZKJ]
-ALY_LLM_API_KEY = "sk-01d13a39e09844038322108ecdbd1bbc"
-ALY_LLM_BASE_URL = "https://dashscope.aliyuncs.com/compatible-mode/v1"
-ALY_LLM_MODEL_NAME = "qwen-plus"
-#ALY_LLM_MODEL_NAME = "deepseek-r1"
-# ALY_LLM_MODEL_NAME = "deepseek-v3"
\ No newline at end of file
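
Note: the file above is the whole runtime configuration. A minimal sketch of how the ES
keys are consumed when building a client (this is not part of the patch; it assumes the
elasticsearch-py 8.x client, which matches the key names used here):

    from elasticsearch import Elasticsearch
    from Config.Config import ES_CONFIG

    # verify_certs=False matches the self-signed localhost setup in this config
    es = Elasticsearch(
        hosts=ES_CONFIG['hosts'],
        basic_auth=ES_CONFIG['basic_auth'],
        verify_certs=ES_CONFIG['verify_certs'],
    )
    print(es.ping())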
diff --git a/dsSchoolBuddy/Config/__init__.py b/dsSchoolBuddy/Config/__init__.py
deleted file mode 100644
index e69de29b..00000000
diff --git a/dsSchoolBuddy/Config/__pycache__/Config.cpython-310.pyc b/dsSchoolBuddy/Config/__pycache__/Config.cpython-310.pyc
deleted file mode 100644
index e1f95488be20eb743d1e832fbca43079cf56dabd..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 852
[binary data omitted]

diff --git a/dsSchoolBuddy/Config/__pycache__/__init__.cpython-310.pyc b/dsSchoolBuddy/Config/__pycache__/__init__.cpython-310.pyc
deleted file mode 100644
index b3f99ca71b3b92c5c36e61e071899a9af29e37da..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 137
[binary data omitted]

diff --git a/dsSchoolBuddy/ElasticSearch/Utils/EsSearchUtil.py b/dsSchoolBuddy/ElasticSearch/Utils/EsSearchUtil.py
deleted file mode 100644
--- a/dsSchoolBuddy/ElasticSearch/Utils/EsSearchUtil.py
+++ /dev/null
-    def split_text_into_chunks(self, text: str, chunk_size: int = 200, chunk_overlap: int = 0) -> list:
-        """
-        Split a text into chunks
-
-        Parameters:
-            text: the text to split
-            chunk_size: size of each chunk
-            chunk_overlap: overlap between adjacent chunks
-
-        Returns:
-            list: list of text chunks
-        """
-        # Create a document object
-        docs = [Document(page_content=text, metadata={"source": "simulated_document"})]
-
-        # Split the document
-        text_splitter = RecursiveCharacterTextSplitter(
-            chunk_size=chunk_size, chunk_overlap=chunk_overlap, add_start_index=True
-        )
-        all_splits = text_splitter.split_documents(docs)
-        print(f"Number of chunks after splitting: {len(all_splits)}")
-
-        return [split.page_content for split in all_splits]
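
As shown below, the splitter used here is plain LangChain; a self-contained sketch of
the same chunking step (the sample text is invented, the imports match the ones this
file declares):

    from langchain_core.documents import Document
    from langchain_text_splitters import RecursiveCharacterTextSplitter

    docs = [Document(page_content="A long passage of course material ..." * 20)]
    splitter = RecursiveCharacterTextSplitter(chunk_size=200, chunk_overlap=0, add_start_index=True)
    chunks = splitter.split_documents(docs)
    for c in chunks:
        # add_start_index=True records each chunk's offset in the source document
        print(c.metadata["start_index"], c.page_content[:30])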
-
-    def insert_long_text_to_es(self, long_text: str, tags: list = None) -> bool:
-        """
-        Split a long text, vectorise the chunks and insert them into Elasticsearch,
-        deduplicating by a hash of each chunk's content
-
-        Parameters:
-            long_text: the long text to insert
-            tags: optional list of tags
-
-        Returns:
-            bool: whether the insert succeeded
-        """
-        try:
-            # 1. Create an EsSearchUtil instance so we can use the connection pool
-            search_util = EsSearchUtil(Config.ES_CONFIG)
-
-            # 2. Get a connection from the pool
-            conn = search_util.es_pool.get_connection()
-
-            # # 3. Check whether the index exists and create it if not
-            index_name = Config.ES_CONFIG['index_name']
-            # if not conn.indices.exists(index=index_name):
-            #     # Define the mapping structure
-            #     mapping = {
-            #         "mappings": {
-            #             "properties": {
-            #                 "embedding": {
-            #                     "type": "dense_vector",
-            #                     "dims": Config.EMBED_DIM,  # adjust to the actual embedding dimension
-            #                     "index": True,
-            #                     "similarity": "l2_norm"
-            #                 },
-            #                 "user_input": {"type": "text"},
-            #                 "tags": {
-            #                     "type": "object",
-            #                     "properties": {
-            #                         "tags": {"type": "keyword"},
-            #                         "full_content": {"type": "text"}
-            #                     }
-            #                 },
-            #                 "timestamp": {"type": "date"}
-            #             }
-            #         }
-            #     }
-            #     conn.indices.create(index=index_name, body=mapping)
-            #     print(f"Index '{index_name}' created")
-
-            # 4. Split the text
-            text_chunks = self.split_text_into_chunks(long_text)
-
-            # 5. Prepare tags
-            if tags is None:
-                tags = ["general_text"]
-
-            # 6. Get the current time
-            timestamp = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
-
-            # 7. Create the embedding model
-            embeddings = OpenAIEmbeddings(
-                model=Config.EMBED_MODEL_NAME,
-                base_url=Config.EMBED_BASE_URL,
-                api_key=SecretStr(Config.EMBED_API_KEY)
-            )
-
-            # 8. Generate a vector for every chunk and insert it
-            for i, chunk in enumerate(text_chunks):
-                # Use the MD5 hash of the chunk content as the document ID
-                doc_id = hashlib.md5(chunk.encode('utf-8')).hexdigest()
-
-                # Skip the chunk if the document already exists
-                if conn.exists(index=index_name, id=doc_id):
-                    print(f"Chunk {i+1} already exists, skipping insert: {doc_id}")
-                    continue
-
-                # Generate the embedding vector for the chunk
-                embedding = embeddings.embed_documents([chunk])[0]
-
-                # Prepare the document body
-                doc = {
-                    'tags': {"tags": tags, "full_content": long_text},
-                    'user_input': chunk,
-                    'timestamp': timestamp,
-                    'embedding': embedding
-                }
-
-                # Insert the document into Elasticsearch
-                conn.index(index=index_name, id=doc_id, document=doc)
-                print(f"Chunk {i+1} inserted: {doc_id}")
-
-            return True
-        except Exception as e:
-            print(f"Insert failed: {e}")
-            return False
-        finally:
-            # Always release the connection back to the pool
-            if 'conn' in locals() and 'search_util' in locals():
-                search_util.es_pool.release_connection(conn)
-
-    def get_query_embedding(self, query: str) -> list:
-        """
-        Convert a query text into a vector
-
-        Parameters:
-            query: the query text
-
-        Returns:
-            list: the vector representation
-        """
-        # Create the embedding model
-        embeddings = OpenAIEmbeddings(
-            model=Config.EMBED_MODEL_NAME,
-            base_url=Config.EMBED_BASE_URL,
-            api_key=SecretStr(Config.EMBED_API_KEY)
-        )
-
-        # Generate the query vector
-        query_embedding = embeddings.embed_query(query)
-        return query_embedding
-
-    def rerank_results(self, query: str, results: list) -> list:
-        """
-        Rerank search results with the rerank model
-
-        Parameters:
-            query: the query text
-            results: list of search results
-
-        Returns:
-            list: reranked results, each element a (document, score) tuple
-        """
-        if not results:
-            print("Warning: no search results to rerank")
-            return []
-
-        try:
-            # Build the rerank request payload
-            # Each doc must be a dict containing '_source' with a 'user_input' field
-            documents = []
-            valid_results = []
-            for i, doc in enumerate(results):
-                if isinstance(doc, dict) and '_source' in doc and 'user_input' in doc['_source']:
-                    documents.append(doc['_source']['user_input'])
-                    valid_results.append(doc)
-                else:
-                    print(f"Warning: result item {i} is malformed, skipping it")
-                    print(f"Item content: {doc}")
-
-            if not documents:
-                print("Warning: no valid documents to rerank")
-                # Fall back to the original results as (result, score) tuples
-                return [(doc, doc.get('_score', 0.0)) for doc in results]
-
-            rerank_data = {
-                "model": Config.RERANK_MODEL,
-                "query": query,
-                "documents": documents,
-                "top_n": len(documents)
-            }
-
-            # Call the rerank API
-            headers = {
-                "Content-Type": "application/json",
-                "Authorization": f"Bearer {Config.RERANK_BINDING_API_KEY}"
-            }
-
-            response = requests.post(Config.RERANK_BASE_URL, headers=headers, data=json.dumps(rerank_data))
-            response.raise_for_status()  # raise if the request failed
-            rerank_result = response.json()
-
-            # Process the rerank response
-            reranked_results = []
-            if "results" in rerank_result:
-                for item in rerank_result["results"]:
-                    doc_idx = item.get("index")
-                    score = item.get("relevance_score", 0.0)
-                    if 0 <= doc_idx < len(valid_results):
-                        result = valid_results[doc_idx]
-                        reranked_results.append((result, score))
-            else:
-                print("Warning: unrecognised rerank API response format")
-                # Fall back to the original results as (result, score) tuples
-                reranked_results = [(doc, doc.get('_score', 0.0)) for doc in valid_results]
-
-            print(f"Number of results after rerank: {len(reranked_results)}")
-            return reranked_results
-
-        except Exception as e:
-            print(f"Rerank failed: {e}")
-            print("Falling back to the original search results")
-            # Fall back to the original results as (result, score) tuples
-            return [(doc, doc.get('_score', 0.0)) for doc in results]
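
For reference, the wire contract rerank_results assumes of the rerank endpoint, shown as
a minimal standalone call (the field names are exactly those the method reads; whether
the live SiliconFlow API matches this shape should be verified against its docs, and
<key> is a placeholder):

    import json
    import requests

    payload = {"model": "BAAI/bge-reranker-v2-m3", "query": "fractions",
               "documents": ["chunk A", "chunk B"], "top_n": 2}
    resp = requests.post("https://api.siliconflow.cn/v1/rerank",
                         headers={"Authorization": "Bearer <key>",
                                  "Content-Type": "application/json"},
                         data=json.dumps(payload))
    # Expected response shape, as consumed above:
    # {"results": [{"index": 1, "relevance_score": 0.93},
    #              {"index": 0, "relevance_score": 0.41}]}
    print(resp.json())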
-
-    def search_by_vector(self, query_embedding: list, k: int = 10) -> list:
-        """
-        Run a similarity search against a query vector
-
-        Parameters:
-            query_embedding: the query vector
-            k: number of results to return
-
-        Returns:
-            list: list of search results
-        """
-        try:
-            # Get a connection from the pool
-            conn = self.es_pool.get_connection()
-            index_name = Config.ES_CONFIG['index_name']
-
-            # Execute the vector search
-            response = conn.search(
-                index=index_name,
-                body={
-                    "query": {
-                        "script_score": {
-                            "query": {"match_all": {}},
-                            "script": {
-                                "source": "cosineSimilarity(params.query_vector, 'embedding') + 1.0",
-                                "params": {
-                                    "query_vector": query_embedding
-                                }
-                            }
-                        }
-                    },
-                    "size": k
-                }
-            )
-
-            # Extract the results
-            # Make sure we read the hits.hits section
-            if 'hits' in response and 'hits' in response['hits']:
-                results = response['hits']['hits']
-                print(f"Number of vector search results: {len(results)}")
-                return results
-            else:
-                print("Warning: unexpected vector search response format")
-                print(f"Response content: {response}")
-                return []
-
-        except Exception as e:
-            print(f"Vector search failed: {e}")
-            return []
-        finally:
-            # Release the connection back to the pool
-            self.es_pool.release_connection(conn)
-
-    def display_results(self, results: list, show_score: bool = True) -> None:
-        """
-        Display search results
-
-        Parameters:
-            results: list of search results
-            show_score: whether to print the score
-        """
-        if not results:
-            print("No matching results found.")
-            return
-
-        print(f"Found {len(results)} results:\n")
-        for i, item in enumerate(results, 1):
-            print(f"Result {i}:")
-            try:
-                # The item may be a (result, score) tuple
-                if isinstance(item, tuple):
-                    if len(item) >= 2:
-                        result, score = item[0], item[1]
-                    else:
-                        result, score = item[0], 0.0
-                else:
-                    # Otherwise assume the item itself is the result
-                    result = item
-                    score = result.get('_score', 0.0)
-
-                # The result must be a dict
-                if not isinstance(result, dict):
-                    print(f"Warning: result item {i} is not a dict, skipping it")
-                    print(f"Item content: {result}")
-                    print("---")
-                    continue
-
-                # Try to read the user_input content
-                if '_source' in result and 'user_input' in result['_source']:
-                    content = result['_source']['user_input']
-                    print(f"Content: {content}")
-                elif 'user_input' in result:
-                    content = result['user_input']
-                    print(f"Content: {content}")
-                else:
-                    print(f"Warning: result item {i} has no 'user_input' field")
-                    print(f"Item content: {result}")
-                    print("---")
-                    continue
-
-                # Print the score
-                if show_score:
-                    print(f"Score: {score:.4f}")
-
-                # Print tag information if present
-                if '_source' in result and 'tags' in result['_source']:
-                    tags = result['_source']['tags']
-                    if isinstance(tags, dict) and 'tags' in tags:
-                        print(f"Tags: {tags['tags']}")
-
-            except Exception as e:
-                print(f"Error while handling result item {i}: {str(e)}")
-                print(f"Item content: {item}")
-            print("---")
-
-    def merge_results(self, keyword_results: List[Tuple[Dict, float]], vector_results: List[Tuple[Dict, float]]) -> List[Tuple[Dict, float, str]]:
-        """
-        Merge keyword search results with vector search results
-
-        Parameters:
-            keyword_results: keyword search results as (document, score) tuples
-            vector_results: vector search results as (document, score) tuples
-
-        Returns:
-            list: merged results as (document, score, source) tuples
-        """
-        # Tag each result with its source and merge
-        all_results = []
-        for doc, score in keyword_results:
-            all_results.append((doc, score, "keyword search"))
-        for doc, score in vector_results:
-            all_results.append((doc, score, "vector search"))
-
-        # Deduplicate, keeping the highest score per document
-        unique_results = {}
-        for doc, score, source in all_results:
-            doc_id = doc['_id']
-            if doc_id not in unique_results or score > unique_results[doc_id][1]:
-                unique_results[doc_id] = (doc, score, source)
-
-        # Sort by score in descending order
-        sorted_results = sorted(unique_results.values(), key=lambda x: x[1], reverse=True)
-        return sorted_results
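
Putting the methods above together, a hedged end-to-end sketch of the hybrid query path
(all names come from this class; the keyword side is left empty here because the match
query that would feed it is not part of this excerpt):

    util = EsSearchUtil(Config.ES_CONFIG)
    question = "How do I prepare for the math exam?"
    qvec = util.get_query_embedding(question)
    hits = util.search_by_vector(qvec, k=10)
    reranked = util.rerank_results(question, hits)
    merged = util.merge_results([], reranked)  # keyword results omitted in this sketch
    util.display_results(reranked)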
-
-    # Helper: save student info to ES
-    def save_student_info_to_es(self, user_id, info):
-        """Save a student's info to Elasticsearch"""
-        try:
-            # Use the user ID as the document ID
-            doc_id = f"student_{user_id}"
-            # Prepare the document body
-            doc = {
-                "user_id": user_id,
-                "info": info,
-                "update_time": time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
-            }
-            # Get a connection from the pool
-            es_conn = self.es_pool.get_connection()
-            try:
-                # Index the document (the index is created on demand)
-                es_conn.index(index="student_info", id=doc_id, document=doc)
-                logger.info(f"Saved info for student {user_id} to ES: {info}")
-            finally:
-                # Release the connection back to the pool
-                self.es_pool.release_connection(es_conn)
-        except Exception as e:
-            logger.error(f"Failed to save student info to ES: {str(e)}", exc_info=True)
-
-    # Helper: load student info from ES
-    def get_student_info_from_es(self, user_id):
-        """Load a student's info from Elasticsearch"""
-        try:
-            doc_id = f"student_{user_id}"
-            # Get a connection from the pool
-            es_conn = self.es_pool.get_connection()
-            try:
-                # Make sure the index exists
-                if es_conn.indices.exists(index=Config.ES_CONFIG.get("student_info_index")):
-                    result = es_conn.get(index=Config.ES_CONFIG.get("student_info_index"), id=doc_id)
-                    if result and '_source' in result:
-                        logger.info(f"Loaded info for student {user_id} from ES: {result['_source']['info']}")
-                        return result['_source']['info']
-                    else:
-                        logger.info(f"No info found in ES for student {user_id}")
-                else:
-                    logger.info("The student_info index does not exist")
-            finally:
-                # Release the connection back to the pool
-                self.es_pool.release_connection(es_conn)
-        except Exception as e:
-            # Return an empty dict if the document does not exist
-            if "not_found" in str(e).lower():
-                logger.info(f"No info in ES for student {user_id}")
-                return {}
-            logger.error(f"Failed to load student info from ES: {str(e)}", exc_info=True)
-        return {}
-
-    def extract_student_info(self, text, user_id):
-        """Extract student info from text using jieba segmentation"""
-        try:
-            # Extract grade information
-            seg_list = jieba.cut(text, cut_all=False)  # precise mode
-            seg_set = set(seg_list)
-
-            # Load the student's info from ES if it is not cached yet
-            if user_id not in self.student_info:
-                # Load student info from ES
-                info_from_es = self.get_student_info_from_es(user_id)
-                if info_from_es:
-                    self.student_info[user_id] = info_from_es
-                    logger.info(f"Loaded info for user {user_id} from ES: {info_from_es}")
-                else:
-                    self.student_info[user_id] = {}
-
-            # Extract and update the grade
-            grade_found = False
-            for grade, keywords in self.GRADE_KEYWORDS.items():
-                for keyword in keywords:
-                    if keyword in seg_set:
-                        if 'grade' not in self.student_info[user_id] or self.student_info[user_id]['grade'] != grade:
-                            self.student_info[user_id]['grade'] = grade
-                            logger.info(f"Extracted grade for user {user_id}: {grade}")
-                            # Persist to ES
-                            self.save_student_info_to_es(user_id, self.student_info[user_id])
-                        grade_found = True
-                        break
-                if grade_found:
-                    break
-
-            # If the text mentions a grade but no keyword matched, try to pull the number directly
-            if not grade_found:
-                import re
-                # Match the pattern "我是X年级" ("I am in grade X")
-                match = re.search(r'我是(\d+)年级', text)
-                if match:
-                    grade_num = match.group(1)
-                    grade = f"{grade_num}年级"
-                    if 'grade' not in self.student_info[user_id] or self.student_info[user_id]['grade'] != grade:
-                        self.student_info[user_id]['grade'] = grade
-                        logger.info(f"Extracted grade for user {user_id} via regex: {grade}")
-                        # Persist to ES
-                        self.save_student_info_to_es(user_id, self.student_info[user_id])
-        except Exception as e:
-            logger.error(f"Failed to extract student info: {str(e)}", exc_info=True)
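
A quick illustration of the precise-mode segmentation that the grade extraction above
relies on (jieba is a real dependency of this file; the sample sentence is invented, and
whether "三年级" survives as a single token depends on the active dictionary):

    import jieba

    # Precise mode, as in extract_student_info
    tokens = set(jieba.cut("老师好,我是三年级的学生", cut_all=False))
    print(tokens)
    # With the default dictionary, "三年级" typically stays one token,
    # which is what a keyword table like GRADE_KEYWORDS can match against.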
diff --git a/dsSchoolBuddy/ElasticSearch/Utils/VectorDBUtil.py b/dsSchoolBuddy/ElasticSearch/Utils/VectorDBUtil.py
deleted file mode 100644
index 2cb6ccd9..00000000
--- a/dsSchoolBuddy/ElasticSearch/Utils/VectorDBUtil.py
+++ /dev/null
@@ -1,125 +0,0 @@
-# pip install pydantic requests
-from langchain_core.documents import Document
-from langchain_core.vectorstores import InMemoryVectorStore
-from langchain_openai import OpenAIEmbeddings
-from langchain_text_splitters import RecursiveCharacterTextSplitter
-from pydantic import SecretStr
-import requests
-import json
-from Config.Config import (
-    EMBED_MODEL_NAME, EMBED_BASE_URL, EMBED_API_KEY,
-    RERANK_MODEL, RERANK_BASE_URL, RERANK_BINDING_API_KEY
-)
-
-
-class VectorDBUtil:
-    """Vector store utility: vectorised storage and querying of text"""
-
-    def __init__(self):
-        """Initialise the vector store utility"""
-        # Initialise the embedding model
-        self.embeddings = OpenAIEmbeddings(
-            model=EMBED_MODEL_NAME,
-            base_url=EMBED_BASE_URL,
-            api_key=SecretStr(EMBED_API_KEY)  # wrap as SecretStr
-        )
-        # The vector store is created lazily
-        self.vector_store = None
-
-    def text_to_vector_db(self, text: str, chunk_size: int = 200, chunk_overlap: int = 0) -> tuple:
-        """
-        Store a text in the vector database
-
-        Parameters:
-            text: the text to store
-            chunk_size: split chunk size
-            chunk_overlap: overlap between chunks
-
-        Returns:
-            tuple: (vector store object, document count, number of chunks after splitting)
-        """
-        # Create a document object
-        docs = [Document(page_content=text, metadata={"source": "simulated_document"})]
-        doc_count = len(docs)
-        print(f"Document count: {doc_count}")
-
-        # Split the document
-        text_splitter = RecursiveCharacterTextSplitter(
-            chunk_size=chunk_size, chunk_overlap=chunk_overlap, add_start_index=True
-        )
-        all_splits = text_splitter.split_documents(docs)
-        split_count = len(all_splits)
-        print(f"Number of chunks after splitting: {split_count}")
-
-        # Store the vectors
-        self.vector_store = InMemoryVectorStore(self.embeddings)
-        ids = self.vector_store.add_documents(documents=all_splits)
-
-        return self.vector_store, doc_count, split_count
-
-    def query_vector_db(self, query: str, k: int = 4) -> list:
-        """
-        Query texts from the vector database
-
-        Parameters:
-            query: the query string
-            k: number of results to return
-
-        Returns:
-            list: reranked results, each element a (document, confidence score) tuple
-        """
-        if not self.vector_store:
-            print("Error: vector store not initialised, call text_to_vector_db first")
-            return []
-
-        # Vector query - fetch more results for reranking
-        results = self.vector_store.similarity_search(query, k=k)
-        print(f"Number of vector search results: {len(results)}")
-
-        # Reranked documents with their scores
-        reranked_docs_with_scores = []
-
-        # Call the rerank model
-        if len(results) > 1:
-            # Build the rerank request payload
-            rerank_data = {
-                "model": RERANK_MODEL,
-                "query": query,
-                "documents": [doc.page_content for doc in results],
-                "top_n": len(results)
-            }
-
-            # Call the SiliconFlow API to rerank
-            headers = {
-                "Content-Type": "application/json",
-                "Authorization": f"Bearer {RERANK_BINDING_API_KEY}"
-            }
-
-            try:
-                response = requests.post(RERANK_BASE_URL, headers=headers, data=json.dumps(rerank_data))
-                response.raise_for_status()  # raise if the request failed
-                rerank_result = response.json()
-
-                # Process the rerank response and extract relevance_score
-                if "results" in rerank_result:
-                    for item in rerank_result["results"]:
-                        doc_idx = item.get("index")
-                        score = item.get("relevance_score", 0.0)
-                        if 0 <= doc_idx < len(results):
-                            reranked_docs_with_scores.append((results[doc_idx], score))
-                else:
-                    print("Warning: unrecognised rerank API response format")
-                    reranked_docs_with_scores = [(doc, 0.0) for doc in results]
-
-                print(f"Number of results after rerank: {len(reranked_docs_with_scores)}")
-            except Exception as e:
-                print(f"Rerank model call failed: {e}")
-                print("Falling back to the original search results")
-                reranked_docs_with_scores = [(doc, 0.0) for doc in results]
-        else:
-            # Only one result, no rerank needed; give it confidence 1.0
-            reranked_docs_with_scores = [(doc, 1.0) for doc in results]
-
-        return reranked_docs_with_scores
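
As shown below, the intended call pattern for this utility (a sketch; the sample text
and query are invented, everything else is the class's own API):

    util = VectorDBUtil()
    store, n_docs, n_chunks = util.text_to_vector_db("Some long course text ..." * 50)
    for doc, score in util.query_vector_db("what does the text say about X?", k=4):
        print(f"{score:.3f}  {doc.page_content[:40]}")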
diff --git a/dsSchoolBuddy/ElasticSearch/Utils/__init__.py b/dsSchoolBuddy/ElasticSearch/Utils/__init__.py
deleted file mode 100644
index e69de29b..00000000
diff --git a/dsSchoolBuddy/ElasticSearch/Utils/__pycache__/ElasticsearchCollectionManager.cpython-310.pyc b/dsSchoolBuddy/ElasticSearch/Utils/__pycache__/ElasticsearchCollectionManager.cpython-310.pyc
deleted file mode 100644
index 059711511b9f6f86065ee1d816861205a065ccb9..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 3488
[binary data omitted]

diff --git a/dsSchoolBuddy/ElasticSearch/Utils/__pycache__/ElasticsearchConnectionPool.cpython-310.pyc b/dsSchoolBuddy/ElasticSearch/Utils/__pycache__/ElasticsearchConnectionPool.cpython-310.pyc
deleted file mode 100644
GIT binary patch
literal 0
HcmV?d00001

literal 2754
[binary data omitted]

diff --git a/dsSchoolBuddy/ElasticSearch/Utils/__pycache__/EsSearchUtil.cpython-310.pyc b/dsSchoolBuddy/ElasticSearch/Utils/__pycache__/EsSearchUtil.cpython-310.pyc
deleted file mode 100644
GIT binary patch
literal 0
HcmV?d00001

literal 16161
[binary data omitted]
diff --git a/dsSchoolBuddy/ElasticSearch/Utils/__pycache__/VectorDBUtil.cpython-310.pyc b/dsSchoolBuddy/ElasticSearch/Utils/__pycache__/VectorDBUtil.cpython-310.pyc
deleted file mode 100644
index 33f9531bb716529a650554ebca6d1cfbf90a3bae..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 4136
[binary data omitted]

diff --git a/dsSchoolBuddy/ElasticSearch/Utils/__pycache__/__init__.cpython-310.pyc b/dsSchoolBuddy/ElasticSearch/Utils/__pycache__/__init__.cpython-310.pyc
deleted file mode 100644
GIT binary patch
literal 0
HcmV?d00001

literal 158
[binary data omitted]

diff --git a/dsSchoolBuddy/ElasticSearch/__init__.py b/dsSchoolBuddy/ElasticSearch/__init__.py
deleted file mode 100644
index e69de29b..00000000
diff --git a/dsSchoolBuddy/ElasticSearch/__pycache__/__init__.cpython-310.pyc b/dsSchoolBuddy/ElasticSearch/__pycache__/__init__.cpython-310.pyc
deleted file mode 100644
index 94ae6ce2baf4d4f283db992edbc385bb28d4e8c7..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 144
[binary data omitted]

diff --git a/dsSchoolBuddy/Start.py b/dsSchoolBuddy/Start.py
deleted file mode 100644
index 2b67a2f2..00000000
--- a/dsSchoolBuddy/Start.py
+++ /dev/null
@@ -1,212 +0,0 @@
-# pip install jieba
-import json
-import logging
-import time
-import jieba
-import fastapi
-import uvicorn
-from fastapi import FastAPI, HTTPException
-from openai import AsyncOpenAI
-from sse_starlette import EventSourceResponse
-import uuid
-
-from Config import Config
-from ElasticSearch.Utils.EsSearchUtil import EsSearchUtil
-
-# Initialise logging
-logger = logging.getLogger(__name__)
-logger.setLevel(logging.INFO)
-
-# Initialise the async OpenAI client
-client = AsyncOpenAI(
-    api_key=Config.ALY_LLM_API_KEY,
-    base_url=Config.ALY_LLM_BASE_URL
-)
-
-# Initialise the ElasticSearch helper
-search_util = EsSearchUtil(Config.ES_CONFIG)
-
-async def lifespan(_: FastAPI):
-    yield
-
-app = FastAPI(lifespan=lifespan)
-
-@app.post("/api/teaching_chat")
-async def teaching_chat(request: fastapi.Request):
-    """
-    Look up related conversation history for the user's input,
-    then call the LLM to generate the answer
-    """
-    try:
-        data = await request.json()
-        user_id = data.get('user_id', 'anonymous')
-        query = data.get('query', '')
-        session_id = data.get('session_id', str(uuid.uuid4()))  # reuse or create a session ID
-        include_history = data.get('include_history', True)
-
-        if not query:
-            raise HTTPException(status_code=400, detail="Query content must not be empty")
-
-        # 1. Initialise the conversation history and student info
-        if session_id not in search_util.conversation_history:
-            search_util.conversation_history[session_id] = []
-
-        # Load the student's info from ES if it is not cached yet
-        if user_id not in search_util.student_info:
-            # Load student info from ES
-            info_from_es = search_util.get_student_info_from_es(user_id)
-            if info_from_es:
-                search_util.student_info[user_id] = info_from_es
-                logger.info(f"Loaded info for user {user_id} from ES: {info_from_es}")
-            else:
-                search_util.student_info[user_id] = {}
-
-        # 2. Extract student info with jieba segmentation
-        search_util.extract_student_info(query, user_id)
-
-        # Debug output
-        logger.info(f"Current student info: {search_util.student_info.get(user_id, {})}")
-
-        # Generate tags for the user query and store it in ES
-        current_time = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
-        tags = [user_id, f"time:{current_time.split()[0]}", f"session:{session_id}"]
-
-        # Extract keywords from the query as extra tags, using jieba
-        try:
-            seg_list = jieba.cut(query, cut_all=False)  # precise mode
-            keywords = [kw for kw in seg_list if kw.strip() and kw not in search_util.STOPWORDS and len(kw) > 1]
-            keywords = keywords[:5]
-            tags.extend([f"keyword:{kw}" for kw in keywords])
-            logger.info(f"Keywords extracted with jieba: {keywords}")
-        except Exception as e:
-            logger.error(f"Segmentation failed: {str(e)}")
-            keywords = query.split()[:5]
-            tags.extend([f"keyword:{kw}" for kw in keywords if kw.strip()])
-
-        # Store the query in ES
-        try:
-            search_util.insert_long_text_to_es(query, tags)
-            logger.info(f"Query from user {user_id} stored in ES with tags: {tags}")
-        except Exception as e:
-            logger.error(f"Failed to store the user query in ES: {str(e)}")
-
-        # 3. Build the conversation history context
-        history_context = ""
-        if include_history and session_id in search_util.conversation_history:
-            # Take the most recent rounds of conversation
-            recent_history = search_util.conversation_history[session_id][-search_util.MAX_HISTORY_ROUNDS:]
-            if recent_history:
-                history_context = "\n\nBelow is the recent conversation history for reference:\n"
-                for i, (user_msg, ai_msg) in enumerate(recent_history, 1):
-                    history_context += f"[Round {i}] User: {user_msg}\n"
-                    history_context += f"[Round {i}] Teacher: {ai_msg}\n"
-
-        # 4. Build the student info context
-        student_context = ""
-        if user_id in search_util.student_info and search_util.student_info[user_id]:
-            student_context = "\n\nStudent profile:\n"
-            for key, value in search_util.student_info[user_id].items():
-                if key == 'grade':
-                    student_context += f"- Grade: {value}\n"
-                else:
-                    student_context += f"- {key}: {value}\n"
-
-        # 5. Build the prompt
-        system_prompt = """
-        You are an approachable teacher with flexible teaching methods who helps students master knowledge by guiding them to learn on their own.
-
-        Strictly follow these teaching rules:
-        1. Adapt teaching to the student: if you already know the student's grade level and background, adjust the content and difficulty accordingly.
-        2. Build on existing knowledge: connect new ideas to what the student already knows.
-        3. Guide rather than lecture: use questions, hints and small steps so the student discovers answers themselves.
-        4. Check and reinforce: after explaining a difficult point, confirm the student can restate or apply the concept.
-        5. Vary the pace: mix explanation, questions and interactive activities so teaching feels like a conversation rather than a lecture.
-
-        Most importantly: do not hand out answers directly; help the student find them through collaboration and guidance built on what they already know.
-        """
-
-        # Append the student info to the system prompt
-        if user_id in search_util.student_info and search_util.student_info[user_id]:
-            student_info_str = "\n\nStudent profile:\n"
-            for key, value in search_util.student_info[user_id].items():
-                if key == 'grade':
-                    student_info_str += f"- Grade: {value}\n"
-                else:
-                    student_info_str += f"- {key}: {value}\n"
-            system_prompt += student_info_str
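
For clarity, the message list the handler assembles before the streaming call (next
step) looks roughly like this, with the optional entries only present when a profile or
history exists (values abridged, this is an illustration rather than live code):

    messages = [
        {'role': 'system', 'content': system_prompt.strip()},
        {'role': 'user', 'content': student_context.strip()},   # only if a profile exists
        {'role': 'user', 'content': history_context.strip()},   # only if history exists
        {'role': 'user', 'content': query},
    ]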
-
-        # 6. Stream the LLM answer
-        async def generate_response_stream():
-            try:
-                # Build the message list
-                messages = [{'role': 'system', 'content': system_prompt.strip()}]
-
-                # Add the student info (if any)
-                if student_context:
-                    messages.append({'role': 'user', 'content': student_context.strip()})
-
-                # Add the conversation history (if any)
-                if history_context:
-                    messages.append({'role': 'user', 'content': history_context.strip()})
-
-                # Add the current question
-                messages.append({'role': 'user', 'content': query})
-
-                stream = await client.chat.completions.create(
-                    model=Config.ALY_LLM_MODEL_NAME,
-                    messages=messages,
-                    max_tokens=8000,
-                    stream=True
-                )
-
-                # Collect the full answer so it can be saved afterwards
-                full_answer = []
-                async for chunk in stream:
-                    if chunk.choices[0].delta.content:
-                        full_answer.append(chunk.choices[0].delta.content)
-                        yield f"data: {json.dumps({'reply': chunk.choices[0].delta.content}, ensure_ascii=False)}\n\n"
-
-                # Save the answer to ES and to the conversation history
-                if full_answer:
-                    answer_text = ''.join(full_answer)
-                    search_util.extract_student_info(answer_text, user_id)
-                    try:
-                        # Tag the answer
-                        answer_tags = [f"{user_id}_answer", f"time:{current_time.split()[0]}", f"session:{session_id}"]
-                        try:
-                            seg_list = jieba.cut(answer_text, cut_all=False)
-                            answer_keywords = [kw for kw in seg_list if kw.strip() and kw not in search_util.STOPWORDS and len(kw) > 1]
-                            answer_keywords = answer_keywords[:5]
-                            answer_tags.extend([f"keyword:{kw}" for kw in answer_keywords])
-                        except Exception as e:
-                            logger.error(f"Answer segmentation failed: {str(e)}")
-
-                        search_util.insert_long_text_to_es(answer_text, answer_tags)
-                        logger.info(f"Answer for user {user_id} stored in ES")
-
-                        # Update the conversation history
-                        search_util.conversation_history[session_id].append((query, answer_text))
-                        # Cap the history at the maximum number of rounds
-                        if len(search_util.conversation_history[session_id]) > search_util.MAX_HISTORY_ROUNDS:
-                            search_util.conversation_history[session_id].pop(0)
-
-                    except Exception as e:
-                        logger.error(f"Failed to store the answer in ES: {str(e)}")
-
-            except Exception as e:
-                logger.error(f"LLM call failed: {str(e)}")
-                yield f"data: {json.dumps({'error': f'Failed to generate an answer: {str(e)}'})}\n\n"
-
-        return EventSourceResponse(generate_response_stream())
-
-    except HTTPException as e:
-        logger.error(f"Chat endpoint error: {str(e.detail)}")
-        raise e
-    except Exception as e:
-        logger.error(f"Chat endpoint exception: {str(e)}")
-        raise HTTPException(status_code=500, detail=f"Failed to handle the request: {str(e)}")
-
-
-
-if __name__ == "__main__":
-    uvicorn.run(app, host="0.0.0.0", port=8000)
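
A quick way to exercise this endpoint while it still existed (a sketch; it assumes the
server is running locally on port 8000 and that the requests package is installed).
Note the handler already prefixes its payload with "data: " before sse-starlette adds
its own framing, so the parser below strips the prefix leniently:

    import json
    import requests

    payload = {"user_id": "stu001", "query": "我是三年级的学生,怎么学好数学?",
               "include_history": True}
    with requests.post("http://localhost:8000/api/teaching_chat",
                       json=payload, stream=True) as r:
        for line in r.iter_lines(decode_unicode=True):
            if not line:
                continue
            while line.startswith("data: "):
                line = line[len("data: "):]
            try:
                print(json.loads(line).get("reply", ""), end="", flush=True)
            except ValueError:
                pass  # skip non-JSON keepalive or framing lines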