diff --git a/dsLightRag/Test/G2_TeachingStudent.py b/dsLightRag/Test/G2_TeachingStudent.py index c819af6c..8470248c 100644 --- a/dsLightRag/Test/G2_TeachingStudent.py +++ b/dsLightRag/Test/G2_TeachingStudent.py @@ -3,45 +3,39 @@ import sys from Util import LlmUtil -def initialize_chat_history(): - """初始化对话历史,包含系统提示""" - system_prompt = """ - STRICT RULES -Be an approachable-yet-dynamic teacher,who helps the user learn by guiding -them through their studies. - -1.Get to know the user.lf you don't know their goals or grade level,ask the -user before diving in.(Keep this lightweight!)If they don't answer,aim for -explanations that would make sense to a10th grade student. - -2.Build on existing knowledge.Connect new ideas to what the user already -knows. - -3.Guide users,don't just give answers.Use questions,hints,and small steps -so the user discovers the answer for themselves. - -4.Check and reinforce.After hard parts,confirm the user can restate or use the -idea.Offer quick summaries,mnemonics,or mini-reviews to help the ideas -stick. - -5.Vary the rhythm.Mix explanations,questions,and activities(like roleplaying, -practice rounds,or asking the user to teach you) so it feels like a conversation, -not alecture. - -Above all:DO NOT DO THE USER'S WORK FOR THEM. Don't answer homework questions - Help the user find the answer,by working -with them collaboratively and building from what they already know. +def get_system_prompt(): + """获取系统提示""" + return """ + 你是一位平易近人且教学方法灵活的教师,通过引导学生自主学习来帮助他们掌握知识。 + + 严格遵循以下教学规则: + 1. 首先了解学生情况:在开始讲解前,询问学生的年级水平和对勾股定理的了解程度。 + 2. 基于现有知识构建:将新思想与学生已有的知识联系起来。 + 3. 引导而非灌输:使用问题、提示和小步骤,让学生自己发现答案。 + 4. 检查和强化:在讲解难点后,确认学生能够重述或应用这些概念。 + 5. 变化节奏:混合讲解、提问和互动活动,让教学像对话而非讲座。 + + 最重要的是:不要直接给出答案,而是通过合作和基于学生已有知识的引导,帮助学生自己找到答案。 """ - return [{"role": "system", "content": system_prompt}] + + +def initialize_chat_history(): + """初始化对话历史""" + # 包含系统提示作为第一条消息 + return [{ + "role": "system", + "content": get_system_prompt() + }] if __name__ == "__main__": - # 初始化对话历史 + # 初始化对话历史(包含系统提示) chat_history = initialize_chat_history() - + # 欢迎消息 print("教师助手已启动。输入 'exit' 或 '退出' 结束对话。") print("你可以开始提问了,例如: '讲解一下勾股定理的证明'") - + # 多轮对话循环 while True: # 获取用户输入 @@ -55,16 +49,19 @@ if __name__ == "__main__": # 添加用户输入到对话历史 chat_history.append({"role": "user", "content": user_input}) - # 发送请求(传递完整对话历史) + # 发送请求(传递用户输入文本和系统提示) print("\n教师助手:") try: - # 调用LlmUtil获取响应,传递对话历史 - response_content = LlmUtil.get_llm_response(user_input, chat_history) - + # 调用LlmUtil获取响应,传递用户输入文本和系统提示 + response_content = LlmUtil.get_llm_response( + user_input, + system_prompt=get_system_prompt() + ) + # 打印响应 print(response_content) - - # 添加助手回复到对话历史 + + # 维护对话历史(仅本地记录,不传递给API) chat_history.append({"role": "assistant", "content": response_content}) except Exception as e: print(f"发生错误: {str(e)}") diff --git a/dsLightRag/Util/LlmUtil.py b/dsLightRag/Util/LlmUtil.py index 12db6a5c..e00c2a87 100644 --- a/dsLightRag/Util/LlmUtil.py +++ b/dsLightRag/Util/LlmUtil.py @@ -54,11 +54,12 @@ async def get_llm_response_async(query_text: str, stream: bool = True): yield f"处理请求时发生异常: {str(e)}" # 保留原同步函数 -def get_llm_response(query_text: str, stream: bool = True): +def get_llm_response(query_text: str, stream: bool = True, system_prompt: str = 'You are a helpful assistant.'): """ 获取大模型的响应 @param query_text: 查询文本 @param stream: 是否使用流式输出 + @param system_prompt: 系统提示文本,默认为'You are a helpful assistant.' @return: 完整响应文本 """ client = OpenAI( @@ -70,7 +71,7 @@ def get_llm_response(query_text: str, stream: bool = True): completion = client.chat.completions.create( model=LLM_MODEL_NAME, messages=[ - {'role': 'system', 'content': 'You are a helpful assistant.'}, + {'role': 'system', 'content': system_prompt}, {'role': 'user', 'content': query_text} ], stream=stream diff --git a/dsLightRag/Util/__pycache__/LlmUtil.cpython-310.pyc b/dsLightRag/Util/__pycache__/LlmUtil.cpython-310.pyc index 1ddb25a0..bd138d45 100644 Binary files a/dsLightRag/Util/__pycache__/LlmUtil.cpython-310.pyc and b/dsLightRag/Util/__pycache__/LlmUtil.cpython-310.pyc differ diff --git a/dsRag/.idea/RAG.iml b/dsRag/.idea/RAG.iml index f1d90d5d..2ef181ef 100644 --- a/dsRag/.idea/RAG.iml +++ b/dsRag/.idea/RAG.iml @@ -4,7 +4,7 @@ - + diff --git a/dsRag/.idea/misc.xml b/dsRag/.idea/misc.xml index 913c162f..4ccb922a 100644 --- a/dsRag/.idea/misc.xml +++ b/dsRag/.idea/misc.xml @@ -3,5 +3,5 @@ - + \ No newline at end of file diff --git a/dsSchoolBuddy/.idea/.gitignore b/dsSchoolBuddy/.idea/.gitignore new file mode 100644 index 00000000..a7cdac76 --- /dev/null +++ b/dsSchoolBuddy/.idea/.gitignore @@ -0,0 +1,8 @@ +# 默认忽略的文件 +/shelf/ +/workspace.xml +# 基于编辑器的 HTTP 客户端请求 +/httpRequests/ +# Datasource local storage ignored files +/dataSources/ +/dataSources.local.xml diff --git a/dsSchoolBuddy/.idea/RAG.iml b/dsSchoolBuddy/.idea/RAG.iml new file mode 100644 index 00000000..2ef181ef --- /dev/null +++ b/dsSchoolBuddy/.idea/RAG.iml @@ -0,0 +1,12 @@ + + + + + + + + + + + + \ No newline at end of file diff --git a/dsSchoolBuddy/.idea/inspectionProfiles/profiles_settings.xml b/dsSchoolBuddy/.idea/inspectionProfiles/profiles_settings.xml new file mode 100644 index 00000000..105ce2da --- /dev/null +++ b/dsSchoolBuddy/.idea/inspectionProfiles/profiles_settings.xml @@ -0,0 +1,6 @@ + + + + \ No newline at end of file diff --git a/dsSchoolBuddy/.idea/jsLibraryMappings.xml b/dsSchoolBuddy/.idea/jsLibraryMappings.xml new file mode 100644 index 00000000..1bde7079 --- /dev/null +++ b/dsSchoolBuddy/.idea/jsLibraryMappings.xml @@ -0,0 +1,6 @@ + + + + + + \ No newline at end of file diff --git a/dsSchoolBuddy/.idea/misc.xml b/dsSchoolBuddy/.idea/misc.xml new file mode 100644 index 00000000..4ccb922a --- /dev/null +++ b/dsSchoolBuddy/.idea/misc.xml @@ -0,0 +1,7 @@ + + + + + + \ No newline at end of file diff --git a/dsSchoolBuddy/.idea/modules.xml b/dsSchoolBuddy/.idea/modules.xml new file mode 100644 index 00000000..23b163c9 --- /dev/null +++ b/dsSchoolBuddy/.idea/modules.xml @@ -0,0 +1,8 @@ + + + + + + + + \ No newline at end of file diff --git a/dsSchoolBuddy/.idea/vcs.xml b/dsSchoolBuddy/.idea/vcs.xml new file mode 100644 index 00000000..61aaccdb --- /dev/null +++ b/dsSchoolBuddy/.idea/vcs.xml @@ -0,0 +1,7 @@ + + + + + + + \ No newline at end of file diff --git a/dsSchoolBuddy/Config/Config.py b/dsSchoolBuddy/Config/Config.py new file mode 100644 index 00000000..94344ebc --- /dev/null +++ b/dsSchoolBuddy/Config/Config.py @@ -0,0 +1,27 @@ +# Elasticsearch配置 +ES_CONFIG = { + "hosts": "https://127.0.0.1:9200", + "basic_auth": ("elastic", "jv9h8uwRrRxmDi1dq6u8"), + "verify_certs": False, + "ssl_show_warn": False, + "index_name": "ds_kb" +} + +# 嵌入向量模型 +EMBED_MODEL_NAME = "BAAI/bge-m3" +EMBED_API_KEY = "sk-pbqibyjwhrgmnlsmdygplahextfaclgnedetybccknxojlyl" +EMBED_BASE_URL = "https://api.siliconflow.cn/v1" +EMBED_DIM = 1024 +EMBED_MAX_TOKEN_SIZE = 8192 + +# 重排模型 +RERANK_MODEL = 'BAAI/bge-reranker-v2-m3' +RERANK_BASE_URL = 'https://api.siliconflow.cn/v1/rerank' +RERANK_BINDING_API_KEY = 'sk-pbqibyjwhrgmnlsmdygplahextfaclgnedetybccknxojlyl' + +# 阿里云API信息【HZKJ】 +ALY_LLM_API_KEY = "sk-01d13a39e09844038322108ecdbd1bbc" +ALY_LLM_BASE_URL = "https://dashscope.aliyuncs.com/compatible-mode/v1" +ALY_LLM_MODEL_NAME = "qwen-plus" +#ALY_LLM_MODEL_NAME = "deepseek-r1" +# ALY_LLM_MODEL_NAME = "deepseek-v3" \ No newline at end of file diff --git a/dsSchoolBuddy/Config/__init__.py b/dsSchoolBuddy/Config/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/dsSchoolBuddy/Config/__pycache__/Config.cpython-310.pyc b/dsSchoolBuddy/Config/__pycache__/Config.cpython-310.pyc new file mode 100644 index 00000000..82eb2f5e Binary files /dev/null and b/dsSchoolBuddy/Config/__pycache__/Config.cpython-310.pyc differ diff --git a/dsSchoolBuddy/Config/__pycache__/__init__.cpython-310.pyc b/dsSchoolBuddy/Config/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 00000000..b3f99ca7 Binary files /dev/null and b/dsSchoolBuddy/Config/__pycache__/__init__.cpython-310.pyc differ diff --git a/dsSchoolBuddy/Doc/1、python环境配置.png b/dsSchoolBuddy/Doc/1、python环境配置.png new file mode 100644 index 00000000..ffc55842 Binary files /dev/null and b/dsSchoolBuddy/Doc/1、python环境配置.png differ diff --git a/dsSchoolBuddy/Doc/2、Conda维护.txt b/dsSchoolBuddy/Doc/2、Conda维护.txt new file mode 100644 index 00000000..021764bf --- /dev/null +++ b/dsSchoolBuddy/Doc/2、Conda维护.txt @@ -0,0 +1,30 @@ +# 添加Anaconda的TUNA镜像 +conda config --add channels https://mirrors.tuna.tsinghua.edu.cn/anaconda/pkgs/free/ + +# 设置搜索时显示通道地址 +conda config --set show_channel_urls yes + +# 创建虚拟环境 +conda create -n rag python=3.10 + +# 查看当前存在哪些虚拟环境 +conda env list +conda info -e + +# 查看安装了哪些包 +conda list + +# 激活虚拟环境 +conda activate rag + +# 对虚拟环境中安装额外的包 +conda install -n rag $package_name + +# 删除虚拟环境 +conda remove -n rag --all + +# 删除环境中的某个包 +conda remove --name rag $package_name + +# 恢复默认镜像 +conda config --remove-key channels diff --git a/dsSchoolBuddy/Doc/3、Pip维护.txt b/dsSchoolBuddy/Doc/3、Pip维护.txt new file mode 100644 index 00000000..081e047b --- /dev/null +++ b/dsSchoolBuddy/Doc/3、Pip维护.txt @@ -0,0 +1,16 @@ +# 激活虚拟环境 +conda activate rag + +# 永久修改pip源为阿里云镜像源(适用于Windows系统) +pip config set global.index-url https://mirrors.aliyun.com/pypi/simple/ + +# 验证是否修改成功 +pip config list + +global.index-url='https://mirrors.aliyun.com/pypi/simple/' + +# 获取依赖了哪些包 +pip freeze > requirements.txt + +# 新机器安装包 +pip install -r D:\dsWork\dsProject\dsRag\requirements.txt \ No newline at end of file diff --git a/dsSchoolBuddy/Doc/4、Elasticsearch安装配置文档.md b/dsSchoolBuddy/Doc/4、Elasticsearch安装配置文档.md new file mode 100644 index 00000000..cac6d013 --- /dev/null +++ b/dsSchoolBuddy/Doc/4、Elasticsearch安装配置文档.md @@ -0,0 +1,132 @@ +### 一、安装 $ES$ + +**1、下载安装包** + +进入官网下载$linux$安装包 [下载地址](https://www.elastic.co/cn/downloads/elasticsearch) + +![img](https://i-blog.csdnimg.cn/direct/04ae4c7f65fe475fb19e913eaf80ba04.png) + +**2、安装$JDK$$21$** + +```sh +sudo yum install java-21-openjdk-devel +echo 'export JAVA_HOME=/usr/lib/jvm/java-21-openjdk +export PATH=$JAVA_HOME/bin:$PATH' >> ~/.bashrc + +source ~/.bashrc +``` + +**3、上传文件到$linux$服务器** + +```sh +# 如果没有 rz 命令 先安装 +yum -y install lrzsz + +# rz 打开弹窗 选择下载好的文件 确认 在哪个目录下执行,就会上传到该目录下 +rz -be +``` + + **4、新建用户并设置密码** + +```sh +# 创建用户 +useradd elauser + +# 设置密码 符合密码规范 大写 + 小写 + 数字 + 特殊字符 + 大于八位 +passwd elauser + +#输入密码: +DsideaL@123 + +tar -zxvf elasticsearch-9.0.2-linux-x86_64.tar.gz +sudo chown -R elauser:elauser /usr/local/elasticsearch-9.0.2 +# 进入解压文件并编辑配置文件 +cd elasticsearch-9.0.2/config +vi elasticsearch.yml +# 修改数据目录和日志目录 +mkdir -p /usr/local/elasticsearch-9.0.2/data +mkdir -p /usr/local/elasticsearch-9.0.2/logs +``` + +![](https://dsideal.obs.cn-north-1.myhuaweicloud.com/HuangHai/BlogImages/{year}/{month}/{md5}.{extName}/20250623130022571.png) + +设置允许所有IP进行访问,在添加下面参数让$elasticsearch-head$插件可以访问$es$ + +![](https://dsideal.obs.cn-north-1.myhuaweicloud.com/HuangHai/BlogImages/{year}/{month}/{md5}.{extName}/20250623130217136.png) + +```yaml +network.host: 0.0.0.0 +http.cors.enabled: true +http.cors.allow-origin: "*" +``` + +**5、修改系统配置** + +```sh +# m.max_map_count 值太低 +# 临时解决方案(需要root权限) +sudo sysctl -w vm.max_map_count=262144 + +# 永久解决方案(需要root权限) +echo "vm.max_map_count=262144" | sudo tee -a /etc/sysctl.conf +sudo sysctl -p + +# 验证是否有效 +sysctl vm.max_map_count +``` + +**6、启动** + +```sh +# 启动 +su - elauser + +cd /usr/local/elasticsearch-9.0.2/bin +# ./elasticsearch-keystore create + +# 启动 -d = damon 守护进程 +./elasticsearch -d + + +# 访问地址 +https://10.10.14.206:9200 + +# 日志文件 +/usr/local/elasticsearch-9.0.2/logs/elasticsearch.log +``` + + 弹出输入账号密码,这里需要重置下密码,再登录 进入安装目录的bin目录下 + +执行下面命令 就会在控制台打印出新密码 账号就是 elastic + +``` +./elasticsearch-reset-password -u elastic +``` + +![](https://dsideal.obs.cn-north-1.myhuaweicloud.com/HuangHai/BlogImages/{year}/{month}/{md5}.{extName}/20250623132315148.png) + +登录成功,完活。 + +```sh +elastic +jv9h8uwRrRxmDi1dq6u8 +``` + + + +![](https://dsideal.obs.cn-north-1.myhuaweicloud.com/HuangHai/BlogImages/{year}/{month}/{md5}.{extName}/20250623132417828.png) + +> **注意**:如果访问不到,请检查是否开启了$VPN$ + +### 二、安装$ik$中文分词插件 + +```bash +# 安装分词插件 +./bin/elasticsearch-plugin install https://get.infini.cloud/elasticsearch/analysis-ik/9.0.2 + +# 检查插件列表 +[elauser@maxkb elasticsearch-9.0.2]$ ./bin/elasticsearch-plugin list +analysis-ik +``` + +![](https://dsideal.obs.cn-north-1.myhuaweicloud.com/HuangHai/BlogImages/{year}/{month}/{md5}.{extName}/20250623133924355.png) diff --git a/dsSchoolBuddy/ElasticSearch/T1_RebuildMapping.py b/dsSchoolBuddy/ElasticSearch/T1_RebuildMapping.py new file mode 100644 index 00000000..0d5799bf --- /dev/null +++ b/dsSchoolBuddy/ElasticSearch/T1_RebuildMapping.py @@ -0,0 +1,47 @@ +import warnings + +from elasticsearch import Elasticsearch + +from Config import Config + +# 抑制HTTPS相关警告 +warnings.filterwarnings('ignore', message='Connecting to .* using TLS with verify_certs=False is insecure') +warnings.filterwarnings('ignore', message='Unverified HTTPS request is being made to host') + +# 初始化ES连接 +es = Elasticsearch( + hosts=Config.ES_CONFIG['hosts'], + basic_auth=Config.ES_CONFIG['basic_auth'], + verify_certs=False +) + +# 定义mapping结构 +mapping = { + "mappings": { + "properties": { + "embedding": { + "type": "dense_vector", + "dims": 200, # embedding维度为200 + "index": True, + "similarity": "l2_norm" # 使用L2距离 + }, + "user_input": {"type": "text"}, + "tags": { + "type": "object", + "properties": { + "tags": {"type": "keyword"}, + "full_content": {"type": "text"} + } + } + } + } +} + +# 创建索引 +index_name = Config.ES_CONFIG['index_name'] +if es.indices.exists(index=index_name): + es.indices.delete(index=index_name) + print(f"删除已存在的索引 '{index_name}'") + +es.indices.create(index=index_name, body=mapping) +print(f"索引 '{index_name}' 创建成功,mapping结构已设置。") \ No newline at end of file diff --git a/dsSchoolBuddy/ElasticSearch/T2_SplitTxt.py b/dsSchoolBuddy/ElasticSearch/T2_SplitTxt.py new file mode 100644 index 00000000..7cfa4a61 --- /dev/null +++ b/dsSchoolBuddy/ElasticSearch/T2_SplitTxt.py @@ -0,0 +1,191 @@ +import os +import re +import shutil +import warnings +import zipfile + +from docx import Document +from docx.oxml.ns import nsmap + +from Util import DocxUtil + +# 抑制HTTPS相关警告 +warnings.filterwarnings('ignore', message='Connecting to .* using TLS with verify_certs=False is insecure') +warnings.filterwarnings('ignore', message='Unverified HTTPS request is being made to host') + + +def extract_images_from_docx(docx_path, output_folder): + """ + 从docx提取图片并记录位置 + :param docx_path: Word文档路径 + :param output_folder: 图片输出文件夹 + :return: 包含图片路径和位置的列表 + """ + # 从docx_path 的名称示例:小学数学教学中的若干问题_MATH_1.docx + # 则图片的前缀统一为 MATH_1_?.docx ,其中 ? 为数字,表示图片的序号 + # 先获取到前缀 + a = docx_path.split("_") + prefix = a[1] + "_" + a[2].split(".")[0] + # print(f"图片前缀为:{prefix}") + # 创建一个List 记录每个图片的名称和序号 + image_data = [] + # 创建临时解压目录 + temp_dir = os.path.join(output_folder, "temp_docx") + os.makedirs(temp_dir, exist_ok=True) + + # 解压docx文件 + with zipfile.ZipFile(docx_path, 'r') as zip_ref: + zip_ref.extractall(temp_dir) + + # 读取主文档关系 + with open(os.path.join(temp_dir, 'word', '_rels', 'document.xml.rels'), 'r') as rels_file: + rels_content = rels_file.read() + + # 加载主文档 + doc = Document(docx_path) + img_counter = 1 + + # 遍历所有段落 + for para_idx, paragraph in enumerate(doc.paragraphs): + for run_idx, run in enumerate(paragraph.runs): + # 检查运行中的图形 + for element in run._element: + if element.tag.endswith('drawing'): + # 提取图片关系ID + blip = element.find('.//a:blip', namespaces=nsmap) + if blip is not None: + embed_id = blip.get('{%s}embed' % nsmap['r']) + + # 从关系文件中获取图片文件名 + rel_entry = f' 2 else line + elif line and line[0].isdigit(): + line = line[1:] + line = line.strip() + if in_block and line: # 只添加非空行 + current_block.append(line) + + if current_block: + blocks.append('\n'.join(current_block)) + + return [(i + 1, block) for i, block in enumerate(blocks)] + + +def save_to_txt(content, file_path, mode='w'): + """将内容保存到文本文件""" + try: + with open(file_path, mode, encoding='utf-8') as f: + f.write(content) + return True + except Exception as e: + print(f"保存文件{file_path}时出错: {str(e)}") + return False + + +class ImageReplacer: + def __init__(self, image_list): + self.image_list = image_list + self.current_idx = 0 + + def replace(self, match): + if self.current_idx < len(self.image_list): + result = f"![](./Images/{self.image_list[self.current_idx]})" + self.current_idx += 1 + return result + return match.group() + + +def process_document(docx_file, txt_output_dir, img_output_dir): + # 提取图片 + listImage = extract_images_from_docx(docx_file, img_output_dir) + print(f"图片数量为:{len(listImage)}") + + # 读取内容 + res = DocxUtil.get_docx_content_by_pandoc(docx_file) + # 分块 + chunks = split_into_blocks(res) + saved_count = 0 + + # 使用原来的正则表达式 + pattern = re.compile(r'【图片\d+】') + # 创建图片替换器 + replacer = ImageReplacer(listImage) + + for x in chunks: + firstLine = x[1].split("\n")[0].strip() + content = x[1][len(firstLine):].strip() + + # 使用类方法替换图片 + content = pattern.sub(replacer.replace, content) + # 保存文本文件 + # 从docx文件名提取学科和编号 + docx_name = os.path.basename(docx_file).split('.')[0] + subject_part = '_'.join(docx_name.split('_')[-2:]) # 获取最后两部分如CHINESE_1 + output_file = os.path.join(txt_output_dir, f"{subject_part}_{x[0]}.txt") + full_content = f"{firstLine}\n{content}" + if save_to_txt(full_content, output_file, mode='w'): + saved_count += 1 + + print(f"处理完成,共保存{saved_count}个文件到目录: {txt_output_dir}") + + +if __name__ == "__main__": + txt_output_dir = "../Txt/" + img_output_dir = "../static/Images/" + # 清空上面的两个输出目录,用os进行删除,在Windows环境中进行 + if os.path.exists(txt_output_dir): + shutil.rmtree(txt_output_dir) + if os.path.exists(img_output_dir): + shutil.rmtree(img_output_dir) + # 创建输出目录 + os.makedirs(txt_output_dir, exist_ok=True) + os.makedirs(img_output_dir, exist_ok=True) + + # 遍历static/Txt/下所有的docx + for filename in os.listdir("../static/Txt/"): + print("正在处理文件:" + filename) + # 这里需要文件的全称路径 + filename = os.path.join("../static/Txt/", filename) + process_document(filename, txt_output_dir, img_output_dir) diff --git a/dsSchoolBuddy/ElasticSearch/T3_InsertData.py b/dsSchoolBuddy/ElasticSearch/T3_InsertData.py new file mode 100644 index 00000000..a2d8921b --- /dev/null +++ b/dsSchoolBuddy/ElasticSearch/T3_InsertData.py @@ -0,0 +1,75 @@ +import warnings + +from Config import Config +from Config.Config import * +from elasticsearch import Elasticsearch +from gensim.models import KeyedVectors +import jieba +import os +import time + +# 抑制HTTPS相关警告 +warnings.filterwarnings('ignore', message='Connecting to .* using TLS with verify_certs=False is insecure') +warnings.filterwarnings('ignore', message='Unverified HTTPS request is being made to host') + +# 1. 加载预训练的 Word2Vec 模型 +model_path = MODEL_PATH +model = KeyedVectors.load_word2vec_format(model_path, binary=False, limit=MODEL_LIMIT) +print(f"模型加载成功,词向量维度: {model.vector_size}") + + +# 功能:将文本转换为嵌入向量 +def text_to_embedding(text): + words = jieba.lcut(text) + embeddings = [model[word] for word in words if word in model] + if embeddings: + return sum(embeddings) / len(embeddings) + return [0.0] * model.vector_size + + +# 2. 初始化Elasticsearch连接 +es = Elasticsearch( + hosts=Config.ES_CONFIG['hosts'], + basic_auth=Config.ES_CONFIG['basic_auth'], + verify_certs=False +) + +# 3. 处理processed_chunks目录下的所有文件 +txt_dir = os.path.join(os.path.dirname(os.path.dirname(os.path.abspath(__file__))), 'Txt') + +for filename in os.listdir(txt_dir): + if filename.endswith('.txt'): + filepath = os.path.join(txt_dir, filename) + with open(filepath, 'r', encoding='utf-8') as f: + # 只读取第一行作为向量计算 + first_line = f.readline().strip() + # 读取全部内容用于后续查询 + full_content = first_line + '\n' + f.read() + + if not first_line: + print(f"跳过空文件: {filename}") + continue + + print(f"正在处理文件: {filename}") + + # 4. 获取当前时间和会话ID + timestamp = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) + # 需要进行标记的标签 + x = filename.split("_") + selectedTags = [x[0] + "_" + x[1]] + tags = {"tags": selectedTags, "full_content": full_content} # 添加完整内容 + + # 5. 将第一行文本转换为嵌入向量 + embedding = text_to_embedding(first_line) + + # 6. 插入数据到Elasticsearch + doc = { + 'tags': tags, + 'user_input': first_line, + 'timestamp': timestamp, + 'embedding': embedding + } + es.index(index=ES_CONFIG['index_name'], document=doc) + print(f"文件 {filename} 数据插入成功") + +print("所有文件处理完成") diff --git a/dsSchoolBuddy/ElasticSearch/T4_SelectAllData.py b/dsSchoolBuddy/ElasticSearch/T4_SelectAllData.py new file mode 100644 index 00000000..99418857 --- /dev/null +++ b/dsSchoolBuddy/ElasticSearch/T4_SelectAllData.py @@ -0,0 +1,55 @@ +import warnings + +from elasticsearch import Elasticsearch + +from Config import Config + +# 抑制HTTPS相关警告 +warnings.filterwarnings('ignore', message='Connecting to .* using TLS with verify_certs=False is insecure') +warnings.filterwarnings('ignore', message='Unverified HTTPS request is being made to host') + + +# 初始化Elasticsearch连接 +es = Elasticsearch( + hosts=Config.ES_CONFIG['hosts'], + basic_auth=Config.ES_CONFIG['basic_auth'], + verify_certs=False +) + +# 查询所有数据 +def select_all_data(index_name): + try: + # 构建查询条件 - 匹配所有文档 + # 修改查询条件为获取前10条数据 + query = { + "query": { + "match_all": {} + }, + "size": 1000 # 仅获取10条数据 + } + + # 执行查询 + response = es.search(index=index_name, body=query) + hits = response['hits']['hits'] + + if not hits: + print(f"索引 {index_name} 中没有数据") + else: + print(f"索引 {index_name} 中共有 {len(hits)} 条数据:") + for i, hit in enumerate(hits, 1): + print(f"{i}. ID: {hit['_id']}") + print(f" 内容: {hit['_source'].get('user_input', '')}") + print(f" 标签: {hit['_source'].get('tags', '')}") + print(f" 时间戳: {hit['_source'].get('timestamp', '')}") + # 在循环中添加打印完整数据结构的代码 + if i == 1: # 只打印第一条数据的完整结构 + print("完整数据结构:") + import pprint + pp = pprint.PrettyPrinter(indent=4) + pp.pprint(hit['_source']) + print("-" * 50) + except Exception as e: + print(f"查询出错: {e}") + +if __name__ == "__main__": + select_all_data(Config.ES_CONFIG['index_name']) \ No newline at end of file diff --git a/dsSchoolBuddy/ElasticSearch/T5_SelectByTags.py b/dsSchoolBuddy/ElasticSearch/T5_SelectByTags.py new file mode 100644 index 00000000..ae5e5d62 --- /dev/null +++ b/dsSchoolBuddy/ElasticSearch/T5_SelectByTags.py @@ -0,0 +1,79 @@ +from elasticsearch import Elasticsearch +import warnings +from Config import Config +from Config.Config import ES_CONFIG +import urllib3 + +# 抑制HTTPS相关警告 +warnings.filterwarnings('ignore', message='Connecting to .* using TLS with verify_certs=False is insecure') +warnings.filterwarnings('ignore', message='Unverified HTTPS request is being made to host') + + +# 1. 初始化Elasticsearch连接 +es = Elasticsearch( + hosts=Config.ES_CONFIG['hosts'], + basic_auth=Config.ES_CONFIG['basic_auth'], + verify_certs=False +) + +# 2. 直接在代码中指定要查询的标签 +query_tag = ["MATH_1"] # 可以修改为其他需要的标签 + +# 3. 构建查询条件 +query = { + "query": { + "bool": { + "should": [ + { + "terms": { + "tags.tags": query_tag + } + } + ], + "minimum_should_match": 1 + + } + }, + "size": 1000 +} + +# 4. 执行查询并处理结果 +try: + response = es.search(index="knowledge_base", body=query) + hits = response['hits']['hits'] + + if not hits: + print(f"未找到标签为 '{query_tag}' 的数据。") + else: + print(f"找到 {len(hits)} 条标签为 '{query_tag}' 的数据:") + for i, hit in enumerate(hits, 1): + print(f"{i}. ID: {hit['_id']}") + print(f" 内容: {hit['_source'].get('user_input', '')}") + print(f" 标签: {hit['_source']['tags']['tags']}") + print("-" * 50) +except Exception as e: + print(f"查询执行失败,错误详情: {str(e)}") + print(f"查询条件: {query}") + if hasattr(e, 'info') and isinstance(e.info, dict): + print(f"Elasticsearch错误详情: {e.info}") + if hasattr(e, 'status_code'): + print(f"HTTP状态码: {e.status_code}") + print(f"查询出错: {str(e)}") + +# 4. 执行查询 +try: + results = es.search(index=ES_CONFIG['index_name'], body=query) + print(f"查询标签 '{query_tag}' 结果:") + if results['hits']['hits']: + for hit in results['hits']['hits']: + doc = hit['_source'] + print(f"ID: {hit['_id']}") + print(f"标签: {doc['tags']['tags']}") + print(f"用户问题: {doc['user_input']}") + print(f"时间: {doc['timestamp']}") + print(f"向量: {doc['embedding'][:5]}...") + print("-" * 40) + else: + print(f"未找到标签为 '{query_tag}' 的数据。") +except Exception as e: + print(f"查询失败: {e}") \ No newline at end of file diff --git a/dsSchoolBuddy/ElasticSearch/T6_XiangLiangQuery.py b/dsSchoolBuddy/ElasticSearch/T6_XiangLiangQuery.py new file mode 100644 index 00000000..215ab7ad --- /dev/null +++ b/dsSchoolBuddy/ElasticSearch/T6_XiangLiangQuery.py @@ -0,0 +1,109 @@ +import logging +import warnings + +from Config.Config import ES_CONFIG +from Util.EsSearchUtil import EsSearchUtil + +# 初始化日志 +logger = logging.getLogger(__name__) +logger.setLevel(logging.INFO) + +# 初始化EsSearchUtil +esClient = EsSearchUtil(ES_CONFIG) +# 抑制HTTPS相关警告 +warnings.filterwarnings('ignore', message='Connecting to .* using TLS with verify_certs=False is insecure') +warnings.filterwarnings('ignore', message='Unverified HTTPS request is being made to host') + +if __name__ == "__main__": + # 测试查询 + # query = "小学数学中有哪些模型" + query = "文言虚词" + query_tags = ["MATH_1"] # 默认搜索标签,可修改 + print(f"\n=== 开始执行查询 ===") + print(f"原始查询文本: {query}") + + # 执行混合搜索 + es_conn = esClient.es_pool.get_connection() + try: + # 向量搜索 + print("\n=== 向量搜索阶段 ===") + print("1. 文本分词和向量化处理中...") + query_embedding = esClient.text_to_embedding(query) + print(f"2. 生成的查询向量维度: {len(query_embedding)}") + print(f"3. 前3维向量值: {query_embedding[:3]}") + + print("4. 正在执行Elasticsearch向量搜索...") + vector_results = es_conn.search( + index=ES_CONFIG['index_name'], + body={ + "query": { + "script_score": { + "query": { + "bool": { + "should": [ + { + "terms": { + "tags.tags": query_tags + } + } + ], + "minimum_should_match": 1 + } + }, + "script": { + "source": "double score = cosineSimilarity(params.query_vector, 'embedding'); return score >= 0 ? score : 0", + "params": {"query_vector": query_embedding} + } + } + }, + "size": 3 + } + ) + print(f"5. 向量搜索结果数量: {len(vector_results['hits']['hits'])}") + + # 文本精确搜索 + print("\n=== 文本精确搜索阶段 ===") + print("1. 正在执行Elasticsearch文本精确搜索...") + text_results = es_conn.search( + index=ES_CONFIG['index_name'], + body={ + "query": { + "bool": { + "must": [ + { + "match": { + "user_input": query + } + }, + { + "terms": { + "tags.tags": query_tags + } + } + ] + } + }, + "size": 3 + } + ) + print(f"2. 文本搜索结果数量: {len(text_results['hits']['hits'])}") + + # 打印详细结果 + print("\n=== 最终搜索结果 ===") + + vector_int = 0 + for i, hit in enumerate(vector_results['hits']['hits'], 1): + if hit['_score'] > 0.4: # 阀值0.4 + print(f" {i}. 文档ID: {hit['_id']}, 相似度分数: {hit['_score']:.2f}") + print(f" 内容: {hit['_source']['user_input']}") + vector_int = vector_int + 1 + print(f" 向量搜索结果: {vector_int}条") + + print("\n文本精确搜索结果:") + for i, hit in enumerate(text_results['hits']['hits']): + print(f" {i + 1}. 文档ID: {hit['_id']}, 匹配分数: {hit['_score']:.2f}") + print(f" 内容: {hit['_source']['user_input']}") + # print(f" 详细: {hit['_source']['tags']['full_content']}") + + finally: + esClient.es_pool.release_connection(es_conn) diff --git a/dsSchoolBuddy/ElasticSearch/Utils/ElasticsearchCollectionManager.py b/dsSchoolBuddy/ElasticSearch/Utils/ElasticsearchCollectionManager.py new file mode 100644 index 00000000..4db6a138 --- /dev/null +++ b/dsSchoolBuddy/ElasticSearch/Utils/ElasticsearchCollectionManager.py @@ -0,0 +1,111 @@ +from elasticsearch import Elasticsearch +from elasticsearch.exceptions import NotFoundError +import logging + +logger = logging.getLogger(__name__) + + +class ElasticsearchCollectionManager: + def __init__(self, index_name): + """ + 初始化Elasticsearch索引管理器 + :param index_name: Elasticsearch索引名称 + """ + self.index_name = index_name + + def load_collection(self, es_connection): + """ + 加载索引,如果不存在则创建 + :param es_connection: Elasticsearch连接 + """ + try: + if not es_connection.indices.exists(index=self.index_name): + logger.warning(f"Index {self.index_name} does not exist, creating new index") + self._create_index(es_connection) + except Exception as e: + logger.error(f"Failed to load collection: {str(e)}") + raise + + def _create_index(self, es_connection): + """创建新的Elasticsearch索引""" + mapping = { + "mappings": { + "properties": { + "user_input": {"type": "text"}, + "tags": { + "type": "object", + "properties": { + "tags": {"type": "keyword"}, + "full_content": {"type": "text"} + } + }, + "timestamp": {"type": "date"}, + "embedding": {"type": "dense_vector", "dims": 200} + } + } + } + es_connection.indices.create(index=self.index_name, body=mapping) + + def search(self, es_connection, query_embedding, search_params, expr=None, limit=5): + """ + 执行混合搜索(向量+关键字) + :param es_connection: Elasticsearch连接 + :param query_embedding: 查询向量 + :param search_params: 搜索参数 + :param expr: 过滤表达式 + :param limit: 返回结果数量 + :return: 搜索结果 + """ + try: + # 构建查询 + query = { + "query": { + "script_score": { + "query": { + "bool": { + "must": [] + } + }, + "script": { + "source": "cosineSimilarity(params.query_vector, 'embedding') + 1.0", + "params": {"query_vector": query_embedding} + } + } + }, + "size": limit + } + + # 添加标签过滤条件 + if expr: + query["query"]["script_score"]["query"]["bool"]["must"].append( + {"nested": { + "path": "tags", + "query": { + "terms": {"tags.tags": expr.split(" OR ")} + } + }} + ) + + logger.info(f"Executing search with query: {query}") + response = es_connection.search(index=self.index_name, body=query) + return response["hits"]["hits"] + except Exception as e: + logger.error(f"Search failed: {str(e)}") + raise + + def query_by_id(self, es_connection, doc_id): + """ + 根据ID查询文档 + :param es_connection: Elasticsearch连接 + :param doc_id: 文档ID + :return: 文档内容 + """ + try: + response = es_connection.get(index=self.index_name, id=doc_id) + return response["_source"] + except NotFoundError: + logger.warning(f"Document with id {doc_id} not found") + return None + except Exception as e: + logger.error(f"Failed to query document by id: {str(e)}") + raise \ No newline at end of file diff --git a/dsSchoolBuddy/ElasticSearch/Utils/ElasticsearchConnectionPool.py b/dsSchoolBuddy/ElasticSearch/Utils/ElasticsearchConnectionPool.py new file mode 100644 index 00000000..88a877fc --- /dev/null +++ b/dsSchoolBuddy/ElasticSearch/Utils/ElasticsearchConnectionPool.py @@ -0,0 +1,65 @@ +from elasticsearch import Elasticsearch +import threading +import logging + +logger = logging.getLogger(__name__) + + +class ElasticsearchConnectionPool: + def __init__(self, hosts, basic_auth, verify_certs=False, max_connections=50): + """ + 初始化Elasticsearch连接池 + :param hosts: Elasticsearch服务器地址 + :param basic_auth: 认证信息(username, password) + :param verify_certs: 是否验证SSL证书 + :param max_connections: 最大连接数 + """ + self.hosts = hosts + self.basic_auth = basic_auth + self.verify_certs = verify_certs + self.max_connections = max_connections + self._connections = [] + self._lock = threading.Lock() + self._initialize_pool() + + def _initialize_pool(self): + """初始化连接池""" + for _ in range(self.max_connections): + self._connections.append(self._create_connection()) + + def _create_connection(self): + """创建新的Elasticsearch连接""" + return Elasticsearch( + hosts=self.hosts, + basic_auth=self.basic_auth, + verify_certs=self.verify_certs + ) + + def get_connection(self): + """从连接池获取一个连接""" + with self._lock: + if not self._connections: + logger.warning("Connection pool exhausted, creating new connection") + return self._create_connection() + return self._connections.pop() + + def release_connection(self, connection): + """释放连接回连接池""" + with self._lock: + if len(self._connections) < self.max_connections: + self._connections.append(connection) + else: + try: + connection.close() + except Exception as e: + logger.warning(f"Failed to close connection: {str(e)}") + + def close(self): + """关闭所有连接""" + with self._lock: + for conn in self._connections: + try: + conn.close() + except Exception as e: + logger.warning(f"Failed to close connection: {str(e)}") + self._connections.clear() \ No newline at end of file diff --git a/dsSchoolBuddy/ElasticSearch/Utils/__pycache__/ElasticsearchCollectionManager.cpython-310.pyc b/dsSchoolBuddy/ElasticSearch/Utils/__pycache__/ElasticsearchCollectionManager.cpython-310.pyc new file mode 100644 index 00000000..05971151 Binary files /dev/null and b/dsSchoolBuddy/ElasticSearch/Utils/__pycache__/ElasticsearchCollectionManager.cpython-310.pyc differ diff --git a/dsSchoolBuddy/ElasticSearch/Utils/__pycache__/ElasticsearchConnectionPool.cpython-310.pyc b/dsSchoolBuddy/ElasticSearch/Utils/__pycache__/ElasticsearchConnectionPool.cpython-310.pyc new file mode 100644 index 00000000..2c77d6cb Binary files /dev/null and b/dsSchoolBuddy/ElasticSearch/Utils/__pycache__/ElasticsearchConnectionPool.cpython-310.pyc differ diff --git a/dsSchoolBuddy/ElasticSearch/__init__.py b/dsSchoolBuddy/ElasticSearch/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/dsSchoolBuddy/ElasticSearch/__pycache__/__init__.cpython-310.pyc b/dsSchoolBuddy/ElasticSearch/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 00000000..94ae6ce2 Binary files /dev/null and b/dsSchoolBuddy/ElasticSearch/__pycache__/__init__.cpython-310.pyc differ diff --git a/dsSchoolBuddy/Start.py b/dsSchoolBuddy/Start.py new file mode 100644 index 00000000..239a94c3 --- /dev/null +++ b/dsSchoolBuddy/Start.py @@ -0,0 +1,171 @@ +import json +import subprocess +import tempfile +import urllib.parse +import uuid +import warnings +from io import BytesIO + +import fastapi +import uvicorn +from fastapi import FastAPI, HTTPException +from openai import AsyncOpenAI +from sse_starlette import EventSourceResponse +from starlette.responses import StreamingResponse +from starlette.staticfiles import StaticFiles + +from Config import Config +from Util.EsSearchUtil import * +from Util.MySQLUtil import init_mysql_pool + +# 初始化日志 +logger = logging.getLogger(__name__) +logger.setLevel(logging.INFO) + +# 配置日志处理器 +log_file = os.path.join(os.path.dirname(__file__), 'Logs', 'app.log') +os.makedirs(os.path.dirname(log_file), exist_ok=True) + +# 文件处理器 +file_handler = RotatingFileHandler( + log_file, maxBytes=1024 * 1024, backupCount=5, encoding='utf-8') +file_handler.setFormatter(logging.Formatter( + '%(asctime)s - %(name)s - %(levelname)s - %(message)s')) + +# 控制台处理器 +console_handler = logging.StreamHandler() +console_handler.setFormatter(logging.Formatter( + '%(asctime)s - %(name)s - %(levelname)s - %(message)s')) + +logger.addHandler(file_handler) +logger.addHandler(console_handler) + +# 初始化异步 OpenAI 客户端 +client = AsyncOpenAI( + api_key=Config.MODEL_API_KEY, + base_url=Config.MODEL_API_URL, +) + + +async def lifespan(app: FastAPI): + # 抑制HTTPS相关警告 + warnings.filterwarnings('ignore', message='Connecting to .* using TLS with verify_certs=False is insecure') + warnings.filterwarnings('ignore', message='Unverified HTTPS request is being made to host') + yield + + +app = FastAPI(lifespan=lifespan) + +# 挂载静态文件目录 +app.mount("/static", StaticFiles(directory="Static"), name="static") + + +@app.post("/api/save-word") +async def save_to_word(request: fastapi.Request): + output_file = None + try: + # Parse request data + try: + data = await request.json() + markdown_content = data.get('markdown_content', '') + if not markdown_content: + raise ValueError("Empty MarkDown content") + except Exception as e: + logger.error(f"Request parsing failed: {str(e)}") + raise HTTPException(status_code=400, detail=f"Invalid request: {str(e)}") + + # 创建临时Markdown文件 + temp_md = os.path.join(tempfile.gettempdir(), uuid.uuid4().hex + ".md") + with open(temp_md, "w", encoding="utf-8") as f: + f.write(markdown_content) + + # 使用pandoc转换 + output_file = os.path.join(tempfile.gettempdir(), "【理想大模型】问答.docx") + subprocess.run(['pandoc', temp_md, '-o', output_file, '--resource-path=static'], check=True) + + # 读取生成的Word文件 + with open(output_file, "rb") as f: + stream = BytesIO(f.read()) + + # 返回响应 + encoded_filename = urllib.parse.quote("【理想大模型】问答.docx") + return StreamingResponse( + stream, + media_type="application/vnd.openxmlformats-officedocument.wordprocessingml.document", + headers={"Content-Disposition": f"attachment; filename*=UTF-8''{encoded_filename}"}) + + except HTTPException: + raise + except Exception as e: + logger.error(f"Unexpected error: {str(e)}") + raise HTTPException(status_code=500, detail="Internal server error") + finally: + # 清理临时文件 + try: + if temp_md and os.path.exists(temp_md): + os.remove(temp_md) + if output_file and os.path.exists(output_file): + os.remove(output_file) + except Exception as e: + logger.warning(f"Failed to clean up temp files: {str(e)}") + + +@app.post("/api/rag", response_model=None) +async def rag(request: fastapi.Request): + data = await request.json() + query = data.get('query', '') + query_tags = data.get('tags', []) + # 调用es进行混合搜索 + search_results = EsSearchUtil.queryByEs(query, query_tags, logger) + # 构建提示词 + context = "\n".join([ + f"结果{i + 1}: {res['tags']['full_content']}" + for i, res in enumerate(search_results['text_results']) + ]) + # 添加图片识别提示 + prompt = f""" + 信息检索与回答助手 + 根据以下关于'{query}'的相关信息: + + 相关信息 + {context} + + 回答要求 + 1. 对于公式内容: + - 使用行内格式:$公式$ + - 重要公式可单独一行显示 + - 绝对不要使用代码块格式(```或''') + - 可适当使用\large增大公式字号 + - 如果内容中包含数学公式,请使用行内格式,如$f(x) = x^2$ + - 如果内容中包含多个公式,请使用行内格式,如$f(x) = x^2$ $g(x) = x^3$ + 2. 如果没有提供任何资料,那就直接拒绝回答,明确不在知识范围内。 + 3. 如果发现提供的资料与要询问的问题都不相关,就拒绝回答,明确不在知识范围内。 + 4. 如果发现提供的资料中只有部分与问题相符,那就只提取有用的相关部分,其它部分请忽略。 + 5. 对于符合问题的材料中,提供了图片的,尽量保持上下文中的图片,并尽量保持图片的清晰度。 + """ + + async def generate_response_stream(): + try: + # 流式调用大模型 + stream = await client.chat.completions.create( + model=Config.MODEL_NAME, + messages=[ + {'role': 'user', 'content': prompt} + ], + max_tokens=8000, + stream=True # 启用流式模式 + ) + # 流式返回模型生成的回复 + async for chunk in stream: + if chunk.choices[0].delta.content: + yield f"data: {json.dumps({'reply': chunk.choices[0].delta.content}, ensure_ascii=False)}\n\n" + + except Exception as e: + yield f"data: {json.dumps({'error': str(e)})}\n\n" + + return EventSourceResponse(generate_response_stream()) + + + +if __name__ == "__main__": + uvicorn.run(app, host="0.0.0.0", port=8000) diff --git a/dsSchoolBuddy/Test/__init__.py b/dsSchoolBuddy/Test/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/dsSchoolBuddy/Util/EsSearchUtil.py b/dsSchoolBuddy/Util/EsSearchUtil.py new file mode 100644 index 00000000..a9a2e7ec --- /dev/null +++ b/dsSchoolBuddy/Util/EsSearchUtil.py @@ -0,0 +1,238 @@ +import logging +import os +from logging.handlers import RotatingFileHandler +import jieba +from gensim.models import KeyedVectors + +from Config.Config import MODEL_LIMIT, MODEL_PATH, ES_CONFIG +from ElasticSearch.Utils.ElasticsearchConnectionPool import ElasticsearchConnectionPool + +# 初始化日志 +logger = logging.getLogger(__name__) +logger.setLevel(logging.INFO) +# 确保日志目录存在 +os.makedirs('Logs', exist_ok=True) +handler = RotatingFileHandler('Logs/start.log', maxBytes=1024 * 1024, backupCount=5) +handler.setFormatter(logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')) +logger.addHandler(handler) + +class EsSearchUtil: + def __init__(self, es_config): + """ + 初始化Elasticsearch搜索工具 + :param es_config: Elasticsearch配置字典,包含hosts, username, password, index_name等 + """ + self.es_config = es_config + + # 初始化连接池 + self.es_pool = ElasticsearchConnectionPool( + hosts=es_config['hosts'], + basic_auth=es_config['basic_auth'], + verify_certs=es_config.get('verify_certs', False), + max_connections=50 + ) + + # 保留直接连接用于兼容 + from elasticsearch import Elasticsearch + self.es = Elasticsearch( + hosts=es_config['hosts'], + basic_auth=es_config['basic_auth'], + verify_certs=es_config.get('verify_certs', False) + ) + + # 确保es_conn属性存在以兼容旧代码 + self.es_conn = self.es + + # 确保es_conn属性存在以兼容旧代码 + self.es_conn = self.es + + + # 加载预训练模型 + self.model = KeyedVectors.load_word2vec_format(MODEL_PATH, binary=False, limit=MODEL_LIMIT) + logger.info(f"模型加载成功,词向量维度: {self.model.vector_size}") + + # 初始化Elasticsearch连接 + self.es = Elasticsearch( + hosts=es_config['hosts'], + basic_auth=es_config['basic_auth'], + verify_certs=False + ) + self.index_name = es_config['index_name'] + + def text_to_embedding(self, text): + # 使用已加载的模型 + # 对文本分词并计算平均向量 + words = jieba.lcut(text) + vectors = [self.model[word] for word in words if word in self.model] + + if not vectors: + return [0.0] * self.model.vector_size + + # 计算平均向量 + avg_vector = [sum(dim)/len(vectors) for dim in zip(*vectors)] + return avg_vector + + def vector_search(self, query, size=10): + query_embedding = self.text_to_embedding(query) + script_query = { + "script_score": { + "query": {"match_all": {}}, + "script": { + "source": "double score = cosineSimilarity(params.query_vector, 'embedding'); return score >= 0 ? score : 0", + "params": {"query_vector": query_embedding} + } + } + } + + return self.es_conn.search( + index=self.es_config['index_name'], + query=script_query, + size=size + ) + + def text_search(self, query, size=10): + return self.es_conn.search( + index=self.es_config['index_name'], + query={"match": {"user_input": query}}, + size=size + ) + + def hybrid_search(self, query, size=10): + """ + 执行混合搜索(向量搜索+文本搜索) + :param query: 搜索查询文本 + :param size: 返回结果数量 + :return: 包含两种搜索结果的字典 + """ + vector_results = self.vector_search(query, size) + text_results = self.text_search(query, size) + + return { + 'vector_results': vector_results, + 'text_results': text_results + } + + def search(self, query, search_type='hybrid', size=10): + """ + 统一搜索接口 + :param query: 搜索查询文本 + :param search_type: 搜索类型('vector', 'text' 或 'hybrid') + :param size: 返回结果数量 + :return: 搜索结果 + """ + if search_type == 'vector': + return self.vector_search(query, size) + elif search_type == 'text': + return self.text_search(query, size) + else: + return self.hybrid_search(query, size) + + def queryByEs(query, query_tags, logger): + # 获取EsSearchUtil实例 + es_search_util = EsSearchUtil(ES_CONFIG) + + # 执行混合搜索 + es_conn = es_search_util.es_pool.get_connection() + try: + # 向量搜索 + logger.info(f"\n=== 开始执行查询 ===") + logger.info(f"原始查询文本: {query}") + logger.info(f"查询标签: {query_tags}") + + logger.info("\n=== 向量搜索阶段 ===") + logger.info("1. 文本分词和向量化处理中...") + query_embedding = es_search_util.text_to_embedding(query) + logger.info(f"2. 生成的查询向量维度: {len(query_embedding)}") + logger.info(f"3. 前3维向量值: {query_embedding[:3]}") + + logger.info("4. 正在执行Elasticsearch向量搜索...") + vector_results = es_conn.search( + index=ES_CONFIG['index_name'], + body={ + "query": { + "script_score": { + "query": { + "bool": { + "should": [ + { + "terms": { + "tags.tags": query_tags + } + } + ], + "minimum_should_match": 1 + } + }, + "script": { + "source": "double score = cosineSimilarity(params.query_vector, 'embedding'); return score >= 0 ? score : 0", + "params": {"query_vector": query_embedding} + } + } + }, + "size": 3 + } + ) + # 处理一下,判断是否到达阀值 + filtered_vector_hits = [] + vector_int = 0 + for hit in vector_results['hits']['hits']: + if hit['_score'] > 0.8: # 阀值0.8 + # 新增语义相关性检查 + if all(word in hit['_source']['user_input'] for word in jieba.lcut(query)): + logger.info(f" {vector_int + 1}. 文档ID: {hit['_id']}, 相似度分数: {hit['_score']:.2f}") + logger.info(f" 内容: {hit['_source']['user_input']}") + filtered_vector_hits.append(hit) + vector_int += 1 + + # 更新vector_results只包含通过过滤的文档 + vector_results['hits']['hits'] = filtered_vector_hits + logger.info(f"5. 向量搜索结果数量(过滤后): {vector_int}") + + # 文本精确搜索 + logger.info("\n=== 文本精确搜索阶段 ===") + logger.info("1. 正在执行Elasticsearch文本精确搜索...") + text_results = es_conn.search( + index=ES_CONFIG['index_name'], + body={ + "query": { + "bool": { + "must": [ + { + "match": { + "user_input": query + } + }, + { + "terms": { + "tags.tags": query_tags + } + } + ] + } + }, + "size": 3 + } + ) + logger.info(f"2. 文本搜索结果数量: {len(text_results['hits']['hits'])}") + + # 合并vector和text结果 + all_sources = [hit['_source'] for hit in vector_results['hits']['hits']] + \ + [hit['_source'] for hit in text_results['hits']['hits']] + + # 去重处理 + unique_sources = [] + seen_user_inputs = set() + + for source in all_sources: + if source['user_input'] not in seen_user_inputs: + seen_user_inputs.add(source['user_input']) + unique_sources.append(source) + + logger.info(f"合并后去重结果数量: {len(unique_sources)}条") + + search_results = { + "text_results": unique_sources + } + return search_results + finally: + es_search_util.es_pool.release_connection(es_conn) \ No newline at end of file diff --git a/dsSchoolBuddy/Util/__init__.py b/dsSchoolBuddy/Util/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/dsSchoolBuddy/Util/__pycache__/ALiYunUtil.cpython-310.pyc b/dsSchoolBuddy/Util/__pycache__/ALiYunUtil.cpython-310.pyc new file mode 100644 index 00000000..b26f07ed Binary files /dev/null and b/dsSchoolBuddy/Util/__pycache__/ALiYunUtil.cpython-310.pyc differ diff --git a/dsSchoolBuddy/Util/__pycache__/EsSearchUtil.cpython-310.pyc b/dsSchoolBuddy/Util/__pycache__/EsSearchUtil.cpython-310.pyc new file mode 100644 index 00000000..e7e38fcb Binary files /dev/null and b/dsSchoolBuddy/Util/__pycache__/EsSearchUtil.cpython-310.pyc differ diff --git a/dsSchoolBuddy/Util/__pycache__/SearchUtil.cpython-310.pyc b/dsSchoolBuddy/Util/__pycache__/SearchUtil.cpython-310.pyc new file mode 100644 index 00000000..ab6219cd Binary files /dev/null and b/dsSchoolBuddy/Util/__pycache__/SearchUtil.cpython-310.pyc differ diff --git a/dsSchoolBuddy/Util/__pycache__/__init__.cpython-310.pyc b/dsSchoolBuddy/Util/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 00000000..79767003 Binary files /dev/null and b/dsSchoolBuddy/Util/__pycache__/__init__.cpython-310.pyc differ