From 90307fd7ddb463c70b177916a02f50e3638e07a3 Mon Sep 17 00:00:00 2001
From: HuangHai <10402852@qq.com>
Date: Mon, 23 Jun 2025 20:38:38 +0800
Subject: [PATCH] 'commit'

---
 dsRag/T4_ManageMapping.py | 48 +++++++++++++++++++++++++++++++++++++++
 dsRag/T6_Train.py         |  4 +++-
 dsRag/T9_RAG.py           |  1 +
 3 files changed, 52 insertions(+), 1 deletion(-)

diff --git a/dsRag/T4_ManageMapping.py b/dsRag/T4_ManageMapping.py
index f7015797..654a4fce 100644
--- a/dsRag/T4_ManageMapping.py
+++ b/dsRag/T4_ManageMapping.py
@@ -45,3 +45,51 @@ if __name__ == "__main__":
 
     # 创建新的原始文本索引
     manage_index("create", "text")
+
+# 修改knowledge_base索引的mapping
+knowledge_base_mapping = {
+    "properties": {
+        # 在knowledge_base_mapping中添加
+        "content": {
+            "type": "text",
+            "analyzer": "ik_max_word",
+            "search_analyzer": "ik_smart",
+            "fields": {
+                "keyword": {
+                    "type": "keyword",
+                    "ignore_above": 8192 # 可以设置为1024/2048等更大值
+                }
+            }
+        },
+        # 在raw_texts_mapping中添加
+        "raw_text": {
+            "type": "text",
+            "analyzer": "ik_max_word",
+            "fielddata": True # 允许对长文本进行聚合
+        },
+        "vector": {
+            "type": "dense_vector",
+            "dims": 200,
+            "index": True,
+            "similarity": "cosine"
+        },
+        "timestamp": {
+            "type": "date",
+            "format": "strict_date_optional_time||epoch_millis"
+        }
+    }
+}
+
+# 修改raw_texts索引的mapping
+raw_texts_mapping = {
+    "properties": {
+        "raw_text": {
+            "type": "text",
+            "analyzer": "ik_max_word"
+        },
+        "timestamp": {
+            "type": "date",
+            "format": "strict_date_optional_time||epoch_millis"
+        }
+    }
+}
diff --git a/dsRag/T6_Train.py b/dsRag/T6_Train.py
index 86d67a7d..d79767e0 100644
--- a/dsRag/T6_Train.py
+++ b/dsRag/T6_Train.py
@@ -11,7 +11,9 @@ import numpy as np
 
 def split_sentences(text):
     """按句分割文本"""
-    sentences = re.split(r'[。!?;\n]', text)
+    paragraphs = [p.strip() for p in text.split('\n\n') if p.strip()]
+    # 使用jieba进行分句
+    sentences = re.split(r'[。!?;\n]', text) # 添加这行定义sentences
     return [s.strip() for s in sentences if s.strip()]
 
 def save_to_es(text):
diff --git a/dsRag/T9_RAG.py b/dsRag/T9_RAG.py
index 1d14c448..3263a9ef 100644
--- a/dsRag/T9_RAG.py
+++ b/dsRag/T9_RAG.py
@@ -5,6 +5,7 @@ from elasticsearch import Elasticsearch
 from Util.EmbeddingUtil import text_to_embedding
 from openai import OpenAI
 from Config import Config
+
 # 初始化ES连接
 es = Elasticsearch(
     hosts=Config.ES_CONFIG['hosts'],