From 90307fd7ddb463c70b177916a02f50e3638e07a3 Mon Sep 17 00:00:00 2001
From: HuangHai <10402852@qq.com>
Date: Mon, 23 Jun 2025 20:38:38 +0800
Subject: [PATCH] Add ES index mappings to T4_ManageMapping; tweak T6_Train and T9_RAG

---
 dsRag/T4_ManageMapping.py | 48 +++++++++++++++++++++++++++++++++++++++
 dsRag/T6_Train.py         |  4 +++-
 dsRag/T9_RAG.py           |  1 +
 3 files changed, 52 insertions(+), 1 deletion(-)

diff --git a/dsRag/T4_ManageMapping.py b/dsRag/T4_ManageMapping.py
index f7015797..654a4fce 100644
--- a/dsRag/T4_ManageMapping.py
+++ b/dsRag/T4_ManageMapping.py
@@ -45,3 +45,51 @@ if __name__ == "__main__":
     
     # 创建新的原始文本索引
     manage_index("create", "text")
+
+# Mapping for the knowledge_base index. NOTE(review): defined at module level after the __main__ block — confirm something consumes it.
+knowledge_base_mapping = {
+    "properties": {
+        # Added to knowledge_base_mapping: text analyzed with ik_max_word (ik_smart at search time), plus a keyword sub-field.
+        "content": {
+            "type": "text",
+            "analyzer": "ik_max_word",
+            "search_analyzer": "ik_smart",
+            "fields": {
+                "keyword": {
+                    "type": "keyword",
+                    "ignore_above": 8192  # original note suggested 1024/2048 or larger; NOTE(review): ES docs suggest <= 8191 (32766 / 4) for non-ASCII text — confirm
+                }
+            }
+        },
+        # Original note said "add to raw_texts_mapping". NOTE(review): this field nevertheless lives in knowledge_base_mapping — confirm placement.
+        "raw_text": {
+            "type": "text",
+            "analyzer": "ik_max_word",
+            "fielddata": True  # allow aggregations on this long text field
+        },
+        "vector": {
+            "type": "dense_vector",
+            "dims": 200,  # presumably must equal the size of the embeddings written to this field — confirm
+            "index": True,
+            "similarity": "cosine"
+        },
+        "timestamp": {
+            "type": "date",
+            "format": "strict_date_optional_time||epoch_millis"  # either listed format is accepted
+        }
+    }
+}
+
+# Mapping for the raw_texts index: an ik_max_word-analyzed raw_text field plus a timestamp.
+raw_texts_mapping = {
+    "properties": {
+        "raw_text": {
+            "type": "text",
+            "analyzer": "ik_max_word"  # no fielddata here, unlike raw_text in knowledge_base_mapping
+        },
+        "timestamp": {
+            "type": "date",
+            "format": "strict_date_optional_time||epoch_millis"  # either listed format is accepted
+        }
+    }
+}
diff --git a/dsRag/T6_Train.py b/dsRag/T6_Train.py
index 86d67a7d..d79767e0 100644
--- a/dsRag/T6_Train.py
+++ b/dsRag/T6_Train.py
@@ -11,7 +11,9 @@ import numpy as np
 
 def split_sentences(text):
     """按句分割文本"""
-    sentences = re.split(r'[。！？；\n]', text)
+    # Break `text` at Chinese sentence terminators (。！？；) or newlines; the
+    # return below strips each piece and drops empty ones. Regex only, no jieba.
+    sentences = re.split(r'[。！？；\n]', text)
     return [s.strip() for s in sentences if s.strip()]
 
 def save_to_es(text):
diff --git a/dsRag/T9_RAG.py b/dsRag/T9_RAG.py
index 1d14c448..3263a9ef 100644
--- a/dsRag/T9_RAG.py
+++ b/dsRag/T9_RAG.py
@@ -5,6 +5,7 @@ from elasticsearch import Elasticsearch
 from Util.EmbeddingUtil import text_to_embedding
 from openai import OpenAI
 from Config import Config
+
 # 初始化ES连接
 es = Elasticsearch(
     hosts=Config.ES_CONFIG['hosts'],