main
HuangHai 1 month ago
parent e1d2472eba
commit 90307fd7dd

@ -45,3 +45,51 @@ if __name__ == "__main__":
# Create a new raw-text index
manage_index("create", "text")
# Update the mapping for the knowledge_base index
knowledge_base_mapping = {
"properties": {
# Field added to knowledge_base_mapping: analyzed Chinese text plus a keyword sub-field
"content": {
"type": "text",
"analyzer": "ik_max_word",
"search_analyzer": "ik_smart",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 8192  # can be raised to larger values such as 1024/2048
}
}
},
# NOTE(review): original comment said "added to raw_texts_mapping", but this
# field is declared inside knowledge_base_mapping — confirm which index it belongs to
"raw_text": {
"type": "text",
"analyzer": "ik_max_word",
"fielddata": True  # allow aggregations on long text (fielddata is memory-heavy)
},
"vector": {
"type": "dense_vector",
"dims": 200,
"index": True,
"similarity": "cosine"
},
"timestamp": {
"type": "date",
"format": "strict_date_optional_time||epoch_millis"
}
}
}
# Update the mapping for the raw_texts index
raw_texts_mapping = {
"properties": {
"raw_text": {
"type": "text",
"analyzer": "ik_max_word"
},
"timestamp": {
"type": "date",
"format": "strict_date_optional_time||epoch_millis"
}
}
}

@ -11,7 +11,9 @@ import numpy as np
def split_sentences(text):
    """Split *text* into sentences.

    Splits on fullwidth sentence-ending punctuation (。!?;) and newlines,
    then strips whitespace and drops empty fragments.

    Args:
        text: Input string; may be empty.

    Returns:
        List of non-empty, stripped sentence strings.
    """
    # The original computed `sentences` twice with the same re.split and built
    # an unused `paragraphs` list; both redundancies are removed here.
    sentences = re.split(r'[。!?;\n]', text)
    return [s.strip() for s in sentences if s.strip()]
def save_to_es(text):

@ -5,6 +5,7 @@ from elasticsearch import Elasticsearch
from Util.EmbeddingUtil import text_to_embedding
from openai import OpenAI
from Config import Config
# 初始化ES连接
es = Elasticsearch(
hosts=Config.ES_CONFIG['hosts'],

Loading…
Cancel
Save