main
HuangHai 1 month ago
parent e1d2472eba
commit 90307fd7dd

@ -45,3 +45,51 @@ if __name__ == "__main__":
# Create a new raw-text index
manage_index("create", "text")
# Update the mapping for the knowledge_base index
knowledge_base_mapping = {
"properties": {
# Field added to knowledge_base_mapping: analyzed Chinese text plus a keyword sub-field
"content": {
"type": "text",
"analyzer": "ik_max_word",
"search_analyzer": "ik_smart",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 8192  # can be raised to larger values such as 1024/2048
}
}
},
# NOTE(review): original comment said "added to raw_texts_mapping", but this
# field is declared inside knowledge_base_mapping — confirm which index it belongs to
"raw_text": {
"type": "text",
"analyzer": "ik_max_word",
"fielddata": True  # allow aggregations on long text (fielddata is memory-heavy)
},
"vector": {
"type": "dense_vector",
"dims": 200,
"index": True,
"similarity": "cosine"
},
"timestamp": {
"type": "date",
"format": "strict_date_optional_time||epoch_millis"
}
}
}
# Update the mapping for the raw_texts index
raw_texts_mapping = {
"properties": {
"raw_text": {
"type": "text",
"analyzer": "ik_max_word"
},
"timestamp": {
"type": "date",
"format": "strict_date_optional_time||epoch_millis"
}
}
}

@ -11,7 +11,9 @@ import numpy as np
def split_sentences(text):
    """Split *text* into sentences.

    Splits on fullwidth sentence-ending punctuation (。!?;) and newlines,
    then strips whitespace and drops empty fragments.

    Args:
        text: Input string; may be empty.

    Returns:
        List of non-empty, stripped sentence strings.
    """
    # The original computed `sentences` twice with the same re.split and built
    # an unused `paragraphs` list; both redundancies are removed here.
    sentences = re.split(r'[。!?;\n]', text)
    return [s.strip() for s in sentences if s.strip()]
def save_to_es(text):

@ -5,6 +5,7 @@ from elasticsearch import Elasticsearch
from Util.EmbeddingUtil import text_to_embedding
from openai import OpenAI
from Config import Config
# 初始化ES连接
es = Elasticsearch(
hosts=Config.ES_CONFIG['hosts'],

Loading…
Cancel
Save