diff --git a/dsRag/ElasticSearch/T3_InsertData.py b/dsRag/ElasticSearch/T3_InsertData.py index af689afe..a1023675 100644 --- a/dsRag/ElasticSearch/T3_InsertData.py +++ b/dsRag/ElasticSearch/T3_InsertData.py @@ -6,9 +6,6 @@ import jieba import os import time -# 需要进行标记的标签 -selectedTags = ["MATH_DATA_1", "小学数学"] - # 1. 加载预训练的 Word2Vec 模型 model_path = MS_MODEL_PATH model = KeyedVectors.load_word2vec_format(model_path, binary=False, limit=MS_MODEL_LIMIT) @@ -32,7 +29,7 @@ es = Elasticsearch( ) # 3. 处理processed_chunks目录下的所有文件 -txt_dir = os.path.join(os.path.dirname(os.path.dirname(os.path.abspath(__file__))), 'Txt', 'processed_chunks') +txt_dir = os.path.join(os.path.dirname(os.path.dirname(os.path.abspath(__file__))), 'Txt') for filename in os.listdir(txt_dir): if filename.endswith('.txt'): @@ -51,6 +48,9 @@ for filename in os.listdir(txt_dir): # 4. 获取当前时间和会话ID timestamp = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) + # 需要进行标记的标签 + x = filename.split("_") + selectedTags = [x[0] + "_" + x[1]] tags = {"tags": selectedTags, "full_content": full_content} # 添加完整内容 # 5. 将第一行文本转换为嵌入向量 @@ -66,4 +66,4 @@ for filename in os.listdir(txt_dir): es.index(index=ES_CONFIG['index_name'], document=doc) print(f"文件 {filename} 数据插入成功") -print("所有文件处理完成") \ No newline at end of file +print("所有文件处理完成")