'commit'

4 weeks ago · 049172089d
parent ad732b1117
commit 049172089d
1 changed files with 5 additions and 5 deletions
--- a/dsRag/ElasticSearch/T3_InsertData.py
+++ b/dsRag/ElasticSearch/T3_InsertData.py
@@ -6,9 +6,6 @@ import jieba
 import os
 import time

-# 需要进行标记的标签
-selectedTags = ["MATH_DATA_1", "小学数学"]
-
 # 1. 加载预训练的 Word2Vec 模型
 model_path = MS_MODEL_PATH
 model = KeyedVectors.load_word2vec_format(model_path, binary=False, limit=MS_MODEL_LIMIT)
@@ -32,7 +29,7 @@ es = Elasticsearch(
 )

 # 3. 处理processed_chunks目录下的所有文件
-txt_dir = os.path.join(os.path.dirname(os.path.dirname(os.path.abspath(__file__))), 'Txt', 'processed_chunks')
+txt_dir = os.path.join(os.path.dirname(os.path.dirname(os.path.abspath(__file__))), 'Txt')

 for filename in os.listdir(txt_dir):
    if filename.endswith('.txt'):
@@ -51,6 +48,9 @@ for filename in os.listdir(txt_dir):

            # 4. 获取当前时间和会话ID
            timestamp = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
+            # 需要进行标记的标签
+            x = filename.split("_")
+            selectedTags = [x[0] + "_" + x[1]]
            tags = {"tags": selectedTags, "full_content": full_content}  # 添加完整内容

            # 5. 将第一行文本转换为嵌入向量
@@ -66,4 +66,4 @@ for filename in os.listdir(txt_dir):
            es.index(index=ES_CONFIG['index_name'], document=doc)
            print(f"文件 {filename} 数据插入成功")

-print("所有文件处理完成")
+print("所有文件处理完成")