From 8e7f71181cddc57d0ff3fdac216b800b4a0e1a3e Mon Sep 17 00:00:00 2001 From: HuangHai <10402852@qq.com> Date: Fri, 27 Jun 2025 07:50:31 +0800 Subject: [PATCH] 'commit' --- dsRag/Milvus/X4_InsertData.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/dsRag/Milvus/X4_InsertData.py b/dsRag/Milvus/X4_InsertData.py index ba0e832f..0420019b 100644 --- a/dsRag/Milvus/X4_InsertData.py +++ b/dsRag/Milvus/X4_InsertData.py @@ -6,6 +6,9 @@ import jieba import os import time +# 需要进行标记的标签 +selectedTags = ["MATH_DATA_2", "小学数学"] + # 1. 加载预训练的 Word2Vec 模型 model_path = MS_MODEL_PATH model = KeyedVectors.load_word2vec_format(model_path, binary=False, limit=MS_MODEL_LIMIT) @@ -40,16 +43,16 @@ for filename in os.listdir(txt_dir): first_line = f.readline().strip() # 读取全部内容用于后续查询 full_content = first_line + '\n' + f.read() - + if not first_line: print(f"跳过空文件: {filename}") continue - + print(f"正在处理文件: {filename}") # 5. 获取当前时间和会话ID timestamp = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) - tags = {"tags": ["MATH_DATA_1", "小学数学"], "full_content": full_content} # 添加完整内容 + tags = {"tags": selectedTags, "full_content": full_content} # 添加完整内容 # 6. 将第一行文本转换为嵌入向量 embedding = text_to_embedding(first_line)