"""Index the first line of every .txt file into Elasticsearch with an embedding.

For each ``.txt`` file in the ``Txt`` directory (one level above this file's
directory) the script averages the Word2Vec vectors of the first line's
tokens and stores that embedding, the first line, the full file content and
a tag derived from the filename as one Elasticsearch document.
"""
import warnings
from Config import Config
# The wildcard import is kept ahead of the explicit imports below (as in the
# original) so that, on any name clash, the explicit imports win.
from Config.Config import *
from elasticsearch import Elasticsearch
from gensim.models import KeyedVectors
import jieba
import os
import time

# Suppress warnings about unverified HTTPS connections.
warnings.filterwarnings('ignore', message='Connecting to .* using TLS with verify_certs=False is insecure')
warnings.filterwarnings('ignore', message='Unverified HTTPS request is being made to host')

# 1. Load the pre-trained Word2Vec model (text format, capped at MODEL_LIMIT).
model_path = MODEL_PATH
model = KeyedVectors.load_word2vec_format(model_path, binary=False, limit=MODEL_LIMIT)
print(f"模型加载成功,词向量维度: {model.vector_size}")


def text_to_embedding(text):
    """Convert *text* into a single embedding vector.

    The text is tokenized with ``jieba.lcut``; tokens missing from ``model``
    are ignored and the vectors of the remaining tokens are averaged.

    Args:
        text: The string to embed.

    Returns:
        The average of the vectors of all in-vocabulary tokens (presumably a
        NumPy array, as produced by gensim lookups), or a plain list of
        ``model.vector_size`` zeros when no token is in the vocabulary.
    """
    words = jieba.lcut(text)
    embeddings = [model[word] for word in words if word in model]
    if embeddings:
        return sum(embeddings) / len(embeddings)
    # NOTE(review): if the target index maps `embedding` as a dense_vector
    # with cosine similarity, an all-zero vector may be rejected - confirm
    # against the index mapping.
    return [0.0] * model.vector_size


def _tag_from_filename(filename):
    """Return ``"<first>_<second>"`` built from *filename*, or ``None``.

    The filename is split on ``"_"`` and the first two tokens are re-joined.
    ``None`` is returned when the name contains no underscore, so the caller
    can skip the file instead of failing with an ``IndexError``.

    NOTE(review): the split runs on the full filename, so for a name with
    exactly one underscore the second token still carries the extension
    (``"a_b.txt"`` -> ``"a_b.txt"``). Kept as in the original - confirm intent.
    """
    parts = filename.split("_")
    if len(parts) < 2:
        return None
    return parts[0] + "_" + parts[1]


# 2. Initialize the Elasticsearch connection.
# NOTE(review): verify_certs=False disables TLS certificate verification;
# confirm this is only used against trusted clusters.
es = Elasticsearch(
    hosts=Config.ES_CONFIG['hosts'],
    basic_auth=Config.ES_CONFIG['basic_auth'],
    verify_certs=False
)

# 3. Process every .txt file in the `Txt` directory, which sits one level
#    above the directory containing this file.
txt_dir = os.path.join(os.path.dirname(os.path.dirname(os.path.abspath(__file__))), 'Txt')

for filename in os.listdir(txt_dir):
    if not filename.endswith('.txt'):
        continue

    filepath = os.path.join(txt_dir, filename)
    with open(filepath, 'r', encoding='utf-8') as f:
        # Only the first line is used to compute the embedding.
        first_line = f.readline().strip()
        # The whole content is stored alongside it for later retrieval.
        full_content = first_line + '\n' + f.read()

    # A file whose first line is blank is treated as empty and skipped.
    if not first_line:
        print(f"跳过空文件: {filename}")
        continue

    # The tag needs two underscore-separated tokens; skip files that lack
    # them rather than aborting the whole batch with an IndexError.
    tag = _tag_from_filename(filename)
    if tag is None:
        print(f"跳过文件名缺少下划线的文件: {filename}")
        continue

    print(f"正在处理文件: {filename}")

    # 4. Record the current local time for this document.
    timestamp = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())

    # Tag payload: the filename-derived tag plus the full file content.
    selected_tags = [tag]
    tags = {"tags": selected_tags, "full_content": full_content}

    # 5. Convert the first line into an embedding vector.
    embedding = text_to_embedding(first_line)

    # 6. Insert the document into Elasticsearch.
    doc = {
        'tags': tags,
        'user_input': first_line,
        'timestamp': timestamp,
        'embedding': embedding
    }
    es.index(index=ES_CONFIG['index_name'], document=doc)
    print(f"文件 {filename} 数据插入成功")

print("所有文件处理完成")