@@ -1,6 +1,17 @@
import logging
from datetime import datetime
from elasticsearch import Elasticsearch

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler('sentence_save.log'),
        logging.StreamHandler()
    ]
)

from Config.Config import ES_CONFIG
from T2_Txt2Vec import text_to_embedding
@@ -38,22 +49,20 @@ def check_text_exists(text, index_name="raw_texts"):
def save_vector(text, index_name="knowledge_base"):
    """Vectorize the text and save it to ES."""
    try:
        # Vectorize the text
        logging.info(f"Starting to vectorize text: {text[:50]}...")
        vector = text_to_embedding(text)

        # Prepare the document
        doc = {
            "text": text,
            "vector": vector,
            "timestamp": datetime.now()
        }

        # Save to ES
        res = es.index(index=index_name, document=doc)
        print(f"Vectorized document saved, ID: {res['_id']}")
        logging.info(f"Successfully saved vectorized document to {index_name}, ID: {res['_id']}, text length: {len(text)}")
        return res['_id']
    except Exception as e:
        print(f"Save failed: {str(e)}")
        logging.error(f"Vector save failed: {str(e)}", exc_info=True)
        raise


def save_raw_text(text, index_name="raw_texts"):