You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
|
|
|
|
from datetime import datetime
|
|
|
|
|
from elasticsearch import Elasticsearch
|
|
|
|
|
|
|
|
|
|
from Config.Config import ES_CONFIG
|
|
|
|
|
from T2_Txt2Vec import text_to_embedding
|
|
|
|
|
from Util.EsMappingUtil import create_vector_index
|
|
|
|
|
|
|
|
|
|
# Initialise the module-level Elasticsearch client.
# Exactly four settings are read from ES_CONFIG and forwarded as keyword
# arguments under the same names.
es = Elasticsearch(
    **{
        option: ES_CONFIG[option]
        for option in ("hosts", "basic_auth", "verify_certs", "ssl_show_warn")
    }
)
|
|
|
|
|
|
|
|
|
|
def save_vector(text, index_name="knowledge_base"):
    """Embed ``text`` and store it, together with its vector, in Elasticsearch.

    The indexed document has three fields: ``text`` (the input as given),
    ``vector`` (the result of ``text_to_embedding(text)``) and ``timestamp``
    (the current local time, timezone-aware).

    Args:
        text: Text to embed and store.
        index_name: Name of the target index. Defaults to ``"knowledge_base"``.

    Returns:
        The ``_id`` value from the Elasticsearch index response.

    Raises:
        Exception: Any error raised while embedding or indexing is reported
            on stdout and then re-raised unchanged.
    """
    try:
        # Embed the text
        vector = text_to_embedding(text)

        # Build the document
        doc = {
            "text": text,
            "vector": vector,
            # Use an aware datetime: a naive one serialises without a UTC
            # offset, and Elasticsearch treats offset-less dates as UTC,
            # which would shift the stored time by the local offset.
            "timestamp": datetime.now().astimezone(),
        }

        # Save to Elasticsearch
        res = es.index(index=index_name, document=doc)
        doc_id = res["_id"]
        print(f"向量化文档已保存,ID: {doc_id}")
        return doc_id
    except Exception as e:
        # Report the failure, then let the caller decide how to handle it.
        print(f"保存失败: {e}")
        raise
|
|
|
|
|
|
|
|
|
|
def save_raw_text(text, index_name="raw_texts"):
    """Store ``text`` as-is in Elasticsearch, without computing a vector.

    The indexed document has two fields: ``text`` (the input as given) and
    ``timestamp`` (the current local time, timezone-aware).

    Args:
        text: Text to store.
        index_name: Name of the target index. Defaults to ``"raw_texts"``.

    Returns:
        The ``_id`` value from the Elasticsearch index response.

    Raises:
        Exception: Any error raised while indexing is reported on stdout and
            then re-raised unchanged.
    """
    try:
        # Build the document
        doc = {
            "text": text,
            # Use an aware datetime: a naive one serialises without a UTC
            # offset, and Elasticsearch treats offset-less dates as UTC,
            # which would shift the stored time by the local offset.
            "timestamp": datetime.now().astimezone(),
        }

        # Save to Elasticsearch
        res = es.index(index=index_name, document=doc)
        doc_id = res["_id"]
        print(f"原始文本已保存,ID: {doc_id}")
        return doc_id
    except Exception as e:
        # Report the failure, then let the caller decide how to handle it.
        print(f"保存失败: {e}")
        raise
|
|
|
|
|
|
|
|
|
|
# Usage example
if __name__ == "__main__":
    # Create the vector index (200 dimensions) before saving anything.
    create_vector_index(dims=200)

    # Sample text
    sample_text = "如何更换支付宝绑定银行卡"

    # Save the vectorised document first, then the raw text.
    for store in (save_vector, save_raw_text):
        store(sample_text)
|