main
HuangHai 1 month ago
parent 9adf37e76d
commit bd28335d5d

@ -5,4 +5,6 @@ ES_CONFIG = {
"verify_certs": False,
"ssl_show_warn": False,
"default_index": "knowledge_base"
}
}
WORD2VEC_MODEL_PATH = r"D:\Tencent_AILab_ChineseEmbedding\Tencent_AILab_ChineseEmbedding.txt"

@ -5,10 +5,16 @@ pip install jieba
'''
from Util.EmbeddingUtil import *
from Config.Config import *
# Register the custom jieba dictionary (done after the model has been loaded)
jieba.load_userdict("./Dict/custom_dict.txt") # the dictionary file contains the entry: 花呗
# Two sample sentences that are each converted to an embedding below
sentences = ['如何更换花呗绑定银行卡', '花呗更改绑定银行卡']
for sentence in sentences:
x = text_to_embedding(sentence)
# NOTE(review): the same call appears twice in a row — this looks like the old and
# new side of one diff line; confirm that only a single call is intended.
x = text_to_embedding(sentence)
# Vocabulary size + vector dimension
with open(WORD2VEC_MODEL_PATH, 'r', encoding='utf-8') as f:
print("词汇数和向量维度:" + f.readline()) # prints the first line of the file, which is labelled here as vocabulary size and vector dimension

@ -1,8 +1,9 @@
from Config.Config import ES_CONFIG
import datetime
from elasticsearch import Elasticsearch
from Config.Config import ES_CONFIG
from T2_Txt2Vec import text_to_embedding
import datetime
import warnings
# 初始化ES连接
es = Elasticsearch(
@ -12,16 +13,15 @@ es = Elasticsearch(
ssl_show_warn=ES_CONFIG["ssl_show_warn"]
)
# TODO: make create_vector_index and save_to_es use ES_CONFIG["default_index"] instead of a hard-coded index name
def create_vector_index(index_name="knowledge_base"):
"""创建带有向量字段的索引"""
"""创建带有向量字段的索引(适配200维腾讯词向量)"""
mapping = {
"mappings": {
"properties": {
"text": {"type": "text", "analyzer": "ik_max_word"},
"vector": {
"type": "dense_vector",
"dims": 768, # 需与text2vec模型维度一致
"dims": 200, # 修改为腾讯词向量实际维度
"index": True,
"similarity": "cosine"
},
@ -34,7 +34,7 @@ def create_vector_index(index_name="knowledge_base"):
if es.indices.exists(index=index_name):
es.indices.delete(index=index_name)
es.indices.create(index=index_name, body=mapping)
print(f"索引 {index_name} 创建成功")
print(f"索引 {index_name} 创建成功(200维)")
except Exception as e:
print(f"创建索引失败: {str(e)}")
raise

@ -1,14 +1,13 @@
import logging
import jieba
from gensim.models import KeyedVectors
from Config.Config import *
# Configure logging: INFO level, with timestamp, level name and message in each record
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
logger = logging.getLogger(__name__)
# Initialise the Word2Vec model at import time
model_path = r"D:\Tencent_AILab_ChineseEmbedding\Tencent_AILab_ChineseEmbedding.txt"
# Load in non-binary mode; limit=10000 presumably caps how many vectors are read — confirm against gensim docs
model = KeyedVectors.load_word2vec_format(model_path, binary=False, limit=10000)
# NOTE(review): `model` is assigned a second time below, now from WORD2VEC_MODEL_PATH
# (expected to come from the `Config.Config` star import). This looks like the old and
# new side of one diff line — confirm that only one load is intended.
model = KeyedVectors.load_word2vec_format(WORD2VEC_MODEL_PATH, binary=False, limit=10000)
# Log the dimensionality of the loaded vectors
logger.info(f"模型加载成功,词向量维度: {model.vector_size}")
# 将文本转换为嵌入向量

Loading…
Cancel
Save