diff --git a/dsRag/Config/Config.py b/dsRag/Config/Config.py index fec10fc7..7c522ab4 100644 --- a/dsRag/Config/Config.py +++ b/dsRag/Config/Config.py @@ -5,4 +5,6 @@ ES_CONFIG = { "verify_certs": False, "ssl_show_warn": False, "default_index": "knowledge_base" -} \ No newline at end of file +} + +WORD2VEC_MODEL_PATH = r"D:\Tencent_AILab_ChineseEmbedding\Tencent_AILab_ChineseEmbedding.txt" diff --git a/dsRag/Config/__pycache__/Config.cpython-310.pyc b/dsRag/Config/__pycache__/Config.cpython-310.pyc index 79c3fc90..b01f98ae 100644 Binary files a/dsRag/Config/__pycache__/Config.cpython-310.pyc and b/dsRag/Config/__pycache__/Config.cpython-310.pyc differ diff --git a/dsRag/T2_Txt2Vec.py b/dsRag/T2_Txt2Vec.py index 9e45c34f..98454d96 100644 --- a/dsRag/T2_Txt2Vec.py +++ b/dsRag/T2_Txt2Vec.py @@ -5,10 +5,16 @@ pip install jieba ''' from Util.EmbeddingUtil import * +from Config.Config import * + # 在加载模型后添加自定义词典 jieba.load_userdict("./Dict/custom_dict.txt") # 文件中包含: 花呗 sentences = ['如何更换花呗绑定银行卡', '花呗更改绑定银行卡'] for sentence in sentences: - x = text_to_embedding(sentence) \ No newline at end of file + x = text_to_embedding(sentence) + +# 词汇数+向量维度 +with open(WORD2VEC_MODEL_PATH, 'r', encoding='utf-8') as f: + print("词汇数和向量维度:" + f.readline()) # 第一行为词汇数和向量维度,在这里不予展示 diff --git a/dsRag/T4_VectorSave.py b/dsRag/T4_VectorSave.py index e66fb5e9..4d49baf2 100644 --- a/dsRag/T4_VectorSave.py +++ b/dsRag/T4_VectorSave.py @@ -1,8 +1,9 @@ -from Config.Config import ES_CONFIG +import datetime + from elasticsearch import Elasticsearch + +from Config.Config import ES_CONFIG from T2_Txt2Vec import text_to_embedding -import datetime -import warnings # 初始化ES连接 es = Elasticsearch( @@ -12,16 +13,15 @@ es = Elasticsearch( ssl_show_warn=ES_CONFIG["ssl_show_warn"] ) -# 修改create_vector_index和save_to_es函数中使用ES_CONFIG["default_index"] def create_vector_index(index_name="knowledge_base"): - """创建带有向量字段的索引""" + """创建带有向量字段的索引(适配200维腾讯词向量)""" mapping = { "mappings": { "properties": { "text": {"type": "text", "analyzer": "ik_max_word"}, "vector": { "type": "dense_vector", - "dims": 768, # 需与text2vec模型维度一致 + "dims": 200, # 修改为腾讯词向量实际维度 "index": True, "similarity": "cosine" }, @@ -34,7 +34,7 @@ def create_vector_index(index_name="knowledge_base"): if es.indices.exists(index=index_name): es.indices.delete(index=index_name) es.indices.create(index=index_name, body=mapping) - print(f"索引 {index_name} 创建成功") + print(f"索引 {index_name} 创建成功(200维)") except Exception as e: print(f"创建索引失败: {str(e)}") raise diff --git a/dsRag/Util/EmbeddingUtil.py b/dsRag/Util/EmbeddingUtil.py index fde367c0..8a0031b4 100644 --- a/dsRag/Util/EmbeddingUtil.py +++ b/dsRag/Util/EmbeddingUtil.py @@ -1,14 +1,13 @@ import logging import jieba from gensim.models import KeyedVectors - +from Config.Config import * # 配置日志 logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s") logger = logging.getLogger(__name__) # 初始化 Word2Vec 模型 -model_path = r"D:\Tencent_AILab_ChineseEmbedding\Tencent_AILab_ChineseEmbedding.txt" -model = KeyedVectors.load_word2vec_format(model_path, binary=False, limit=10000) +model = KeyedVectors.load_word2vec_format(WORD2VEC_MODEL_PATH, binary=False, limit=10000) logger.info(f"模型加载成功,词向量维度: {model.vector_size}") # 将文本转换为嵌入向量 diff --git a/dsRag/Util/__pycache__/EmbeddingUtil.cpython-310.pyc b/dsRag/Util/__pycache__/EmbeddingUtil.cpython-310.pyc index a42cd0ed..899fae43 100644 Binary files a/dsRag/Util/__pycache__/EmbeddingUtil.cpython-310.pyc and b/dsRag/Util/__pycache__/EmbeddingUtil.cpython-310.pyc differ