# QingLong/AI/WxMini/Utils/EmbeddingUtil.py

import logging
import jieba
from gensim.models import KeyedVectors

# Configure logging.
# NOTE(review): basicConfig runs at import time and configures the root
# logger for the whole process — confirm that is intended for a utility
# module rather than being left to the application entry point.
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
logger = logging.getLogger(__name__)

# Initialise the Word2Vec model.
# The vectors are read from a text-format (binary=False) word2vec file at a
# hard-coded absolute Windows path. limit=10000 caps how many word vectors
# are read from the file, so any word beyond that cap is absent from `model`.
# Loading happens at import time, so importing this module fails if the file
# is missing.
model_path = r"D:\Tencent_AILab_ChineseEmbedding\Tencent_AILab_ChineseEmbedding.txt"
model = KeyedVectors.load_word2vec_format(model_path, binary=False, limit=10000)
logger.info(f"模型加载成功，词向量维度: {model.vector_size}")

# Convert a text into an embedding vector.
def text_to_embedding(text):
    """Return the average word vector of *text*.

    The text is segmented with jieba; every token present in the module-level
    ``model`` contributes its vector, and the element-wise mean of those
    vectors is returned.

    Args:
        text: The text to embed. Empty or ``None`` text is treated as
            containing no words.

    Returns:
        A 1-D numpy array of length ``model.vector_size``. When no token is
        found in the model's vocabulary, a zero vector of the same length is
        returned, so callers always receive the same type.
    """
    # Local import keeps this block self-contained; numpy is already a
    # dependency of gensim, so no new package is introduced.
    import numpy as np

    # Guard against empty/None input instead of handing it to jieba.
    words = jieba.lcut(text) if text else []  # segment with jieba
    logger.info(f"文本: {text}, 分词结果: {words}")
    embeddings = [model[word] for word in words if word in model]
    logger.info(f"有效词向量数量: {len(embeddings)}")
    if embeddings:
        avg_embedding = sum(embeddings) / len(embeddings)
        logger.info(f"生成的平均向量: {avg_embedding[:5]}...")  # log the first 5 dims
        return avg_embedding

    logger.warning("未找到有效词，返回零向量")
    # Return an ndarray (not a list) so both paths yield the same type;
    # float32 matches gensim's default dtype for loaded word2vec vectors.
    return np.zeros(model.vector_size, dtype=np.float32)