diff --git a/dsRag/T2_Txt2Vec.py b/dsRag/T2_Txt2Vec.py
index fc99afa3..77e28f7c 100644
--- a/dsRag/T2_Txt2Vec.py
+++ b/dsRag/T2_Txt2Vec.py
@@ -2,19 +2,15 @@
 # 安装向量化的包
 (# 断开VPN后执行安装包)
 conda activate rag
 pip install text2vec torch torchvision torchaudio
+
+# Install the full version (with extra dependencies)
+pip install gensim[complete]
 '''
+from Util.EmbeddingUtil import text_to_embedding
 
-from text2vec import SentenceModel
 sentences = ['如何更换花呗绑定银行卡', '花呗更改绑定银行卡']
 
-'''
-- 自动下载预训练模型到缓存目录(通常是 ~/.cache/huggingface/hub )
-- 后续运行会直接使用缓存
-如果下载慢,可以设置镜像源:
-import os
-os.environ['HF_ENDPOINT'] = 'https://hf-mirror.com'
-'''
-model = SentenceModel('shibing624/text2vec-base-chinese')
-embeddings = model.encode(sentences)
-print(embeddings)
\ No newline at end of file
+for sentence in sentences:
+    x = text_to_embedding(sentence)
+    print(x)
diff --git a/dsRag/Util/EmbeddingUtil.py b/dsRag/Util/EmbeddingUtil.py
new file mode 100644
index 00000000..fde367c0
--- /dev/null
+++ b/dsRag/Util/EmbeddingUtil.py
@@ -0,0 +1,35 @@
+import logging
+
+import jieba
+import numpy as np
+from gensim.models import KeyedVectors
+
+# Configure logging.
+logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
+logger = logging.getLogger(__name__)
+
+# Load the word vectors once, at import time.
+# NOTE(review): model_path is machine-specific, and limit=10000 reads only the
+# first 10000 vectors from the file; every other word is out of vocabulary.
+model_path = r"D:\Tencent_AILab_ChineseEmbedding\Tencent_AILab_ChineseEmbedding.txt"
+model = KeyedVectors.load_word2vec_format(model_path, binary=False, limit=10000)
+logger.info(f"模型加载成功,词向量维度: {model.vector_size}")
+
+
+def text_to_embedding(text):
+    """Return the average word vector of *text* as a NumPy array.
+
+    The text is segmented with jieba and words missing from the loaded model
+    are skipped. If no word is in the vocabulary, a zero vector of length
+    ``model.vector_size`` is returned, so both paths yield an ndarray.
+    """
+    words = jieba.lcut(text)
+    logger.info(f"文本: {text}, 分词结果: {words}")
+    embeddings = [model[word] for word in words if word in model]
+    logger.info(f"有效词向量数量: {len(embeddings)}")
+    if not embeddings:
+        logger.warning("未找到有效词,返回零向量")
+        return np.zeros(model.vector_size, dtype=model.vectors.dtype)
+    avg_embedding = sum(embeddings) / len(embeddings)
+    logger.info(f"生成的平均向量: {avg_embedding[:5]}...")  # first 5 dimensions only
+    return avg_embedding
diff --git a/dsRag/Util/__init__.py b/dsRag/Util/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/dsRag/Util/__pycache__/EmbeddingUtil.cpython-310.pyc b/dsRag/Util/__pycache__/EmbeddingUtil.cpython-310.pyc
new file mode 100644
index 00000000..c8bd0082
Binary files /dev/null and b/dsRag/Util/__pycache__/EmbeddingUtil.cpython-310.pyc differ
diff --git a/dsRag/Util/__pycache__/__init__.cpython-310.pyc b/dsRag/Util/__pycache__/__init__.cpython-310.pyc
new file mode 100644
index 00000000..79767003
Binary files /dev/null and b/dsRag/Util/__pycache__/__init__.cpython-310.pyc differ