From 0335b8c17aefb1f7b571b97629d08d67023a9170 Mon Sep 17 00:00:00 2001 From: HuangHai <10402852@qq.com> Date: Mon, 23 Jun 2025 16:10:04 +0800 Subject: [PATCH] 'commit' --- dsRag/T2_Txt2Vec.py | 18 +++++------- dsRag/Util/EmbeddingUtil.py | 26 ++++++++++++++++++ dsRag/Util/__init__.py | 0 .../__pycache__/EmbeddingUtil.cpython-310.pyc | Bin 0 -> 1261 bytes .../Util/__pycache__/__init__.cpython-310.pyc | Bin 0 -> 135 bytes 5 files changed, 33 insertions(+), 11 deletions(-) create mode 100644 dsRag/Util/EmbeddingUtil.py create mode 100644 dsRag/Util/__init__.py create mode 100644 dsRag/Util/__pycache__/EmbeddingUtil.cpython-310.pyc create mode 100644 dsRag/Util/__pycache__/__init__.cpython-310.pyc diff --git a/dsRag/T2_Txt2Vec.py b/dsRag/T2_Txt2Vec.py index fc99afa3..77e28f7c 100644 --- a/dsRag/T2_Txt2Vec.py +++ b/dsRag/T2_Txt2Vec.py @@ -2,19 +2,15 @@ # 安装向量化的包 (# 断开VPN后执行安装包) conda activate rag pip install text2vec torch torchvision torchaudio + +# 安装完整版本(包含额外依赖) +pip install gensim[complete] ''' +from Util.EmbeddingUtil import * -from text2vec import SentenceModel sentences = ['如何更换花呗绑定银行卡', '花呗更改绑定银行卡'] -''' -- 自动下载预训练模型到缓存目录(通常是 ~/.cache/huggingface/hub ) -- 后续运行会直接使用缓存 -如果下载慢,可以设置镜像源: -import os -os.environ['HF_ENDPOINT'] = 'https://hf-mirror.com' -''' -model = SentenceModel('shibing624/text2vec-base-chinese') -embeddings = model.encode(sentences) -print(embeddings) \ No newline at end of file +for sentence in sentences: + x = text_to_embedding(sentence) + print(x) diff --git a/dsRag/Util/EmbeddingUtil.py b/dsRag/Util/EmbeddingUtil.py new file mode 100644 index 00000000..fde367c0 --- /dev/null +++ b/dsRag/Util/EmbeddingUtil.py @@ -0,0 +1,26 @@ +import logging +import jieba +from gensim.models import KeyedVectors + +# 配置日志 +logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s") +logger = logging.getLogger(__name__) + +# 初始化 Word2Vec 模型 +model_path = r"D:\Tencent_AILab_ChineseEmbedding\Tencent_AILab_ChineseEmbedding.txt" +model = KeyedVectors.load_word2vec_format(model_path, binary=False, limit=10000) +logger.info(f"模型加载成功,词向量维度: {model.vector_size}") + +# 将文本转换为嵌入向量 +def text_to_embedding(text): + words = jieba.lcut(text) # 使用 jieba 分词 + logger.info(f"文本: {text}, 分词结果: {words}") + embeddings = [model[word] for word in words if word in model] + logger.info(f"有效词向量数量: {len(embeddings)}") + if embeddings: + avg_embedding = sum(embeddings) / len(embeddings) + logger.info(f"生成的平均向量: {avg_embedding[:5]}...") # 打印前 5 维 + return avg_embedding + else: + logger.warning("未找到有效词,返回零向量") + return [0.0] * model.vector_size \ No newline at end of file diff --git a/dsRag/Util/__init__.py b/dsRag/Util/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/dsRag/Util/__pycache__/EmbeddingUtil.cpython-310.pyc b/dsRag/Util/__pycache__/EmbeddingUtil.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c8bd0082dc45d4595fb621727f07718c1a9540e1 GIT binary patch literal 1261 zcmah|&2Jk;6rb4-d%d<(Q$C8B7 ztj(!`q=AT5v>eDsol8{^5|uzmr4;^;y&<%l=ENN?yjfGXf~vZSeXV^@)={U8a8MjQoId?zphqP!c`gObQDT3mnm(nQ6^` zNW%^dg?;ASd8chpFM1*6^qrth3GqVrIonNfRh;!|uOS4<3vDlS*a}?Y1)k`=3gu?^ z_S3!3(#^r+M~||N8|mhqKY#iB@%LNljhlz-UmQIAG5z^&tKPv9+P@7LM+NB9(D^19 zfd)86`*;-#WuOw6Zv)3M29~s(C17_A#dro?!>dXjqYEuouE~g;$5>%4q05!w`pall z&EpEO3VIbkQI5-;#;z*!#p)b_-2y2MHE^*gc8zfa8kA>y22_-NwVv&K*Q)nw;JjW> zH$DYO2M@o@_ICaQMu7kcXbw6*0mel!x@4~67z?zFm?EI?)Gv%k!0EJ`yd=ZmwN}d z?trKVU$3PPen@xMhoZLXhw_8;)n>EVdnM2CZFc?FbmRUgm3#xg-AcFj4!3_A9&#Zr zZ;eu-cRkv6EY0uqL{ji0*QG4cyl^2(RG)^4%6malyyCDB-Y1iKB<1D&GuYhg(}tF4 z5?-Q9^0;M=FmkKvEW1xJKP8rxWtJEI$I1~{=Hp;Y{30I1<5j=}}cO|)~fXWz3*E*0m8&Q)oCbvU@Ym*$nS6RomX??C43zw}A;?>Te(eS8z== Mv8LAWQCv~~0Uz~w*8l(j literal 0 HcmV?d00001 diff --git a/dsRag/Util/__pycache__/__init__.cpython-310.pyc b/dsRag/Util/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..79767003c9ac32bf4142003b854ef98e2eacfc07 GIT binary patch literal 135 zcmd1j<>g`kf?)2*3=sVoL?8o3AjbiSi&=m~3PUi1CZpd