From 1b38446d5f7841216dbb8afeea4a4bfa054867f8 Mon Sep 17 00:00:00 2001 From: HuangHai <10402852@qq.com> Date: Mon, 23 Jun 2025 19:43:21 +0800 Subject: [PATCH] 'commit' --- .../{T5_VectorSave.py => T5_SentenceSave.py} | 50 ++++++++++++++---- dsRag/Util/EsMappingUtil.py | 8 ++- .../__pycache__/EsMappingUtil.cpython-310.pyc | Bin 1930 -> 2728 bytes 3 files changed, 47 insertions(+), 11 deletions(-) rename dsRag/{T5_VectorSave.py => T5_SentenceSave.py} (55%) diff --git a/dsRag/T5_VectorSave.py b/dsRag/T5_SentenceSave.py similarity index 55% rename from dsRag/T5_VectorSave.py rename to dsRag/T5_SentenceSave.py index 31dd2ad8..92198c03 100644 --- a/dsRag/T5_VectorSave.py +++ b/dsRag/T5_SentenceSave.py @@ -3,7 +3,6 @@ from elasticsearch import Elasticsearch from Config.Config import ES_CONFIG from T2_Txt2Vec import text_to_embedding -from Util.EsMappingUtil import create_vector_index # 初始化ES连接 es = Elasticsearch( @@ -13,6 +12,29 @@ es = Elasticsearch( ssl_show_warn=ES_CONFIG["ssl_show_warn"] ) +def check_text_exists(text, index_name="raw_texts"): + """检查文本是否已存在""" + try: + # 添加长度验证 + if len(text) > 256: + text = text[:256] + + query = { + "query": { + "bool": { + "must": [ + {"term": {"text.keyword": text}}, + {"match": {"text": text}} + ] + } + } + } + res = es.search(index=index_name, body=query, size=1) + return res['hits']['hits'][0]['_id'] if res['hits']['hits'] else None + except Exception as e: + print(f"检查失败: {str(e)}") + return None + def save_vector(text, index_name="knowledge_base"): """将文本向量化后保存到ES""" try: @@ -51,16 +73,26 @@ def save_raw_text(text, index_name="raw_texts"): print(f"保存失败: {str(e)}") raise +def save_text_with_check(text): + """带检查的保存流程""" + # 检查文本是否已存在 + existing_id = check_text_exists(text) + if existing_id: + print(f"文本已存在,ID: {existing_id}") + return existing_id + + # 保存向量 + vector_id = save_vector(text) + + # 保存原始文本 + raw_id = save_raw_text(text) + + return {"vector_id": vector_id, "raw_id": raw_id} + # 使用示例 if __name__ == "__main__": - # 创建向量索引 - create_vector_index(dims=200) - # 示例文本 sample_text = "如何更换支付宝绑定银行卡" - # 保存向量化文档 - save_vector(sample_text) - - # 保存原始文本 - save_raw_text(sample_text) + # 带检查的保存 + save_text_with_check(sample_text) diff --git a/dsRag/Util/EsMappingUtil.py b/dsRag/Util/EsMappingUtil.py index 9dbf66cf..b6f6a85d 100644 --- a/dsRag/Util/EsMappingUtil.py +++ b/dsRag/Util/EsMappingUtil.py @@ -71,8 +71,12 @@ def create_text_index(index_name="raw_texts"): "properties": { "text": { "type": "text", - "analyzer": "ik_max_word", - "search_analyzer": "ik_smart" + "fields": { + "keyword": { + "type": "keyword", + "ignore_above": 256 + } + } }, "timestamp": { "type": "date" diff --git a/dsRag/Util/__pycache__/EsMappingUtil.cpython-310.pyc b/dsRag/Util/__pycache__/EsMappingUtil.cpython-310.pyc index ca99ac228af4900e552f1268a17f8977d8054120..2eaa5fd0209b92f3653dcbdd8db6d3b09751fc1d 100644 GIT binary patch delta 1057 zcmZ{j-A@xi5Wsi#+Pk*2*H1nLQxVXUh?)owMjwoc4?ge$i4h+NvZodajoYgQQmhdY z8w^I$_!Wv6#R!Q$fEW|?FX{VLc_#P>&a?#t<6iD}vokw;voo{P{-tU??(=yVj*Azr z-F~6Z>Wwfw+kU>YM!1D%3i_O)R(OREB~|FcKgIeqQ73{Tgc=u0VG%*eEvkE%QNNNc zDzN-H?x=yo$8gUJgh^#J@j|CM%Vr zxGI%PQXEk`^L5q!^wQ2RT~0>H_dlWAiz~m@-k*zh8wX1*kD%LuIvYy4&aGr>BT!sr zL{KUO2ch~$D;zit;NaNe+m0=V2l*S{Z64MkfR73Ru7VCB zT;qDcw`3Ikf%xI$ptl|p>(sV$wpUH(tRl%S9ayD SXl>AtyQ|fzt~!rjZ~g-v+ZY=F delta 242 zcmZ1>+QrY8&&$ij00iBZkr^8}Ci2NJMoiSUQ4~z!OyO!_jABg@O5sW21+v&u_)_>= z7^2uy1cDhfg*VQ5#>}X>xtT?bkx^~3AEyjs@#O8SOGHX}frcqO?O5<~=8|U}6P|X> z4=J6T&!%ap$zCJ{lq!-25i&sH7F$YcPHIVNkvvF55kx2f2~DOVWe{6+@WU zfd;>0lV#MH%*Z9d$T3-wOH&kVkSIu%8i>#U5yD{gHC%jBVn8kr0}mq;6B{EG{^4Th I66BEp0CY|@l>h($