diff --git a/dsRag/T4_VectorSave.py b/dsRag/T4_VectorSave.py index 4d49baf2..712df513 100644 --- a/dsRag/T4_VectorSave.py +++ b/dsRag/T4_VectorSave.py @@ -4,6 +4,7 @@ from elasticsearch import Elasticsearch from Config.Config import ES_CONFIG from T2_Txt2Vec import text_to_embedding +from Util.EsMappingUtil import create_vector_index # 导入工具函数 # 初始化ES连接 es = Elasticsearch( @@ -13,34 +14,25 @@ es = Elasticsearch( ssl_show_warn=ES_CONFIG["ssl_show_warn"] ) -def create_vector_index(index_name="knowledge_base"): - """创建带有向量字段的索引(适配200维腾讯词向量)""" - mapping = { - "mappings": { - "properties": { - "text": {"type": "text", "analyzer": "ik_max_word"}, - "vector": { - "type": "dense_vector", - "dims": 200, # 修改为腾讯词向量实际维度 - "index": True, - "similarity": "cosine" - }, - "timestamp": {"type": "date"} +def save_to_es(text, index_name="knowledge_base"): + """将文本向量化后保存到ES""" + # 检查是否已存在相同文本 + query = { + "query": { + "term": { + "text.keyword": { + "value": text + } } } } - try: - if es.indices.exists(index=index_name): - es.indices.delete(index=index_name) - es.indices.create(index=index_name, body=mapping) - print(f"索引 {index_name} 创建成功(200维)") - except Exception as e: - print(f"创建索引失败: {str(e)}") - raise - -def save_to_es(text, index_name="knowledge_base"): - """将文本向量化后保存到ES""" + exists = es.search(index=index_name, body=query) + if exists["hits"]["total"]["value"] > 0: + print(f"文档已存在,跳过保存: {text}") + return exists["hits"]["hits"][0]["_id"] # 返回现有文档ID + + # 保存新文档 vector = text_to_embedding(text) doc = { "text": text, @@ -51,12 +43,12 @@ def save_to_es(text, index_name="knowledge_base"): try: res = es.index(index=index_name, document=doc) print(f"文档已保存,ID: {res['_id']}") - return res + return res["_id"] except Exception as e: print(f"保存到ES失败: {str(e)}") raise # 使用示例 if __name__ == "__main__": - create_vector_index() # 首次运行前执行 + create_vector_index(dims=200) # 使用工具函数创建索引 save_to_es("如何更换支付宝绑定银行卡") \ No newline at end of file diff --git a/dsRag/Util/EsMappingUtil.py 
"""Elasticsearch mapping helpers for the vector knowledge-base index.

Module path: dsRag/Util/EsMappingUtil.py
"""
from elasticsearch import Elasticsearch
from Config.Config import ES_CONFIG

# Module-level client built from the project-wide ES_CONFIG settings.
es = Elasticsearch(
    hosts=ES_CONFIG["hosts"],
    basic_auth=ES_CONFIG["basic_auth"],
    verify_certs=ES_CONFIG["verify_certs"],
    ssl_show_warn=ES_CONFIG["ssl_show_warn"],
)

# Lucene rejects single terms larger than 32766 bytes, while `ignore_above`
# counts characters. 32766 // 4 = 8191 stays under the limit even when every
# character needs four UTF-8 bytes; longer strings are simply not indexed in
# the keyword sub-field instead of failing the whole document.
_KEYWORD_IGNORE_ABOVE = 8191


def get_vector_mapping(dims=200):
    """Return the index body (mapping) for the vector knowledge-base index.

    Args:
        dims: Dimensionality of the `vector` dense_vector field.

    Returns:
        A dict with a single "mappings" key describing three fields:
        `text` (analyzed with ik_max_word, plus an exact-match `keyword`
        sub-field), `vector` (indexed dense_vector using cosine similarity)
        and `timestamp` (date).
    """
    return {
        "mappings": {
            "properties": {
                "text": {
                    "type": "text",
                    "analyzer": "ik_max_word",
                    # save_to_es() in T4_VectorSave.py de-duplicates with a
                    # term query on "text.keyword"; without this sub-field
                    # that query can never match on an explicitly mapped index.
                    "fields": {
                        "keyword": {
                            "type": "keyword",
                            "ignore_above": _KEYWORD_IGNORE_ABOVE,
                        }
                    },
                },
                "vector": {
                    "type": "dense_vector",
                    "dims": dims,
                    "index": True,
                    "similarity": "cosine",
                },
                "timestamp": {"type": "date"},
            }
        }
    }


def _extract_vector_dims(mapping_response, index_name):
    """Read the `vector` field's dims from a get-mapping response; 0 if absent."""
    index_entry = mapping_response.get(index_name)
    if index_entry is None:
        # When index_name is an alias the response is keyed by the concrete
        # index name, so fall back to the first entry that came back.
        index_entry = next(iter(mapping_response.values()), {})
    properties = index_entry.get("mappings", {}).get("properties", {})
    return properties.get("vector", {}).get("dims", 0)


def create_vector_index(index_name="knowledge_base", dims=200):
    """Create the vector index unless it already exists.

    An existing index is never deleted or modified; it is only inspected.

    Args:
        index_name: Name of the index to create or check.
        dims: Expected dimensionality of the `vector` field.

    Returns:
        True if the index was created, or already exists with a `vector`
        field of `dims` dimensions. False if the index exists but its
        `vector` field is missing or has a different dimensionality.

    Raises:
        Exception: Any error raised by the Elasticsearch client is printed
            and re-raised unchanged.
    """
    mapping = get_vector_mapping(dims)

    try:
        if es.indices.exists(index=index_name):
            current_dims = _extract_vector_dims(
                es.indices.get_mapping(index=index_name), index_name
            )
            if current_dims == dims:
                print(f"索引 {index_name} 已存在且维度正确({dims}维),无需操作")
                return True
            print(f"警告:索引 {index_name} 已存在但维度不匹配(当前:{current_dims}维,需要:{dims}维)")
            return False

        # Pass the mapping through the keyword-argument API rather than the
        # deprecated catch-all `body` parameter; the request sent is the same.
        es.indices.create(index=index_name, mappings=mapping["mappings"])
        print(f"索引 {index_name} 创建成功({dims}维)")
        return True
    except Exception as e:
        print(f"操作索引失败: {str(e)}")
        raise


# NOTE(review): the same change set also adds
# dsRag/__pycache__/T2_Txt2Vec.cpython-310.pyc (compiled bytecode). Such
# generated files should normally be excluded from version control.