main
HuangHai 1 month ago
parent ca984af410
commit 56e6db5e92

@ -1,7 +1,6 @@
from elasticsearch import Elasticsearch
from Config.Config import ES_CONFIG

# Initialize the ES connection
es = Elasticsearch(
@ -11,26 +10,81 @@ es = Elasticsearch(
    ssl_show_warn=ES_CONFIG["ssl_show_warn"]
)
def get_vector_mapping(dims=200):
    """Return the mapping for the vector index."""
    return {
        "properties": {
            "content": {
                "type": "text",
                "analyzer": "ik_smart",
                "search_analyzer": "ik_smart",
                "fields": {
                    "keyword": {
                        "type": "keyword",
                        "ignore_above": 8192
                    }
                }
            },
            "vector": {
                "type": "dense_vector",
                "dims": dims,
                "index": True,
                "similarity": "cosine"
            },
            "timestamp": {
                "type": "date",
                "format": "strict_date_optional_time||epoch_millis"
            }
        }
    }

def get_text_mapping():
    """Return the mapping for the raw-text index."""
    return {
        "properties": {
            "raw_text": {
                "type": "text",
                "analyzer": "ik_smart",
                "fielddata": True  # allow aggregations on long text
            },
            "timestamp": {
                "type": "date",
                "format": "strict_date_optional_time||epoch_millis"
            }
        }
    }
def manage_index(action, index_type="vector", index_name=None, dims=200):
    """Manage Elasticsearch indexes.

    :param action: 'create' or 'delete'
    :param index_type: 'vector' or 'text'
    :param index_name: index name (defaults according to the index type)
    :param dims: vector dimensions (vector indexes only)
    """
    if index_name is None:
        index_name = "knowledge_base" if index_type == "vector" else "raw_texts"
    if action == "create":
        mapping = get_vector_mapping(dims) if index_type == "vector" else get_text_mapping()
        try:
            if es.indices.exists(index=index_name):
                print(f"Index {index_name} already exists")
                return False
            es.indices.create(index=index_name, body={"mappings": mapping})
            print(f"Index {index_name} created (using the ik_smart analyzer)")
            return True
        except Exception as e:
            print(f"Failed to create index: {str(e)}")
            raise
    elif action == "delete":
        try:
            if not es.indices.exists(index=index_name):
                print(f"Index {index_name} does not exist")
                return False
            es.indices.delete(index=index_name)
            print(f"Index {index_name} deleted")
            return True
        except Exception as e:
            print(f"Failed to delete index: {str(e)}")
            raise
    else:
        raise ValueError("action must be 'create' or 'delete'")
@ -45,51 +99,3 @@ if __name__ == "__main__":
    # Create the new raw-text index
    manage_index("create", "text")

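A minimal usage sketch for the new manage_index entry point; the module name EsIndexUtil is an assumption, since the diff does not show this file's name:

# Sketch: exercising manage_index; adjust the import to the real module path.
from EsIndexUtil import manage_index

manage_index("create", index_type="vector", dims=200)  # creates knowledge_base
manage_index("create", index_type="text")              # creates raw_texts
manage_index("delete", index_type="text")              # drops raw_texts again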
@ -1,13 +1,21 @@
import logging
from Util.EmbeddingUtil import text_to_embedding
from Config.Config import ES_CONFIG
from elasticsearch import Elasticsearch
import re
from tqdm import tqdm
import datetime
import numpy as np

# Logger configuration
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

# Console handler with formatting
handler = logging.StreamHandler()
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
handler.setFormatter(formatter)
logger.addHandler(handler)
def split_sentences(text):
"""按句分割文本"""
@ -18,30 +26,24 @@ def split_sentences(text):
def save_to_es(text):
    """Save the vectorized text and the raw text to ES."""
    vector = text_to_embedding(text)
    # Skip text for which no vector could be generated
    if vector is None:
        logger.warning(f"Skipping text with no usable vector: {text}")
        return
    doc = {
        'text': text,
        'vector': vector,
        'timestamp': datetime.datetime.now().isoformat(),
        'analyzer': 'ik_smart'  # stored as a plain field; the analyzer itself is set in the mapping
    }
    try:
        es.index(index='knowledge_base', body=doc)
        es.index(index='raw_texts', body={'raw_text': text})
    except Exception as e:
        logger.error(f"Failed to save text to ES: {e}")
def process_file(file_path):
    """Process a text file."""
@ -64,8 +66,5 @@ if __name__ == '__main__':
        ssl_show_warn=ES_CONFIG['ssl_show_warn']
    )
    file_path = 'Txt/人口变化趋势对云南教育的影响.txt'
    process_file(file_path)

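The body of split_sentences is elided by the hunks above; a plausible sketch, assuming it splits on Chinese sentence-ending punctuation with the re module already imported at the top of the file (the actual implementation may differ):

import re

def split_sentences_sketch(text):
    # Hypothetical reconstruction: cut after 。!?(full- or half-width) and drop empties.
    parts = re.split(r'(?<=[。!?!?])', text)
    return [p.strip() for p in parts if p.strip()]

# split_sentences_sketch("人口在变化。教育随之调整!") -> ['人口在变化。', '教育随之调整!']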
@ -68,16 +68,14 @@ def process_query(query):
def search_related_data(query):
    """Search ES for data related to the query."""
    # Full-text search (replaces the former script_score vector search)
    vector_results = es.search(
        index=Config.ES_CONFIG['default_index'],
        body={
            "query": {
                "match": {
                    "content": {
                        "query": query,
                        "analyzer": "ik_smart"  # use the ik_smart analyzer explicitly
                    }
                }
            },

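Because the rewritten query pins the analyzer to ik_smart, it helps to check how that analyzer actually tokenizes a query when debugging relevance; a sketch against the standard _analyze API, assuming the es client from this file and an installed ik plugin:

# Sketch: inspect ik_smart tokenization of a query string.
resp = es.indices.analyze(analyzer="ik_smart", text="整理云南省初中在校生情况文档")
print([t["token"] for t in resp["tokens"]])
# The output may resemble the jieba segmentation in the log below: ['整理', '云南省', ...]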
@ -1,94 +1,18 @@
D:\anaconda3\envs\rag\python.exe D:\dsWork\dsProject\dsRag\T8_RAG.py
D:\anaconda3\envs\rag\lib\site-packages\jieba\_compat.py:18: UserWarning: pkg_resources is deprecated as an API. See https://setuptools.pypa.io/en/latest/pkg_resources.html. The pkg_resources package is slated for removal as early as 2025-11-30. Refrain from using this package or pin to Setuptools<81.
import pkg_resources
2025-06-23 20:13:47,913 - INFO - loading projection weights from D:\Tencent_AILab_ChineseEmbedding\Tencent_AILab_ChineseEmbedding.txt
2025-06-23 20:13:49,382 - INFO - KeyedVectors lifecycle event {'msg': 'loaded (10000, 200) matrix of type float32 from D:\\Tencent_AILab_ChineseEmbedding\\Tencent_AILab_ChineseEmbedding.txt', 'binary': False, 'encoding': 'utf8', 'datetime': '2025-06-23T20:13:49.361378', 'gensim': '4.3.3', 'python': '3.10.18 | packaged by conda-forge | (main, Jun 4 2025, 14:42:04) [MSC v.1943 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.19044-SP0', 'event': 'load_word2vec_format'}
2025-06-23 20:13:49,382 - INFO - Model loaded successfully, word-vector dimension: 200
D:\anaconda3\envs\rag\lib\site-packages\elasticsearch\_sync\client\__init__.py:311: SecurityWarning: Connecting to 'https://10.10.14.206:9200' using TLS with verify_certs=False is insecure
_transport = transport_class(
Searching for data related to '整理云南省初中在校生情况文档'...
Building prefix dict from the default dictionary ...
2025-06-23 20:13:50,520 - DEBUG - Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\ADMINI~1\AppData\Local\Temp\jieba.cache
2025-06-23 20:13:50,520 - DEBUG - Loading model from cache C:\Users\ADMINI~1\AppData\Local\Temp\jieba.cache
Loading model cost 0.648 seconds.
2025-06-23 20:13:51,168 - DEBUG - Loading model cost 0.648 seconds.
Prefix dict has been built successfully.
2025-06-23 20:13:51,169 - DEBUG - Prefix dict has been built successfully.
2025-06-23 20:13:51,169 - INFO - Text: 整理云南省初中在校生情况文档, segmentation: ['整理', '云南省', '初中', '在校生', '情况', '文档']
2025-06-23 20:13:51,169 - INFO - Valid word-vector count: 3
2025-06-23 20:13:51,169 - INFO - Generated average vector: [ 0.18687634 -0.19447033 -0.04333766 -0.05068566 0.09433967]...
D:\anaconda3\envs\rag\lib\site-packages\urllib3\connectionpool.py:1097: InsecureRequestWarning: Unverified HTTPS request is being made to host '10.10.14.206'. Adding certificate verification is strongly advised. See: https://urllib3.readthedocs.io/en/latest/advanced-usage.html#tls-warnings
warnings.warn(
2025-06-23 20:13:51,222 - INFO - POST https://10.10.14.206:9200/knowledge_base/_search [status:200 duration:0.051s]
D:\anaconda3\envs\rag\lib\site-packages\urllib3\connectionpool.py:1097: InsecureRequestWarning: Unverified HTTPS request is being made to host '10.10.14.206'. Adding certificate verification is strongly advised. See: https://urllib3.readthedocs.io/en/latest/advanced-usage.html#tls-warnings
warnings.warn(
2025-06-23 20:13:51,231 - INFO - POST https://10.10.14.206:9200/raw_texts/_search [status:200 duration:0.009s]
Found 10 related records
Generating the report...
2025-06-23 20:13:56,582 - INFO - Retrying request to /chat/completions in 0.469314 seconds
2025-06-23 20:14:02,067 - INFO - Retrying request to /chat/completions in 0.762535 seconds
Traceback (most recent call last):
File "D:\anaconda3\envs\rag\lib\site-packages\httpx\_transports\default.py", line 101, in map_httpcore_exceptions
yield
File "D:\anaconda3\envs\rag\lib\site-packages\httpx\_transports\default.py", line 250, in handle_request
resp = self._pool.handle_request(req)
File "D:\anaconda3\envs\rag\lib\site-packages\httpcore\_sync\connection_pool.py", line 256, in handle_request
raise exc from None
File "D:\anaconda3\envs\rag\lib\site-packages\httpcore\_sync\connection_pool.py", line 236, in handle_request
response = connection.handle_request(
File "D:\anaconda3\envs\rag\lib\site-packages\httpcore\_sync\connection.py", line 101, in handle_request
raise exc
File "D:\anaconda3\envs\rag\lib\site-packages\httpcore\_sync\connection.py", line 78, in handle_request
stream = self._connect(request)
File "D:\anaconda3\envs\rag\lib\site-packages\httpcore\_sync\connection.py", line 124, in _connect
stream = self._network_backend.connect_tcp(**kwargs)
File "D:\anaconda3\envs\rag\lib\site-packages\httpcore\_backends\sync.py", line 207, in connect_tcp
with map_exceptions(exc_map):
File "D:\anaconda3\envs\rag\lib\contextlib.py", line 153, in __exit__
self.gen.throw(typ, value, traceback)
File "D:\anaconda3\envs\rag\lib\site-packages\httpcore\_exceptions.py", line 14, in map_exceptions
raise to_exc(exc) from exc
httpcore.ConnectTimeout: timed out
The above exception was the direct cause of the following exception:
Traceback (most recent call last):
File "D:\anaconda3\envs\rag\lib\site-packages\openai\_base_client.py", line 972, in request
response = self._client.send(
File "D:\anaconda3\envs\rag\lib\site-packages\httpx\_client.py", line 914, in send
response = self._send_handling_auth(
File "D:\anaconda3\envs\rag\lib\site-packages\httpx\_client.py", line 942, in _send_handling_auth
response = self._send_handling_redirects(
File "D:\anaconda3\envs\rag\lib\site-packages\httpx\_client.py", line 979, in _send_handling_redirects
response = self._send_single_request(request)
File "D:\anaconda3\envs\rag\lib\site-packages\httpx\_client.py", line 1014, in _send_single_request
response = transport.handle_request(request)
File "D:\anaconda3\envs\rag\lib\site-packages\httpx\_transports\default.py", line 249, in handle_request
with map_httpcore_exceptions():
File "D:\anaconda3\envs\rag\lib\contextlib.py", line 153, in __exit__
self.gen.throw(typ, value, traceback)
File "D:\anaconda3\envs\rag\lib\site-packages\httpx\_transports\default.py", line 118, in map_httpcore_exceptions
raise mapped_exc(message) from exc
httpx.ConnectTimeout: timed out
The above exception was the direct cause of the following exception:
Traceback (most recent call last):
File "D:\dsWork\dsProject\dsRag\T8_RAG.py", line 98, in <module>
report = process_query(user_query)
File "D:\dsWork\dsProject\dsRag\T8_RAG.py", line 91, in process_query
report = generate_report(query, context)
File "D:\dsWork\dsProject\dsRag\T8_RAG.py", line 73, in generate_report
response = client.chat.completions.create(
File "D:\anaconda3\envs\rag\lib\site-packages\openai\_utils\_utils.py", line 287, in wrapper
return func(*args, **kwargs)
File "D:\anaconda3\envs\rag\lib\site-packages\openai\resources\chat\completions\completions.py", line 925, in create
return self._post(
File "D:\anaconda3\envs\rag\lib\site-packages\openai\_base_client.py", line 1249, in post
return cast(ResponseT, self.request(cast_to, opts, stream=stream, stream_cls=stream_cls))
File "D:\anaconda3\envs\rag\lib\site-packages\openai\_base_client.py", line 990, in request
raise APITimeoutError(request=request) from err
openai.APITimeoutError: Request timed out.
Process finished with exit code 1

2025-06-23 21:08:46,327 - INFO - POST https://10.10.14.206:9200/knowledge_base/_doc [status:400 duration:0.004s]
Progress: 8%|▊ | 40/503 [00:01<00:17, 26.02 sentences/s]
Traceback (most recent call last):
File "D:\dsWork\dsProject\dsRag\T2_ImportTxt.py", line 51, in <module>
process_file(file_path)
File "D:\dsWork\dsProject\dsRag\T2_ImportTxt.py", line 38, in process_file
save_to_es(sentence)
File "D:\dsWork\dsProject\dsRag\T2_ImportTxt.py", line 26, in save_to_es
es.index(index='knowledge_base', body=doc)
File "D:\anaconda3\envs\rag\lib\site-packages\elasticsearch\_sync\client\utils.py", line 415, in wrapped
return api(*args, **kwargs)
File "D:\anaconda3\envs\rag\lib\site-packages\elasticsearch\_sync\client\__init__.py", line 2951, in index
return self.perform_request( # type: ignore[return-value]
File "D:\anaconda3\envs\rag\lib\site-packages\elasticsearch\_sync\client\_base.py", line 271, in perform_request
response = self._perform_request(
File "D:\anaconda3\envs\rag\lib\site-packages\elasticsearch\_sync\client\_base.py", line 351, in _perform_request
raise HTTP_EXCEPTIONS.get(meta.status, ApiError)(
elasticsearch.BadRequestError: BadRequestError(400, 'document_parsing_exception', '[1:828] failed to parse: The [cosine] similarity does not support vectors with zero magnitude. Preview of invalid vector: [0.0, 0.0, 0.0, 0.0, 0.0, ...]', The [cosine] similarity does not support vectors with zero magnitude. Preview of invalid vector: [0.0, 0.0, 0.0, 0.0, 0.0, ...])
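This BadRequestError is the failure that motivated the changes above: ES refuses to index an all-zero vector into a cosine dense_vector field, which is why text_to_embedding now returns None for texts with no usable word vectors. If a stricter guard is ever needed at the indexing boundary, a minimal sketch, assuming numpy arrays:

import numpy as np

def safe_cosine_vector(vector, eps=1e-6):
    # Cosine similarity cannot handle zero-magnitude vectors, so reject them here.
    if vector is None or np.all(np.abs(vector) < eps):
        return None
    # Normalizing is optional for ES but keeps magnitudes predictable.
    v = np.asarray(vector, dtype=float)
    return (v / np.linalg.norm(v)).tolist()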

@ -15,6 +15,13 @@ def text_to_embedding(text):
    words = jieba.lcut(text)  # segment the text with jieba
    logger.info(f"Text: {text}, segmentation: {words}")
    embeddings = [model[word] for word in words if word in model]
    logger.info(f"Valid word-vector count: {len(embeddings)}")
    if not embeddings:
        logger.warning(f"No valid vector could be generated for text '{text}'")
        return None  # or raise ValueError("no valid vector could be generated")
    avg_embedding = sum(embeddings) / len(embeddings)
    return avg_embedding

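A quick sanity check of the new early-return path, run in the context of Util/EmbeddingUtil (the out-of-vocabulary input is illustrative):

# Sketch: all-OOV input now yields None instead of a crash at index time.
vec = text_to_embedding("qwertyuiop")  # likely has no words in the Tencent embedding
if vec is None:
    print("skipped: no valid word vectors")
else:
    print(len(vec))  # 200, per the word-vector dimension in the log above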
@ -1,113 +0,0 @@
from elasticsearch import Elasticsearch
from Config.Config import ES_CONFIG

es = Elasticsearch(
    hosts=ES_CONFIG["hosts"],
    basic_auth=ES_CONFIG["basic_auth"],
    verify_certs=ES_CONFIG["verify_certs"],
    ssl_show_warn=ES_CONFIG["ssl_show_warn"]
)

def get_vector_mapping(dims=200):
    """Return the mapping for the vector index."""
    return {
        "mappings": {
            "properties": {
                "text": {"type": "text", "analyzer": "ik_max_word"},
                "vector": {
                    "type": "dense_vector",
                    "dims": dims,
                    "index": True,
                    "similarity": "cosine"
                },
                "timestamp": {"type": "date"}
            }
        }
    }
def create_vector_index(index_name="knowledge_base", dims=200):
    """Create an index with a dense-vector field."""
    mapping = get_vector_mapping(dims)
    try:
        if es.indices.exists(index=index_name):
            current_mapping = es.indices.get_mapping(index=index_name)
            current_dims = current_mapping[index_name]["mappings"]["properties"]["vector"].get("dims", 0)
            if current_dims == dims:
                print(f"Index {index_name} already exists with the correct dimension ({dims}); nothing to do")
                return True
            else:
                print(f"Warning: index {index_name} exists but its dimension does not match (current: {current_dims}, required: {dims})")
                return False
        es.indices.create(index=index_name, body=mapping)
        print(f"Index {index_name} created ({dims} dims)")
        return True
    except Exception as e:
        print(f"Index operation failed: {str(e)}")
        raise

def delete_index(index_name):
    """Delete an Elasticsearch index."""
    try:
        if es.indices.exists(index=index_name):
            es.indices.delete(index=index_name)
            print(f"Index {index_name} deleted")
            return True
        else:
            print(f"Index {index_name} does not exist")
            return False
    except Exception as e:
        print(f"Failed to delete index: {str(e)}")
        raise
def create_text_index(index_name="raw_texts"):
    """Create the raw-text index."""
    mapping = {
        "mappings": {
            "properties": {
                "text": {
                    "type": "text",
                    "fields": {
                        "keyword": {
                            "type": "keyword",
                            "ignore_above": 256
                        }
                    }
                },
                "timestamp": {
                    "type": "date"
                }
            }
        }
    }
    try:
        if not es.indices.exists(index=index_name):
            es.indices.create(index=index_name, body=mapping)
            print(f"Raw-text index {index_name} created")
            return True
        else:
            print(f"Raw-text index {index_name} already exists")
            return False
    except Exception as e:
        print(f"Failed to create raw-text index: {str(e)}")
        raise

def delete_text_index(index_name="raw_texts"):
    """Delete the raw-text index."""
    try:
        if es.indices.exists(index=index_name):
            es.indices.delete(index=index_name)
            print(f"Raw-text index {index_name} deleted")
            return True
        else:
            print(f"Raw-text index {index_name} does not exist")
            return False
    except Exception as e:
        print(f"Failed to delete raw-text index: {str(e)}")
        raise