main
HuangHai 1 month ago
parent ca984af410
commit 56e6db5e92

@@ -1,7 +1,6 @@
 from elasticsearch import Elasticsearch
 from Config.Config import ES_CONFIG
-from Util.EsMappingUtil import create_vector_index, delete_index, create_text_index, delete_text_index

 # Initialize the ES connection
 es = Elasticsearch(
@@ -11,65 +10,24 @@ es = Elasticsearch(
     ssl_show_warn=ES_CONFIG["ssl_show_warn"]
 )

-def manage_index(action, index_type="vector", index_name=None, dims=200):
-    """Manage Elasticsearch indexes
-    :param action: 'create' or 'delete'
-    :param index_type: 'vector' or 'text'
-    :param index_name: index name (generated from the type by default)
-    :param dims: vector dimensions (vector indexes only)
-    """
-    if index_name is None:
-        index_name = "knowledge_base" if index_type == "vector" else "raw_texts"
-    if action == "create":
-        if index_type == "vector":
-            return create_vector_index(index_name, dims)
-        else:
-            return create_text_index(index_name)
-    elif action == "delete":
-        if index_type == "vector":
-            return delete_index(index_name)
-        else:
-            return delete_text_index(index_name)
-    else:
-        raise ValueError("The action parameter must be 'create' or 'delete'")
-
-# Usage example
-if __name__ == "__main__":
-    # Delete existing indexes first (if they exist)
-    manage_index("delete", "vector")
-    manage_index("delete", "text")
-    # Create a new vector index
-    manage_index("create", "vector", dims=200)
-    # Create a new raw-text index
-    manage_index("create", "text")
-
-# Updated mapping for the knowledge_base index
-knowledge_base_mapping = {
+def get_vector_mapping(dims=200):
+    """Get the mapping structure for the vector index"""
+    return {
         "properties": {
-            # Added to knowledge_base_mapping
             "content": {
                 "type": "text",
-                "analyzer": "ik_max_word",
+                "analyzer": "ik_smart",
                 "search_analyzer": "ik_smart",
                 "fields": {
                     "keyword": {
                         "type": "keyword",
-                        "ignore_above": 8192  # can be set to 1024/2048 or other larger values
+                        "ignore_above": 8192
                     }
                 }
             },
-            # Added to raw_texts_mapping
-            "raw_text": {
-                "type": "text",
-                "analyzer": "ik_max_word",
-                "fielddata": True  # allow aggregations on long text
-            },
             "vector": {
                 "type": "dense_vector",
-                "dims": 200,
+                "dims": dims,
                 "index": True,
                 "similarity": "cosine"
             },
@@ -78,18 +36,66 @@ knowledge_base_mapping = {
                 "format": "strict_date_optional_time||epoch_millis"
             }
         }
     }
-
-# Updated mapping for the raw_texts index
-raw_texts_mapping = {
+
+def get_text_mapping():
+    """Get the mapping structure for the text index"""
+    return {
         "properties": {
             "raw_text": {
                 "type": "text",
-                "analyzer": "ik_max_word"
+                "analyzer": "ik_smart",
+                "fielddata": True
             },
             "timestamp": {
                 "type": "date",
                 "format": "strict_date_optional_time||epoch_millis"
             }
         }
     }
+
+def manage_index(action, index_type="vector", index_name=None, dims=200):
+    """Manage Elasticsearch indexes"""
+    if index_name is None:
+        index_name = "knowledge_base" if index_type == "vector" else "raw_texts"
+    if action == "create":
+        mapping = get_vector_mapping(dims) if index_type == "vector" else get_text_mapping()
+        try:
+            if es.indices.exists(index=index_name):
+                print(f"Index {index_name} already exists")
+                return False
+            es.indices.create(index=index_name, body={"mappings": mapping})
+            print(f"Index {index_name} created successfully (using the ik_smart analyzer)")
+            return True
+        except Exception as e:
+            print(f"Failed to create index: {str(e)}")
+            raise
+    elif action == "delete":
+        try:
+            if not es.indices.exists(index=index_name):
+                print(f"Index {index_name} does not exist")
+                return False
+            es.indices.delete(index=index_name)
+            print(f"Index {index_name} deleted successfully")
+            return True
+        except Exception as e:
+            print(f"Failed to delete index: {str(e)}")
+            raise
+    else:
+        raise ValueError("The action parameter must be 'create' or 'delete'")
+
+# Usage example
+if __name__ == "__main__":
+    # Delete existing indexes first (if they exist)
+    manage_index("delete", "vector")
+    manage_index("delete", "text")
+    # Create a new vector index
+    manage_index("create", "vector", dims=200)
+    # Create a new raw-text index
+    manage_index("create", "text")

@@ -1,13 +1,21 @@
-import logging
-from Util.EsMappingUtil import create_vector_index, create_text_index
 from Util.EmbeddingUtil import text_to_embedding  # updated import
 from Config.Config import ES_CONFIG
 from elasticsearch import Elasticsearch
 import re
 from tqdm import tqdm
 import datetime
-import numpy as np
+import logging
+
+# Configure the logger at the top of the file
+logger = logging.getLogger(__name__)
+logger.setLevel(logging.INFO)
+
+# Create a console handler and set its format
+handler = logging.StreamHandler()
+formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
+handler.setFormatter(formatter)
+logger.addHandler(handler)

 def split_sentences(text):
     """Split text into sentences"""
@@ -18,30 +26,24 @@ def split_sentences(text):

 def save_to_es(text):
     """Save the vectorized text and the raw text to ES"""
-    vector = text_to_embedding(text)  # updated function call
-    # Check whether the vector is valid
-    if vector is None or (hasattr(vector, 'size')) and vector.size == 0:
-        logging.warning(f"Skipping text with an invalid vector: {text}")
-        return None
-    # Check whether the vector is all zeros or near zero
-    if np.all(np.abs(vector) < 1e-6):
-        logging.warning(f"Skipping zero-vector text: {text}")
-        return None
-    # Normalize the vector to avoid cosine-similarity issues
-    norm = np.linalg.norm(vector)
-    if norm > 0:
-        vector = vector / norm
+    vector = text_to_embedding(text)
+    if vector is None:
+        logger.warning(f"Skipping text for which no vector could be generated: {text}")
+        return
     doc = {
         'text': text,
-        'vector': vector.tolist(),
-        'timestamp': datetime.datetime.now().isoformat()
+        'vector': vector,
+        'timestamp': datetime.datetime.now().isoformat(),
+        'analyzer': 'ik_smart'
     }
-    es.index(index='knowledge_base', body=doc)
-    es.index(index='raw_texts', body={'text': text})
+    try:
+        es.index(index='knowledge_base', body=doc)
+        es.index(index='raw_texts', body={'raw_text': text})
+    except Exception as e:
+        logger.error(f"Failed to save text to ES: {e}")

 def process_file(file_path):
     """Process a text file"""
@@ -64,8 +66,5 @@ if __name__ == '__main__':
         ssl_show_warn=ES_CONFIG['ssl_show_warn']
     )
-    create_vector_index()
-    create_text_index()
     file_path = 'Txt/人口变化趋势对云南教育的影响.txt'
     process_file(file_path)
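Side note: save_to_es issues two synchronous es.index() calls per sentence, which lines up with the roughly 26 sentences/s visible in the run log further down. If throughput matters, elasticsearch.helpers.bulk can batch the writes; a hedged sketch under the same skip logic (save_batch is illustrative, not part of this commit):

from elasticsearch.helpers import bulk

def save_batch(sentences):
    """Batch-index sentences instead of two es.index() calls per sentence."""
    actions = []
    for s in sentences:
        vec = text_to_embedding(s)
        if vec is None:
            continue  # same skip logic as save_to_es above
        ts = datetime.datetime.now().isoformat()
        actions.append({"_index": "knowledge_base",
                        "_source": {"text": s, "vector": vec, "timestamp": ts}})
        actions.append({"_index": "raw_texts", "_source": {"raw_text": s}})
    if actions:
        bulk(es, actions)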

@@ -68,16 +68,14 @@ def process_query(query):

 def search_related_data(query):
     """Search for data related to the query"""
     # Vector search
-    query_vector = text_to_embedding(query)
     vector_results = es.search(
         index=Config.ES_CONFIG['default_index'],
         body={
             "query": {
-                "script_score": {
-                    "query": {"match_all": {}},
-                    "script": {
-                        "source": "cosineSimilarity(params.query_vector, 'vector') + 1.0",
-                        "params": {"query_vector": query_vector}
+                "match": {
+                    "content": {
+                        "query": query,
+                        "analyzer": "ik_smart"  # specify the analyzer
                     }
                 }
             },
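Reviewer note: this hunk replaces the script_score cosine ranking with a plain BM25 match on the content field, so despite the variable still being named vector_results, the search is now lexical rather than vector-based. If both signals are ever wanted, the removed pattern can sit alongside the match clause in a bool query — a hedged sketch reusing the commit's own field names (and assuming query_vector = text_to_embedding(query) is restored), not something this commit contains:

body = {
    "query": {
        "bool": {
            "should": [
                {"match": {"content": {"query": query, "analyzer": "ik_smart"}}},
                {"script_score": {
                    "query": {"match_all": {}},
                    "script": {
                        "source": "cosineSimilarity(params.query_vector, 'vector') + 1.0",
                        "params": {"query_vector": query_vector},
                    },
                }},
            ]
        }
    }
}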

@@ -1,94 +1,18 @@
-D:\anaconda3\envs\rag\python.exe D:\dsWork\dsProject\dsRag\T8_RAG.py
-D:\anaconda3\envs\rag\lib\site-packages\jieba\_compat.py:18: UserWarning: pkg_resources is deprecated as an API. See https://setuptools.pypa.io/en/latest/pkg_resources.html. The pkg_resources package is slated for removal as early as 2025-11-30. Refrain from using this package or pin to Setuptools<81.
-  import pkg_resources
-2025-06-23 20:13:47,913 - INFO - loading projection weights from D:\Tencent_AILab_ChineseEmbedding\Tencent_AILab_ChineseEmbedding.txt
-2025-06-23 20:13:49,382 - INFO - KeyedVectors lifecycle event {'msg': 'loaded (10000, 200) matrix of type float32 from D:\\Tencent_AILab_ChineseEmbedding\\Tencent_AILab_ChineseEmbedding.txt', 'binary': False, 'encoding': 'utf8', 'datetime': '2025-06-23T20:13:49.361378', 'gensim': '4.3.3', 'python': '3.10.18 | packaged by conda-forge | (main, Jun 4 2025, 14:42:04) [MSC v.1943 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.19044-SP0', 'event': 'load_word2vec_format'}
-2025-06-23 20:13:49,382 - INFO - Model loaded successfully, word-vector dimensions: 200
-D:\anaconda3\envs\rag\lib\site-packages\elasticsearch\_sync\client\__init__.py:311: SecurityWarning: Connecting to 'https://10.10.14.206:9200' using TLS with verify_certs=False is insecure
-  _transport = transport_class(
-Searching for data related to '整理云南省初中在校生情况文档'...
-Building prefix dict from the default dictionary ...
-2025-06-23 20:13:50,520 - DEBUG - Building prefix dict from the default dictionary ...
-Loading model from cache C:\Users\ADMINI~1\AppData\Local\Temp\jieba.cache
-2025-06-23 20:13:50,520 - DEBUG - Loading model from cache C:\Users\ADMINI~1\AppData\Local\Temp\jieba.cache
-Loading model cost 0.648 seconds.
-2025-06-23 20:13:51,168 - DEBUG - Loading model cost 0.648 seconds.
-Prefix dict has been built successfully.
-2025-06-23 20:13:51,169 - DEBUG - Prefix dict has been built successfully.
-2025-06-23 20:13:51,169 - INFO - Text: 整理云南省初中在校生情况文档, segmentation result: ['整理', '云南省', '初中', '在校生', '情况', '文档']
-2025-06-23 20:13:51,169 - INFO - Number of valid word vectors: 3
-2025-06-23 20:13:51,169 - INFO - Generated average vector: [ 0.18687634 -0.19447033 -0.04333766 -0.05068566 0.09433967]...
-D:\anaconda3\envs\rag\lib\site-packages\urllib3\connectionpool.py:1097: InsecureRequestWarning: Unverified HTTPS request is being made to host '10.10.14.206'. Adding certificate verification is strongly advised. See: https://urllib3.readthedocs.io/en/latest/advanced-usage.html#tls-warnings
-  warnings.warn(
-2025-06-23 20:13:51,222 - INFO - POST https://10.10.14.206:9200/knowledge_base/_search [status:200 duration:0.051s]
-D:\anaconda3\envs\rag\lib\site-packages\urllib3\connectionpool.py:1097: InsecureRequestWarning: Unverified HTTPS request is being made to host '10.10.14.206'. Adding certificate verification is strongly advised. See: https://urllib3.readthedocs.io/en/latest/advanced-usage.html#tls-warnings
-  warnings.warn(
-2025-06-23 20:13:51,231 - INFO - POST https://10.10.14.206:9200/raw_texts/_search [status:200 duration:0.009s]
-Found 10 related records
-Generating report...
-2025-06-23 20:13:56,582 - INFO - Retrying request to /chat/completions in 0.469314 seconds
-2025-06-23 20:14:02,067 - INFO - Retrying request to /chat/completions in 0.762535 seconds
-Traceback (most recent call last):
-  File "D:\anaconda3\envs\rag\lib\site-packages\httpx\_transports\default.py", line 101, in map_httpcore_exceptions
-    yield
-  File "D:\anaconda3\envs\rag\lib\site-packages\httpx\_transports\default.py", line 250, in handle_request
-    resp = self._pool.handle_request(req)
-  File "D:\anaconda3\envs\rag\lib\site-packages\httpcore\_sync\connection_pool.py", line 256, in handle_request
-    raise exc from None
-  File "D:\anaconda3\envs\rag\lib\site-packages\httpcore\_sync\connection_pool.py", line 236, in handle_request
-    response = connection.handle_request(
-  File "D:\anaconda3\envs\rag\lib\site-packages\httpcore\_sync\connection.py", line 101, in handle_request
-    raise exc
-  File "D:\anaconda3\envs\rag\lib\site-packages\httpcore\_sync\connection.py", line 78, in handle_request
-    stream = self._connect(request)
-  File "D:\anaconda3\envs\rag\lib\site-packages\httpcore\_sync\connection.py", line 124, in _connect
-    stream = self._network_backend.connect_tcp(**kwargs)
-  File "D:\anaconda3\envs\rag\lib\site-packages\httpcore\_backends\sync.py", line 207, in connect_tcp
-    with map_exceptions(exc_map):
-  File "D:\anaconda3\envs\rag\lib\contextlib.py", line 153, in __exit__
-    self.gen.throw(typ, value, traceback)
-  File "D:\anaconda3\envs\rag\lib\site-packages\httpcore\_exceptions.py", line 14, in map_exceptions
-    raise to_exc(exc) from exc
-httpcore.ConnectTimeout: timed out
-
-The above exception was the direct cause of the following exception:
-
-Traceback (most recent call last):
-  File "D:\anaconda3\envs\rag\lib\site-packages\openai\_base_client.py", line 972, in request
-    response = self._client.send(
-  File "D:\anaconda3\envs\rag\lib\site-packages\httpx\_client.py", line 914, in send
-    response = self._send_handling_auth(
-  File "D:\anaconda3\envs\rag\lib\site-packages\httpx\_client.py", line 942, in _send_handling_auth
-    response = self._send_handling_redirects(
-  File "D:\anaconda3\envs\rag\lib\site-packages\httpx\_client.py", line 979, in _send_handling_redirects
-    response = self._send_single_request(request)
-  File "D:\anaconda3\envs\rag\lib\site-packages\httpx\_client.py", line 1014, in _send_single_request
-    response = transport.handle_request(request)
-  File "D:\anaconda3\envs\rag\lib\site-packages\httpx\_transports\default.py", line 249, in handle_request
-    with map_httpcore_exceptions():
-  File "D:\anaconda3\envs\rag\lib\contextlib.py", line 153, in __exit__
-    self.gen.throw(typ, value, traceback)
-  File "D:\anaconda3\envs\rag\lib\site-packages\httpx\_transports\default.py", line 118, in map_httpcore_exceptions
-    raise mapped_exc(message) from exc
-httpx.ConnectTimeout: timed out
-
-The above exception was the direct cause of the following exception:
-
-Traceback (most recent call last):
-  File "D:\dsWork\dsProject\dsRag\T8_RAG.py", line 98, in <module>
-    report = process_query(user_query)
-  File "D:\dsWork\dsProject\dsRag\T8_RAG.py", line 91, in process_query
-    report = generate_report(query, context)
-  File "D:\dsWork\dsProject\dsRag\T8_RAG.py", line 73, in generate_report
-    response = client.chat.completions.create(
-  File "D:\anaconda3\envs\rag\lib\site-packages\openai\_utils\_utils.py", line 287, in wrapper
-    return func(*args, **kwargs)
-  File "D:\anaconda3\envs\rag\lib\site-packages\openai\resources\chat\completions\completions.py", line 925, in create
-    return self._post(
-  File "D:\anaconda3\envs\rag\lib\site-packages\openai\_base_client.py", line 1249, in post
-    return cast(ResponseT, self.request(cast_to, opts, stream=stream, stream_cls=stream_cls))
-  File "D:\anaconda3\envs\rag\lib\site-packages\openai\_base_client.py", line 990, in request
-    raise APITimeoutError(request=request) from err
-openai.APITimeoutError: Request timed out.
-
-Process finished with exit code 1
+2025-06-23 21:08:46,327 - INFO - POST https://10.10.14.206:9200/knowledge_base/_doc [status:400 duration:0.004s]
+Processing progress:   8%|▊         | 40/503 [00:01<00:17, 26.02 sentences/s]
+Traceback (most recent call last):
+  File "D:\dsWork\dsProject\dsRag\T2_ImportTxt.py", line 51, in <module>
+    process_file(file_path)
+  File "D:\dsWork\dsProject\dsRag\T2_ImportTxt.py", line 38, in process_file
+    save_to_es(sentence)
+  File "D:\dsWork\dsProject\dsRag\T2_ImportTxt.py", line 26, in save_to_es
+    es.index(index='knowledge_base', body=doc)
+  File "D:\anaconda3\envs\rag\lib\site-packages\elasticsearch\_sync\client\utils.py", line 415, in wrapped
+    return api(*args, **kwargs)
+  File "D:\anaconda3\envs\rag\lib\site-packages\elasticsearch\_sync\client\__init__.py", line 2951, in index
+    return self.perform_request(  # type: ignore[return-value]
+  File "D:\anaconda3\envs\rag\lib\site-packages\elasticsearch\_sync\client\_base.py", line 271, in perform_request
+    response = self._perform_request(
+  File "D:\anaconda3\envs\rag\lib\site-packages\elasticsearch\_sync\client\_base.py", line 351, in _perform_request
+    raise HTTP_EXCEPTIONS.get(meta.status, ApiError)(
+elasticsearch.BadRequestError: BadRequestError(400, 'document_parsing_exception', '[1:828] failed to parse: The [cosine] similarity does not support vectors with zero magnitude. Preview of invalid vector: [0.0, 0.0, 0.0, 0.0, 0.0, ...]', The [cosine] similarity does not support vectors with zero magnitude. Preview of invalid vector: [0.0, 0.0, 0.0, 0.0, 0.0, ...])
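The new log pins down the actual failure: the import run dies with a 400 document_parsing_exception because a cosine dense_vector field rejects zero-magnitude vectors. The commit's `if vector is None` guard in save_to_es only catches texts with no in-vocabulary words, and the 400 above shows an all-zero vector evidently still slips through. A standalone guard in the spirit of the numpy checks this commit removed (a sketch, not in the commit):

import numpy as np

def is_indexable(vector, eps=1e-6):
    """True only for vectors a cosine dense_vector field can score (non-zero magnitude)."""
    v = np.asarray(vector, dtype=np.float32)
    return v.size > 0 and float(np.linalg.norm(v)) > eps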

@@ -15,6 +15,13 @@ def text_to_embedding(text):
     words = jieba.lcut(text)  # segment with jieba
     logger.info(f"Text: {text}, segmentation result: {words}")
     embeddings = [model[word] for word in words if word in model]
+    if not embeddings:
+        logger.warning(f"Could not generate a valid vector for text '{text}'")
+        return None  # or raise ValueError("cannot generate a valid vector")
+
+    avg_embedding = sum(embeddings) / len(embeddings)
+    return avg_embedding
+
     logger.info(f"Number of valid word vectors: {len(embeddings)}")
     if embeddings:
         avg_embedding = sum(embeddings) / len(embeddings)
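Note that the added early return also leaves the pre-existing logger.info / if embeddings block below it unreachable. For context, the old run log above shows the model behind text_to_embedding being loaded via gensim's load_word2vec_format (a (10000, 200) float32 matrix from Tencent_AILab_ChineseEmbedding.txt). A sketch of the setup that log implies — the limit=10000 is inferred from the reported matrix shape, not shown in this diff:

from gensim.models import KeyedVectors

# Word2vec text format, 200-dim vectors; cap the vocabulary at the first 10000 entries.
model = KeyedVectors.load_word2vec_format(
    r"D:\Tencent_AILab_ChineseEmbedding\Tencent_AILab_ChineseEmbedding.txt",
    binary=False,
    limit=10000,
)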

@@ -1,113 +0,0 @@
-from elasticsearch import Elasticsearch
-from Config.Config import ES_CONFIG
-
-es = Elasticsearch(
-    hosts=ES_CONFIG["hosts"],
-    basic_auth=ES_CONFIG["basic_auth"],
-    verify_certs=ES_CONFIG["verify_certs"],
-    ssl_show_warn=ES_CONFIG["ssl_show_warn"]
-)
-
-def get_vector_mapping(dims=200):
-    """Get the mapping structure for the vector index"""
-    return {
-        "mappings": {
-            "properties": {
-                "text": {"type": "text", "analyzer": "ik_max_word"},
-                "vector": {
-                    "type": "dense_vector",
-                    "dims": dims,
-                    "index": True,
-                    "similarity": "cosine"
-                },
-                "timestamp": {"type": "date"}
-            }
-        }
-    }
-
-def create_vector_index(index_name="knowledge_base", dims=200):
-    """Create an index with a vector field"""
-    mapping = get_vector_mapping(dims)
-    try:
-        if es.indices.exists(index=index_name):
-            current_mapping = es.indices.get_mapping(index=index_name)
-            current_dims = current_mapping[index_name]["mappings"]["properties"]["vector"].get("dims", 0)
-            if current_dims == dims:
-                print(f"Index {index_name} already exists with the correct dimensions ({dims}); nothing to do")
-                return True
-            else:
-                print(f"Warning: index {index_name} exists but the dimensions do not match (current: {current_dims}, required: {dims})")
-                return False
-        es.indices.create(index=index_name, body=mapping)
-        print(f"Index {index_name} created successfully ({dims} dims)")
-        return True
-    except Exception as e:
-        print(f"Index operation failed: {str(e)}")
-        raise
-
-def delete_index(index_name):
-    """Delete an Elasticsearch index"""
-    try:
-        if es.indices.exists(index=index_name):
-            es.indices.delete(index=index_name)
-            print(f"Index {index_name} deleted successfully")
-            return True
-        else:
-            print(f"Index {index_name} does not exist")
-            return False
-    except Exception as e:
-        print(f"Failed to delete index: {str(e)}")
-        raise
-
-def create_text_index(index_name="raw_texts"):
-    """Create the raw-text index"""
-    mapping = {
-        "mappings": {
-            "properties": {
-                "text": {
-                    "type": "text",
-                    "fields": {
-                        "keyword": {
-                            "type": "keyword",
-                            "ignore_above": 256
-                        }
-                    }
-                },
-                "timestamp": {
-                    "type": "date"
-                }
-            }
-        }
-    }
-    try:
-        if not es.indices.exists(index=index_name):
-            es.indices.create(index=index_name, body=mapping)
-            print(f"Raw-text index {index_name} created successfully")
-            return True
-        else:
-            print(f"Raw-text index {index_name} already exists")
-            return False
-    except Exception as e:
-        print(f"Failed to create raw-text index: {str(e)}")
-        raise
-
-def delete_text_index(index_name="raw_texts"):
-    """Delete the raw-text index"""
-    try:
-        if es.indices.exists(index=index_name):
-            es.indices.delete(index=index_name)
-            print(f"Raw-text index {index_name} deleted successfully")
-            return True
-        else:
-            print(f"Raw-text index {index_name} does not exist")
-            return False
-    except Exception as e:
-        print(f"Failed to delete raw-text index: {str(e)}")
-        raise