main
HuangHai 1 month ago
parent ca984af410
commit 56e6db5e92

@ -1,7 +1,6 @@
from elasticsearch import Elasticsearch
from Config.Config import ES_CONFIG

# Initialize the ES connection
es = Elasticsearch(
@ -11,26 +10,81 @@ es = Elasticsearch(
    ssl_show_warn=ES_CONFIG["ssl_show_warn"]
)
def get_vector_mapping(dims=200):
    """Return the mapping for the vector index."""
    return {
        "properties": {
            "content": {
                "type": "text",
                "analyzer": "ik_smart",
                "search_analyzer": "ik_smart",
                "fields": {
                    "keyword": {
                        "type": "keyword",
                        "ignore_above": 8192
                    }
                }
            },
            "vector": {
                "type": "dense_vector",
                "dims": dims,
                "index": True,
                "similarity": "cosine"
            },
            "timestamp": {
                "type": "date",
                "format": "strict_date_optional_time||epoch_millis"
            }
        }
    }

def get_text_mapping():
    """Return the mapping for the raw-text index."""
    return {
        "properties": {
            "raw_text": {
                "type": "text",
                "analyzer": "ik_smart",
                "fielddata": True  # allow aggregations on long text
            },
            "timestamp": {
                "type": "date",
                "format": "strict_date_optional_time||epoch_millis"
            }
        }
    }
def manage_index(action, index_type="vector", index_name=None, dims=200):
    """Manage Elasticsearch indexes.

    :param action: 'create' or 'delete'
    :param index_type: 'vector' or 'text'
    :param index_name: index name (defaults according to the index type)
    :param dims: vector dimensions (vector indexes only)
    """
    if index_name is None:
        index_name = "knowledge_base" if index_type == "vector" else "raw_texts"
    if action == "create":
        mapping = get_vector_mapping(dims) if index_type == "vector" else get_text_mapping()
        try:
            if es.indices.exists(index=index_name):
                print(f"Index {index_name} already exists")
                return False
            es.indices.create(index=index_name, body={"mappings": mapping})
            print(f"Index {index_name} created (using the ik_smart analyzer)")
            return True
        except Exception as e:
            print(f"Failed to create index: {str(e)}")
            raise
    elif action == "delete":
        try:
            if not es.indices.exists(index=index_name):
                print(f"Index {index_name} does not exist")
                return False
            es.indices.delete(index=index_name)
            print(f"Index {index_name} deleted")
            return True
        except Exception as e:
            print(f"Failed to delete index: {str(e)}")
            raise
    else:
        raise ValueError("action must be 'create' or 'delete'")
@ -45,51 +99,3 @@ if __name__ == "__main__":
    # Create the new raw-text index
    manage_index("create", "text")

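A minimal usage sketch for the new manage_index entry point; the module name EsIndexUtil is an assumption, since the diff does not show this file's name:

# Sketch: exercising manage_index; adjust the import to the real module path.
from EsIndexUtil import manage_index

manage_index("create", index_type="vector", dims=200)  # creates knowledge_base
manage_index("create", index_type="text")              # creates raw_texts
manage_index("delete", index_type="text")              # drops raw_texts again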
@ -1,13 +1,21 @@
import logging
from Util.EmbeddingUtil import text_to_embedding
from Config.Config import ES_CONFIG
from elasticsearch import Elasticsearch
import re
from tqdm import tqdm
import datetime
import numpy as np

# Logger configuration
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

# Console handler with formatting
handler = logging.StreamHandler()
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
handler.setFormatter(formatter)
logger.addHandler(handler)
def split_sentences(text):
"""按句分割文本"""
@ -18,30 +26,24 @@ def split_sentences(text):
def save_to_es(text):
    """Save the vectorized text and the raw text to ES."""
    vector = text_to_embedding(text)
    # Skip text for which no vector could be generated
    if vector is None:
        logger.warning(f"Skipping text with no usable vector: {text}")
        return
    doc = {
        'text': text,
        'vector': vector,
        'timestamp': datetime.datetime.now().isoformat(),
        'analyzer': 'ik_smart'  # stored as a plain field; the analyzer itself is set in the mapping
    }
    try:
        es.index(index='knowledge_base', body=doc)
        es.index(index='raw_texts', body={'raw_text': text})
    except Exception as e:
        logger.error(f"Failed to save text to ES: {e}")
def process_file(file_path):
    """Process a text file."""
@ -64,8 +66,5 @@ if __name__ == '__main__':
        ssl_show_warn=ES_CONFIG['ssl_show_warn']
    )
    file_path = 'Txt/人口变化趋势对云南教育的影响.txt'
    process_file(file_path)

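The body of split_sentences is elided by the hunks above; a plausible sketch, assuming it splits on Chinese sentence-ending punctuation with the re module already imported at the top of the file (the actual implementation may differ):

import re

def split_sentences_sketch(text):
    # Hypothetical reconstruction: cut after 。!?(full- or half-width) and drop empties.
    parts = re.split(r'(?<=[。!?!?])', text)
    return [p.strip() for p in parts if p.strip()]

# split_sentences_sketch("人口在变化。教育随之调整!") -> ['人口在变化。', '教育随之调整!']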
@ -68,16 +68,14 @@ def process_query(query):
def search_related_data(query):
    """Search ES for data related to the query."""
    # Full-text search (replaces the former script_score vector search)
    vector_results = es.search(
        index=Config.ES_CONFIG['default_index'],
        body={
            "query": {
                "match": {
                    "content": {
                        "query": query,
                        "analyzer": "ik_smart"  # use the ik_smart analyzer explicitly
                    }
                }
            },

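Because the rewritten query pins the analyzer to ik_smart, it helps to check how that analyzer actually tokenizes a query when debugging relevance; a sketch against the standard _analyze API, assuming the es client from this file and an installed ik plugin:

# Sketch: inspect ik_smart tokenization of a query string.
resp = es.indices.analyze(analyzer="ik_smart", text="整理云南省初中在校生情况文档")
print([t["token"] for t in resp["tokens"]])
# The output may resemble the jieba segmentation in the log below: ['整理', '云南省', ...]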
@ -1,94 +1,18 @@
D:\anaconda3\envs\rag\python.exe D:\dsWork\dsProject\dsRag\T8_RAG.py
D:\anaconda3\envs\rag\lib\site-packages\jieba\_compat.py:18: UserWarning: pkg_resources is deprecated as an API. See https://setuptools.pypa.io/en/latest/pkg_resources.html. The pkg_resources package is slated for removal as early as 2025-11-30. Refrain from using this package or pin to Setuptools<81.
import pkg_resources
2025-06-23 20:13:47,913 - INFO - loading projection weights from D:\Tencent_AILab_ChineseEmbedding\Tencent_AILab_ChineseEmbedding.txt
2025-06-23 20:13:49,382 - INFO - KeyedVectors lifecycle event {'msg': 'loaded (10000, 200) matrix of type float32 from D:\\Tencent_AILab_ChineseEmbedding\\Tencent_AILab_ChineseEmbedding.txt', 'binary': False, 'encoding': 'utf8', 'datetime': '2025-06-23T20:13:49.361378', 'gensim': '4.3.3', 'python': '3.10.18 | packaged by conda-forge | (main, Jun 4 2025, 14:42:04) [MSC v.1943 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.19044-SP0', 'event': 'load_word2vec_format'}
2025-06-23 20:13:49,382 - INFO - Model loaded successfully, word-vector dimension: 200
D:\anaconda3\envs\rag\lib\site-packages\elasticsearch\_sync\client\__init__.py:311: SecurityWarning: Connecting to 'https://10.10.14.206:9200' using TLS with verify_certs=False is insecure
_transport = transport_class(
Searching for data related to '整理云南省初中在校生情况文档'...
Building prefix dict from the default dictionary ...
2025-06-23 20:13:50,520 - DEBUG - Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\ADMINI~1\AppData\Local\Temp\jieba.cache
2025-06-23 20:13:50,520 - DEBUG - Loading model from cache C:\Users\ADMINI~1\AppData\Local\Temp\jieba.cache
Loading model cost 0.648 seconds.
2025-06-23 20:13:51,168 - DEBUG - Loading model cost 0.648 seconds.
Prefix dict has been built successfully.
2025-06-23 20:13:51,169 - DEBUG - Prefix dict has been built successfully.
2025-06-23 20:13:51,169 - INFO - Text: 整理云南省初中在校生情况文档, segmentation: ['整理', '云南省', '初中', '在校生', '情况', '文档']
2025-06-23 20:13:51,169 - INFO - Valid word-vector count: 3
2025-06-23 20:13:51,169 - INFO - Generated average vector: [ 0.18687634 -0.19447033 -0.04333766 -0.05068566 0.09433967]...
D:\anaconda3\envs\rag\lib\site-packages\urllib3\connectionpool.py:1097: InsecureRequestWarning: Unverified HTTPS request is being made to host '10.10.14.206'. Adding certificate verification is strongly advised. See: https://urllib3.readthedocs.io/en/latest/advanced-usage.html#tls-warnings
warnings.warn(
2025-06-23 20:13:51,222 - INFO - POST https://10.10.14.206:9200/knowledge_base/_search [status:200 duration:0.051s]
D:\anaconda3\envs\rag\lib\site-packages\urllib3\connectionpool.py:1097: InsecureRequestWarning: Unverified HTTPS request is being made to host '10.10.14.206'. Adding certificate verification is strongly advised. See: https://urllib3.readthedocs.io/en/latest/advanced-usage.html#tls-warnings
warnings.warn(
2025-06-23 20:13:51,231 - INFO - POST https://10.10.14.206:9200/raw_texts/_search [status:200 duration:0.009s]
Found 10 related records
Generating the report...
2025-06-23 20:13:56,582 - INFO - Retrying request to /chat/completions in 0.469314 seconds
2025-06-23 20:14:02,067 - INFO - Retrying request to /chat/completions in 0.762535 seconds
Traceback (most recent call last):
File "D:\anaconda3\envs\rag\lib\site-packages\httpx\_transports\default.py", line 101, in map_httpcore_exceptions
yield
File "D:\anaconda3\envs\rag\lib\site-packages\httpx\_transports\default.py", line 250, in handle_request
resp = self._pool.handle_request(req)
File "D:\anaconda3\envs\rag\lib\site-packages\httpcore\_sync\connection_pool.py", line 256, in handle_request
raise exc from None
File "D:\anaconda3\envs\rag\lib\site-packages\httpcore\_sync\connection_pool.py", line 236, in handle_request
response = connection.handle_request(
File "D:\anaconda3\envs\rag\lib\site-packages\httpcore\_sync\connection.py", line 101, in handle_request
raise exc
File "D:\anaconda3\envs\rag\lib\site-packages\httpcore\_sync\connection.py", line 78, in handle_request
stream = self._connect(request)
File "D:\anaconda3\envs\rag\lib\site-packages\httpcore\_sync\connection.py", line 124, in _connect
stream = self._network_backend.connect_tcp(**kwargs)
File "D:\anaconda3\envs\rag\lib\site-packages\httpcore\_backends\sync.py", line 207, in connect_tcp
with map_exceptions(exc_map):
File "D:\anaconda3\envs\rag\lib\contextlib.py", line 153, in __exit__
self.gen.throw(typ, value, traceback)
File "D:\anaconda3\envs\rag\lib\site-packages\httpcore\_exceptions.py", line 14, in map_exceptions
raise to_exc(exc) from exc
httpcore.ConnectTimeout: timed out
The above exception was the direct cause of the following exception:
Traceback (most recent call last):
File "D:\anaconda3\envs\rag\lib\site-packages\openai\_base_client.py", line 972, in request
response = self._client.send(
File "D:\anaconda3\envs\rag\lib\site-packages\httpx\_client.py", line 914, in send
response = self._send_handling_auth(
File "D:\anaconda3\envs\rag\lib\site-packages\httpx\_client.py", line 942, in _send_handling_auth
response = self._send_handling_redirects(
File "D:\anaconda3\envs\rag\lib\site-packages\httpx\_client.py", line 979, in _send_handling_redirects
response = self._send_single_request(request)
File "D:\anaconda3\envs\rag\lib\site-packages\httpx\_client.py", line 1014, in _send_single_request
response = transport.handle_request(request)
File "D:\anaconda3\envs\rag\lib\site-packages\httpx\_transports\default.py", line 249, in handle_request
with map_httpcore_exceptions():
File "D:\anaconda3\envs\rag\lib\contextlib.py", line 153, in __exit__
self.gen.throw(typ, value, traceback)
File "D:\anaconda3\envs\rag\lib\site-packages\httpx\_transports\default.py", line 118, in map_httpcore_exceptions
raise mapped_exc(message) from exc
httpx.ConnectTimeout: timed out
The above exception was the direct cause of the following exception:
Traceback (most recent call last):
File "D:\dsWork\dsProject\dsRag\T8_RAG.py", line 98, in <module>
report = process_query(user_query)
File "D:\dsWork\dsProject\dsRag\T8_RAG.py", line 91, in process_query
report = generate_report(query, context)
File "D:\dsWork\dsProject\dsRag\T8_RAG.py", line 73, in generate_report
response = client.chat.completions.create(
File "D:\anaconda3\envs\rag\lib\site-packages\openai\_utils\_utils.py", line 287, in wrapper
return func(*args, **kwargs)
File "D:\anaconda3\envs\rag\lib\site-packages\openai\resources\chat\completions\completions.py", line 925, in create
return self._post(
File "D:\anaconda3\envs\rag\lib\site-packages\openai\_base_client.py", line 1249, in post
return cast(ResponseT, self.request(cast_to, opts, stream=stream, stream_cls=stream_cls))
File "D:\anaconda3\envs\rag\lib\site-packages\openai\_base_client.py", line 990, in request
raise APITimeoutError(request=request) from err
openai.APITimeoutError: Request timed out.
Process finished with exit code 1

2025-06-23 21:08:46,327 - INFO - POST https://10.10.14.206:9200/knowledge_base/_doc [status:400 duration:0.004s]
Progress: 8%|▊ | 40/503 [00:01<00:17, 26.02 sentences/s]
Traceback (most recent call last):
File "D:\dsWork\dsProject\dsRag\T2_ImportTxt.py", line 51, in <module>
process_file(file_path)
File "D:\dsWork\dsProject\dsRag\T2_ImportTxt.py", line 38, in process_file
save_to_es(sentence)
File "D:\dsWork\dsProject\dsRag\T2_ImportTxt.py", line 26, in save_to_es
es.index(index='knowledge_base', body=doc)
File "D:\anaconda3\envs\rag\lib\site-packages\elasticsearch\_sync\client\utils.py", line 415, in wrapped
return api(*args, **kwargs)
File "D:\anaconda3\envs\rag\lib\site-packages\elasticsearch\_sync\client\__init__.py", line 2951, in index
return self.perform_request( # type: ignore[return-value]
File "D:\anaconda3\envs\rag\lib\site-packages\elasticsearch\_sync\client\_base.py", line 271, in perform_request
response = self._perform_request(
File "D:\anaconda3\envs\rag\lib\site-packages\elasticsearch\_sync\client\_base.py", line 351, in _perform_request
raise HTTP_EXCEPTIONS.get(meta.status, ApiError)(
elasticsearch.BadRequestError: BadRequestError(400, 'document_parsing_exception', '[1:828] failed to parse: The [cosine] similarity does not support vectors with zero magnitude. Preview of invalid vector: [0.0, 0.0, 0.0, 0.0, 0.0, ...]', The [cosine] similarity does not support vectors with zero magnitude. Preview of invalid vector: [0.0, 0.0, 0.0, 0.0, 0.0, ...])
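This BadRequestError is the failure that motivated the changes above: ES refuses to index an all-zero vector into a cosine dense_vector field, which is why text_to_embedding now returns None for texts with no usable word vectors. If a stricter guard is ever needed at the indexing boundary, a minimal sketch, assuming numpy arrays:

import numpy as np

def safe_cosine_vector(vector, eps=1e-6):
    # Cosine similarity cannot handle zero-magnitude vectors, so reject them here.
    if vector is None or np.all(np.abs(vector) < eps):
        return None
    # Normalizing is optional for ES but keeps magnitudes predictable.
    v = np.asarray(vector, dtype=float)
    return (v / np.linalg.norm(v)).tolist()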

@ -15,6 +15,13 @@ def text_to_embedding(text):
    words = jieba.lcut(text)  # segment the text with jieba
    logger.info(f"Text: {text}, segmentation: {words}")
    embeddings = [model[word] for word in words if word in model]
    logger.info(f"Valid word-vector count: {len(embeddings)}")
    if not embeddings:
        logger.warning(f"No valid vector could be generated for text '{text}'")
        return None  # or raise ValueError("no valid vector could be generated")
    avg_embedding = sum(embeddings) / len(embeddings)
    return avg_embedding

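A quick sanity check of the new early-return path, run in the context of Util/EmbeddingUtil (the out-of-vocabulary input is illustrative):

# Sketch: all-OOV input now yields None instead of a crash at index time.
vec = text_to_embedding("qwertyuiop")  # likely has no words in the Tencent embedding
if vec is None:
    print("skipped: no valid word vectors")
else:
    print(len(vec))  # 200, per the word-vector dimension in the log above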
@ -1,113 +0,0 @@
from elasticsearch import Elasticsearch
from Config.Config import ES_CONFIG

es = Elasticsearch(
    hosts=ES_CONFIG["hosts"],
    basic_auth=ES_CONFIG["basic_auth"],
    verify_certs=ES_CONFIG["verify_certs"],
    ssl_show_warn=ES_CONFIG["ssl_show_warn"]
)

def get_vector_mapping(dims=200):
    """Return the mapping for the vector index."""
    return {
        "mappings": {
            "properties": {
                "text": {"type": "text", "analyzer": "ik_max_word"},
                "vector": {
                    "type": "dense_vector",
                    "dims": dims,
                    "index": True,
                    "similarity": "cosine"
                },
                "timestamp": {"type": "date"}
            }
        }
    }
def create_vector_index(index_name="knowledge_base", dims=200):
    """Create an index with a dense-vector field."""
    mapping = get_vector_mapping(dims)
    try:
        if es.indices.exists(index=index_name):
            current_mapping = es.indices.get_mapping(index=index_name)
            current_dims = current_mapping[index_name]["mappings"]["properties"]["vector"].get("dims", 0)
            if current_dims == dims:
                print(f"Index {index_name} already exists with the correct dimension ({dims}); nothing to do")
                return True
            else:
                print(f"Warning: index {index_name} exists but its dimension does not match (current: {current_dims}, required: {dims})")
                return False
        es.indices.create(index=index_name, body=mapping)
        print(f"Index {index_name} created ({dims} dims)")
        return True
    except Exception as e:
        print(f"Index operation failed: {str(e)}")
        raise

def delete_index(index_name):
    """Delete an Elasticsearch index."""
    try:
        if es.indices.exists(index=index_name):
            es.indices.delete(index=index_name)
            print(f"Index {index_name} deleted")
            return True
        else:
            print(f"Index {index_name} does not exist")
            return False
    except Exception as e:
        print(f"Failed to delete index: {str(e)}")
        raise
def create_text_index(index_name="raw_texts"):
    """Create the raw-text index."""
    mapping = {
        "mappings": {
            "properties": {
                "text": {
                    "type": "text",
                    "fields": {
                        "keyword": {
                            "type": "keyword",
                            "ignore_above": 256
                        }
                    }
                },
                "timestamp": {
                    "type": "date"
                }
            }
        }
    }
    try:
        if not es.indices.exists(index=index_name):
            es.indices.create(index=index_name, body=mapping)
            print(f"Raw-text index {index_name} created")
            return True
        else:
            print(f"Raw-text index {index_name} already exists")
            return False
    except Exception as e:
        print(f"Failed to create raw-text index: {str(e)}")
        raise

def delete_text_index(index_name="raw_texts"):
    """Delete the raw-text index."""
    try:
        if es.indices.exists(index=index_name):
            es.indices.delete(index=index_name)
            print(f"Raw-text index {index_name} deleted")
            return True
        else:
            print(f"Raw-text index {index_name} does not exist")
            return False
    except Exception as e:
        print(f"Failed to delete raw-text index: {str(e)}")
        raise