main
HuangHai 1 month ago
parent ca984af410
commit 56e6db5e92

@@ -1,7 +1,6 @@
 from elasticsearch import Elasticsearch
 from Config.Config import ES_CONFIG
-from Util.EsMappingUtil import create_vector_index, delete_index, create_text_index, delete_text_index

 # Initialize the ES connection
 es = Elasticsearch(
@@ -11,65 +10,24 @@ es = Elasticsearch(
     ssl_show_warn=ES_CONFIG["ssl_show_warn"]
 )

-def manage_index(action, index_type="vector", index_name=None, dims=200):
-    """Manage Elasticsearch indexes
-    :param action: 'create' or 'delete'
-    :param index_type: 'vector' or 'text'
-    :param index_name: index name (generated from the type by default)
-    :param dims: vector dimensions (vector indexes only)
-    """
-    if index_name is None:
-        index_name = "knowledge_base" if index_type == "vector" else "raw_texts"
-    if action == "create":
-        if index_type == "vector":
-            return create_vector_index(index_name, dims)
-        else:
-            return create_text_index(index_name)
-    elif action == "delete":
-        if index_type == "vector":
-            return delete_index(index_name)
-        else:
-            return delete_text_index(index_name)
-    else:
-        raise ValueError("The action parameter must be 'create' or 'delete'")
-
-# Usage example
-if __name__ == "__main__":
-    # Delete existing indexes first (if they exist)
-    manage_index("delete", "vector")
-    manage_index("delete", "text")
-    # Create a new vector index
-    manage_index("create", "vector", dims=200)
-    # Create a new raw-text index
-    manage_index("create", "text")
-
-# Updated mapping for the knowledge_base index
-knowledge_base_mapping = {
+def get_vector_mapping(dims=200):
+    """Get the mapping structure for the vector index"""
+    return {
         "properties": {
-            # Added to knowledge_base_mapping
             "content": {
                 "type": "text",
-                "analyzer": "ik_max_word",
+                "analyzer": "ik_smart",
                 "search_analyzer": "ik_smart",
                 "fields": {
                     "keyword": {
                         "type": "keyword",
-                        "ignore_above": 8192  # can be set to 1024/2048 or other larger values
+                        "ignore_above": 8192
                     }
                 }
             },
-            # Added to raw_texts_mapping
-            "raw_text": {
-                "type": "text",
-                "analyzer": "ik_max_word",
-                "fielddata": True  # allow aggregations on long text
-            },
             "vector": {
                 "type": "dense_vector",
-                "dims": 200,
+                "dims": dims,
                 "index": True,
                 "similarity": "cosine"
             },
@@ -78,18 +36,66 @@ knowledge_base_mapping = {
                 "format": "strict_date_optional_time||epoch_millis"
             }
         }
     }
-
-# Updated mapping for the raw_texts index
-raw_texts_mapping = {
+
+def get_text_mapping():
+    """Get the mapping structure for the text index"""
+    return {
         "properties": {
             "raw_text": {
                 "type": "text",
-                "analyzer": "ik_max_word"
+                "analyzer": "ik_smart",
+                "fielddata": True
             },
             "timestamp": {
                 "type": "date",
                 "format": "strict_date_optional_time||epoch_millis"
             }
         }
     }
+
+def manage_index(action, index_type="vector", index_name=None, dims=200):
+    """Manage Elasticsearch indexes"""
+    if index_name is None:
+        index_name = "knowledge_base" if index_type == "vector" else "raw_texts"
+    if action == "create":
+        mapping = get_vector_mapping(dims) if index_type == "vector" else get_text_mapping()
+        try:
+            if es.indices.exists(index=index_name):
+                print(f"Index {index_name} already exists")
+                return False
+            es.indices.create(index=index_name, body={"mappings": mapping})
+            print(f"Index {index_name} created successfully (using the ik_smart analyzer)")
+            return True
+        except Exception as e:
+            print(f"Failed to create index: {str(e)}")
+            raise
+    elif action == "delete":
+        try:
+            if not es.indices.exists(index=index_name):
+                print(f"Index {index_name} does not exist")
+                return False
+            es.indices.delete(index=index_name)
+            print(f"Index {index_name} deleted successfully")
+            return True
+        except Exception as e:
+            print(f"Failed to delete index: {str(e)}")
+            raise
+    else:
+        raise ValueError("The action parameter must be 'create' or 'delete'")
+
+# Usage example
+if __name__ == "__main__":
+    # Delete existing indexes first (if they exist)
+    manage_index("delete", "vector")
+    manage_index("delete", "text")
+    # Create a new vector index
+    manage_index("create", "vector", dims=200)
+    # Create a new raw-text index
+    manage_index("create", "text")

@@ -1,13 +1,21 @@
-import logging
-from Util.EsMappingUtil import create_vector_index, create_text_index
 from Util.EmbeddingUtil import text_to_embedding  # updated import
 from Config.Config import ES_CONFIG
 from elasticsearch import Elasticsearch
 import re
 from tqdm import tqdm
 import datetime
-import numpy as np
+import logging
+
+# Configure the logger at the top of the file
+logger = logging.getLogger(__name__)
+logger.setLevel(logging.INFO)
+
+# Create a console handler and set its format
+handler = logging.StreamHandler()
+formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
+handler.setFormatter(formatter)
+logger.addHandler(handler)

 def split_sentences(text):
     """Split text into sentences"""
@@ -18,30 +26,24 @@ def split_sentences(text):

 def save_to_es(text):
     """Save the vectorized text and the raw text to ES"""
-    vector = text_to_embedding(text)  # updated function call
-    # Check whether the vector is valid
-    if vector is None or (hasattr(vector, 'size')) and vector.size == 0:
-        logging.warning(f"Skipping text with an invalid vector: {text}")
-        return None
-    # Check whether the vector is all zeros or near zero
-    if np.all(np.abs(vector) < 1e-6):
-        logging.warning(f"Skipping zero-vector text: {text}")
-        return None
-    # Normalize the vector to avoid cosine-similarity issues
-    norm = np.linalg.norm(vector)
-    if norm > 0:
-        vector = vector / norm
+    vector = text_to_embedding(text)
+    if vector is None:
+        logger.warning(f"Skipping text for which no vector could be generated: {text}")
+        return
     doc = {
         'text': text,
-        'vector': vector.tolist(),
-        'timestamp': datetime.datetime.now().isoformat()
+        'vector': vector,
+        'timestamp': datetime.datetime.now().isoformat(),
+        'analyzer': 'ik_smart'
     }
-    es.index(index='knowledge_base', body=doc)
-    es.index(index='raw_texts', body={'text': text})
+    try:
+        es.index(index='knowledge_base', body=doc)
+        es.index(index='raw_texts', body={'raw_text': text})
+    except Exception as e:
+        logger.error(f"Failed to save text to ES: {e}")

 def process_file(file_path):
     """Process a text file"""
@@ -64,8 +66,5 @@ if __name__ == '__main__':
         ssl_show_warn=ES_CONFIG['ssl_show_warn']
     )
-    create_vector_index()
-    create_text_index()
     file_path = 'Txt/人口变化趋势对云南教育的影响.txt'
     process_file(file_path)
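Side note: save_to_es issues two synchronous es.index() calls per sentence, which lines up with the roughly 26 sentences/s visible in the run log further down. If throughput matters, elasticsearch.helpers.bulk can batch the writes; a hedged sketch under the same skip logic (save_batch is illustrative, not part of this commit):

from elasticsearch.helpers import bulk

def save_batch(sentences):
    """Batch-index sentences instead of two es.index() calls per sentence."""
    actions = []
    for s in sentences:
        vec = text_to_embedding(s)
        if vec is None:
            continue  # same skip logic as save_to_es above
        ts = datetime.datetime.now().isoformat()
        actions.append({"_index": "knowledge_base",
                        "_source": {"text": s, "vector": vec, "timestamp": ts}})
        actions.append({"_index": "raw_texts", "_source": {"raw_text": s}})
    if actions:
        bulk(es, actions)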

@@ -68,16 +68,14 @@ def process_query(query):

 def search_related_data(query):
     """Search for data related to the query"""
     # Vector search
-    query_vector = text_to_embedding(query)
     vector_results = es.search(
         index=Config.ES_CONFIG['default_index'],
         body={
             "query": {
-                "script_score": {
-                    "query": {"match_all": {}},
-                    "script": {
-                        "source": "cosineSimilarity(params.query_vector, 'vector') + 1.0",
-                        "params": {"query_vector": query_vector}
+                "match": {
+                    "content": {
+                        "query": query,
+                        "analyzer": "ik_smart"  # specify the analyzer
                     }
                 }
             },
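Reviewer note: this hunk replaces the script_score cosine ranking with a plain BM25 match on the content field, so despite the variable still being named vector_results, the search is now lexical rather than vector-based. If both signals are ever wanted, the removed pattern can sit alongside the match clause in a bool query — a hedged sketch reusing the commit's own field names (and assuming query_vector = text_to_embedding(query) is restored), not something this commit contains:

body = {
    "query": {
        "bool": {
            "should": [
                {"match": {"content": {"query": query, "analyzer": "ik_smart"}}},
                {"script_score": {
                    "query": {"match_all": {}},
                    "script": {
                        "source": "cosineSimilarity(params.query_vector, 'vector') + 1.0",
                        "params": {"query_vector": query_vector},
                    },
                }},
            ]
        }
    }
}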

@@ -1,94 +1,18 @@
-D:\anaconda3\envs\rag\python.exe D:\dsWork\dsProject\dsRag\T8_RAG.py
-D:\anaconda3\envs\rag\lib\site-packages\jieba\_compat.py:18: UserWarning: pkg_resources is deprecated as an API. See https://setuptools.pypa.io/en/latest/pkg_resources.html. The pkg_resources package is slated for removal as early as 2025-11-30. Refrain from using this package or pin to Setuptools<81.
-  import pkg_resources
-2025-06-23 20:13:47,913 - INFO - loading projection weights from D:\Tencent_AILab_ChineseEmbedding\Tencent_AILab_ChineseEmbedding.txt
-2025-06-23 20:13:49,382 - INFO - KeyedVectors lifecycle event {'msg': 'loaded (10000, 200) matrix of type float32 from D:\\Tencent_AILab_ChineseEmbedding\\Tencent_AILab_ChineseEmbedding.txt', 'binary': False, 'encoding': 'utf8', 'datetime': '2025-06-23T20:13:49.361378', 'gensim': '4.3.3', 'python': '3.10.18 | packaged by conda-forge | (main, Jun 4 2025, 14:42:04) [MSC v.1943 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.19044-SP0', 'event': 'load_word2vec_format'}
-2025-06-23 20:13:49,382 - INFO - Model loaded successfully, word-vector dimensions: 200
-D:\anaconda3\envs\rag\lib\site-packages\elasticsearch\_sync\client\__init__.py:311: SecurityWarning: Connecting to 'https://10.10.14.206:9200' using TLS with verify_certs=False is insecure
-  _transport = transport_class(
-Searching for data related to '整理云南省初中在校生情况文档'...
-Building prefix dict from the default dictionary ...
-2025-06-23 20:13:50,520 - DEBUG - Building prefix dict from the default dictionary ...
-Loading model from cache C:\Users\ADMINI~1\AppData\Local\Temp\jieba.cache
-2025-06-23 20:13:50,520 - DEBUG - Loading model from cache C:\Users\ADMINI~1\AppData\Local\Temp\jieba.cache
-Loading model cost 0.648 seconds.
-2025-06-23 20:13:51,168 - DEBUG - Loading model cost 0.648 seconds.
-Prefix dict has been built successfully.
-2025-06-23 20:13:51,169 - DEBUG - Prefix dict has been built successfully.
-2025-06-23 20:13:51,169 - INFO - Text: 整理云南省初中在校生情况文档, segmentation result: ['整理', '云南省', '初中', '在校生', '情况', '文档']
-2025-06-23 20:13:51,169 - INFO - Number of valid word vectors: 3
-2025-06-23 20:13:51,169 - INFO - Generated average vector: [ 0.18687634 -0.19447033 -0.04333766 -0.05068566 0.09433967]...
-D:\anaconda3\envs\rag\lib\site-packages\urllib3\connectionpool.py:1097: InsecureRequestWarning: Unverified HTTPS request is being made to host '10.10.14.206'. Adding certificate verification is strongly advised. See: https://urllib3.readthedocs.io/en/latest/advanced-usage.html#tls-warnings
-  warnings.warn(
-2025-06-23 20:13:51,222 - INFO - POST https://10.10.14.206:9200/knowledge_base/_search [status:200 duration:0.051s]
-D:\anaconda3\envs\rag\lib\site-packages\urllib3\connectionpool.py:1097: InsecureRequestWarning: Unverified HTTPS request is being made to host '10.10.14.206'. Adding certificate verification is strongly advised. See: https://urllib3.readthedocs.io/en/latest/advanced-usage.html#tls-warnings
-  warnings.warn(
-2025-06-23 20:13:51,231 - INFO - POST https://10.10.14.206:9200/raw_texts/_search [status:200 duration:0.009s]
-Found 10 related records
-Generating report...
-2025-06-23 20:13:56,582 - INFO - Retrying request to /chat/completions in 0.469314 seconds
-2025-06-23 20:14:02,067 - INFO - Retrying request to /chat/completions in 0.762535 seconds
-Traceback (most recent call last):
-  File "D:\anaconda3\envs\rag\lib\site-packages\httpx\_transports\default.py", line 101, in map_httpcore_exceptions
-    yield
-  File "D:\anaconda3\envs\rag\lib\site-packages\httpx\_transports\default.py", line 250, in handle_request
-    resp = self._pool.handle_request(req)
-  File "D:\anaconda3\envs\rag\lib\site-packages\httpcore\_sync\connection_pool.py", line 256, in handle_request
-    raise exc from None
-  File "D:\anaconda3\envs\rag\lib\site-packages\httpcore\_sync\connection_pool.py", line 236, in handle_request
-    response = connection.handle_request(
-  File "D:\anaconda3\envs\rag\lib\site-packages\httpcore\_sync\connection.py", line 101, in handle_request
-    raise exc
-  File "D:\anaconda3\envs\rag\lib\site-packages\httpcore\_sync\connection.py", line 78, in handle_request
-    stream = self._connect(request)
-  File "D:\anaconda3\envs\rag\lib\site-packages\httpcore\_sync\connection.py", line 124, in _connect
-    stream = self._network_backend.connect_tcp(**kwargs)
-  File "D:\anaconda3\envs\rag\lib\site-packages\httpcore\_backends\sync.py", line 207, in connect_tcp
-    with map_exceptions(exc_map):
-  File "D:\anaconda3\envs\rag\lib\contextlib.py", line 153, in __exit__
-    self.gen.throw(typ, value, traceback)
-  File "D:\anaconda3\envs\rag\lib\site-packages\httpcore\_exceptions.py", line 14, in map_exceptions
-    raise to_exc(exc) from exc
-httpcore.ConnectTimeout: timed out
-
-The above exception was the direct cause of the following exception:
-
-Traceback (most recent call last):
-  File "D:\anaconda3\envs\rag\lib\site-packages\openai\_base_client.py", line 972, in request
-    response = self._client.send(
-  File "D:\anaconda3\envs\rag\lib\site-packages\httpx\_client.py", line 914, in send
-    response = self._send_handling_auth(
-  File "D:\anaconda3\envs\rag\lib\site-packages\httpx\_client.py", line 942, in _send_handling_auth
-    response = self._send_handling_redirects(
-  File "D:\anaconda3\envs\rag\lib\site-packages\httpx\_client.py", line 979, in _send_handling_redirects
-    response = self._send_single_request(request)
-  File "D:\anaconda3\envs\rag\lib\site-packages\httpx\_client.py", line 1014, in _send_single_request
-    response = transport.handle_request(request)
-  File "D:\anaconda3\envs\rag\lib\site-packages\httpx\_transports\default.py", line 249, in handle_request
-    with map_httpcore_exceptions():
-  File "D:\anaconda3\envs\rag\lib\contextlib.py", line 153, in __exit__
-    self.gen.throw(typ, value, traceback)
-  File "D:\anaconda3\envs\rag\lib\site-packages\httpx\_transports\default.py", line 118, in map_httpcore_exceptions
-    raise mapped_exc(message) from exc
-httpx.ConnectTimeout: timed out
-
-The above exception was the direct cause of the following exception:
-
-Traceback (most recent call last):
-  File "D:\dsWork\dsProject\dsRag\T8_RAG.py", line 98, in <module>
-    report = process_query(user_query)
-  File "D:\dsWork\dsProject\dsRag\T8_RAG.py", line 91, in process_query
-    report = generate_report(query, context)
-  File "D:\dsWork\dsProject\dsRag\T8_RAG.py", line 73, in generate_report
-    response = client.chat.completions.create(
-  File "D:\anaconda3\envs\rag\lib\site-packages\openai\_utils\_utils.py", line 287, in wrapper
-    return func(*args, **kwargs)
-  File "D:\anaconda3\envs\rag\lib\site-packages\openai\resources\chat\completions\completions.py", line 925, in create
-    return self._post(
-  File "D:\anaconda3\envs\rag\lib\site-packages\openai\_base_client.py", line 1249, in post
-    return cast(ResponseT, self.request(cast_to, opts, stream=stream, stream_cls=stream_cls))
-  File "D:\anaconda3\envs\rag\lib\site-packages\openai\_base_client.py", line 990, in request
-    raise APITimeoutError(request=request) from err
-openai.APITimeoutError: Request timed out.
-
-Process finished with exit code 1
+2025-06-23 21:08:46,327 - INFO - POST https://10.10.14.206:9200/knowledge_base/_doc [status:400 duration:0.004s]
+Processing progress:   8%|▊         | 40/503 [00:01<00:17, 26.02 sentences/s]
+Traceback (most recent call last):
+  File "D:\dsWork\dsProject\dsRag\T2_ImportTxt.py", line 51, in <module>
+    process_file(file_path)
+  File "D:\dsWork\dsProject\dsRag\T2_ImportTxt.py", line 38, in process_file
+    save_to_es(sentence)
+  File "D:\dsWork\dsProject\dsRag\T2_ImportTxt.py", line 26, in save_to_es
+    es.index(index='knowledge_base', body=doc)
+  File "D:\anaconda3\envs\rag\lib\site-packages\elasticsearch\_sync\client\utils.py", line 415, in wrapped
+    return api(*args, **kwargs)
+  File "D:\anaconda3\envs\rag\lib\site-packages\elasticsearch\_sync\client\__init__.py", line 2951, in index
+    return self.perform_request(  # type: ignore[return-value]
+  File "D:\anaconda3\envs\rag\lib\site-packages\elasticsearch\_sync\client\_base.py", line 271, in perform_request
+    response = self._perform_request(
+  File "D:\anaconda3\envs\rag\lib\site-packages\elasticsearch\_sync\client\_base.py", line 351, in _perform_request
+    raise HTTP_EXCEPTIONS.get(meta.status, ApiError)(
+elasticsearch.BadRequestError: BadRequestError(400, 'document_parsing_exception', '[1:828] failed to parse: The [cosine] similarity does not support vectors with zero magnitude. Preview of invalid vector: [0.0, 0.0, 0.0, 0.0, 0.0, ...]', The [cosine] similarity does not support vectors with zero magnitude. Preview of invalid vector: [0.0, 0.0, 0.0, 0.0, 0.0, ...])
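The new log pins down the actual failure: the import run dies with a 400 document_parsing_exception because a cosine dense_vector field rejects zero-magnitude vectors. The commit's `if vector is None` guard in save_to_es only catches texts with no in-vocabulary words, and the 400 above shows an all-zero vector evidently still slips through. A standalone guard in the spirit of the numpy checks this commit removed (a sketch, not in the commit):

import numpy as np

def is_indexable(vector, eps=1e-6):
    """True only for vectors a cosine dense_vector field can score (non-zero magnitude)."""
    v = np.asarray(vector, dtype=np.float32)
    return v.size > 0 and float(np.linalg.norm(v)) > eps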

@@ -15,6 +15,13 @@ def text_to_embedding(text):
     words = jieba.lcut(text)  # segment with jieba
     logger.info(f"Text: {text}, segmentation result: {words}")
     embeddings = [model[word] for word in words if word in model]
+    if not embeddings:
+        logger.warning(f"Could not generate a valid vector for text '{text}'")
+        return None  # or raise ValueError("cannot generate a valid vector")
+
+    avg_embedding = sum(embeddings) / len(embeddings)
+    return avg_embedding
+
     logger.info(f"Number of valid word vectors: {len(embeddings)}")
     if embeddings:
         avg_embedding = sum(embeddings) / len(embeddings)
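Note that the added early return also leaves the pre-existing logger.info / if embeddings block below it unreachable. For context, the old run log above shows the model behind text_to_embedding being loaded via gensim's load_word2vec_format (a (10000, 200) float32 matrix from Tencent_AILab_ChineseEmbedding.txt). A sketch of the setup that log implies — the limit=10000 is inferred from the reported matrix shape, not shown in this diff:

from gensim.models import KeyedVectors

# Word2vec text format, 200-dim vectors; cap the vocabulary at the first 10000 entries.
model = KeyedVectors.load_word2vec_format(
    r"D:\Tencent_AILab_ChineseEmbedding\Tencent_AILab_ChineseEmbedding.txt",
    binary=False,
    limit=10000,
)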

@@ -1,113 +0,0 @@
-from elasticsearch import Elasticsearch
-from Config.Config import ES_CONFIG
-
-es = Elasticsearch(
-    hosts=ES_CONFIG["hosts"],
-    basic_auth=ES_CONFIG["basic_auth"],
-    verify_certs=ES_CONFIG["verify_certs"],
-    ssl_show_warn=ES_CONFIG["ssl_show_warn"]
-)
-
-def get_vector_mapping(dims=200):
-    """Get the mapping structure for the vector index"""
-    return {
-        "mappings": {
-            "properties": {
-                "text": {"type": "text", "analyzer": "ik_max_word"},
-                "vector": {
-                    "type": "dense_vector",
-                    "dims": dims,
-                    "index": True,
-                    "similarity": "cosine"
-                },
-                "timestamp": {"type": "date"}
-            }
-        }
-    }
-
-def create_vector_index(index_name="knowledge_base", dims=200):
-    """Create an index with a vector field"""
-    mapping = get_vector_mapping(dims)
-    try:
-        if es.indices.exists(index=index_name):
-            current_mapping = es.indices.get_mapping(index=index_name)
-            current_dims = current_mapping[index_name]["mappings"]["properties"]["vector"].get("dims", 0)
-            if current_dims == dims:
-                print(f"Index {index_name} already exists with the correct dimensions ({dims}); nothing to do")
-                return True
-            else:
-                print(f"Warning: index {index_name} exists but the dimensions do not match (current: {current_dims}, required: {dims})")
-                return False
-        es.indices.create(index=index_name, body=mapping)
-        print(f"Index {index_name} created successfully ({dims} dims)")
-        return True
-    except Exception as e:
-        print(f"Index operation failed: {str(e)}")
-        raise
-
-def delete_index(index_name):
-    """Delete an Elasticsearch index"""
-    try:
-        if es.indices.exists(index=index_name):
-            es.indices.delete(index=index_name)
-            print(f"Index {index_name} deleted successfully")
-            return True
-        else:
-            print(f"Index {index_name} does not exist")
-            return False
-    except Exception as e:
-        print(f"Failed to delete index: {str(e)}")
-        raise
-
-def create_text_index(index_name="raw_texts"):
-    """Create the raw-text index"""
-    mapping = {
-        "mappings": {
-            "properties": {
-                "text": {
-                    "type": "text",
-                    "fields": {
-                        "keyword": {
-                            "type": "keyword",
-                            "ignore_above": 256
-                        }
-                    }
-                },
-                "timestamp": {
-                    "type": "date"
-                }
-            }
-        }
-    }
-    try:
-        if not es.indices.exists(index=index_name):
-            es.indices.create(index=index_name, body=mapping)
-            print(f"Raw-text index {index_name} created successfully")
-            return True
-        else:
-            print(f"Raw-text index {index_name} already exists")
-            return False
-    except Exception as e:
-        print(f"Failed to create raw-text index: {str(e)}")
-        raise
-
-def delete_text_index(index_name="raw_texts"):
-    """Delete the raw-text index"""
-    try:
-        if es.indices.exists(index=index_name):
-            es.indices.delete(index=index_name)
-            print(f"Raw-text index {index_name} deleted successfully")
-            return True
-        else:
-            print(f"Raw-text index {index_name} does not exist")
-            return False
-    except Exception as e:
-        print(f"Failed to delete raw-text index: {str(e)}")
-        raise