parent c024386037
commit 7ff869e4a5
@@ -1,100 +0,0 @@
from elasticsearch import Elasticsearch

from Config.Config import ES_CONFIG

# Initialize the ES connection
es = Elasticsearch(
    hosts=ES_CONFIG["hosts"],
    basic_auth=ES_CONFIG["basic_auth"],
    verify_certs=ES_CONFIG["verify_certs"],
    ssl_show_warn=ES_CONFIG["ssl_show_warn"]
)


def get_vector_mapping(dims=200):
    """Return the mapping for the vector index."""
    return {
        "properties": {
            "content": {
                "type": "text",
                "analyzer": "ik_smart",
                "search_analyzer": "ik_smart",
                "fields": {
                    "keyword": {
                        "type": "keyword",
                        "ignore_above": 8192
                    }
                }
            },
            "vector": {
                "type": "dense_vector",
                "dims": dims,
                "index": True,
                "similarity": "cosine"
            },
            "timestamp": {
                "type": "date",
                "format": "strict_date_optional_time||epoch_millis"
            }
        }
    }
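
# Not part of the original file: a minimal sketch of how an index built with
# this mapping could be searched, since `index: True` + `similarity: cosine`
# enable approximate kNN. It assumes the default index name used below
# ("knowledge_base") and an elasticsearch-py 8.x client, where `es.search`
# accepts a top-level `knn` option.
def knn_search_sketch(query_vector, k=5):
    """Find the k documents whose vectors are most cosine-similar to query_vector."""
    return es.search(
        index="knowledge_base",
        knn={
            "field": "vector",
            "query_vector": query_vector,  # a list of `dims` floats
            "k": k,
            "num_candidates": 50
        },
        source=["content", "timestamp"]
    )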


def get_text_mapping():
    """Return the mapping for the raw-text index."""
    return {
        "properties": {
            "raw_text": {
                "type": "text",
                "analyzer": "ik_smart",
                "fielddata": True
            },
            "timestamp": {
                "type": "date",
                "format": "strict_date_optional_time||epoch_millis"
            }
        }
    }
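
# Not part of the original file: `fielddata: True` above makes the analyzed
# tokens of `raw_text` available for sorting and aggregations, at a memory
# cost. A hedged sketch of a terms aggregation this enables, assuming the
# default index name "raw_texts":
def top_tokens_sketch(size=10):
    """Count the most frequent ik_smart tokens across all raw texts."""
    return es.search(
        index="raw_texts",
        size=0,  # aggregation only, no hits
        aggs={"top_tokens": {"terms": {"field": "raw_text", "size": size}}}
    )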


def manage_index(action, index_type="vector", index_name=None, dims=200):
    """Create or delete an Elasticsearch index."""
    if index_name is None:
        index_name = "knowledge_base" if index_type == "vector" else "raw_texts"

    if action == "create":
        mapping = get_vector_mapping(dims) if index_type == "vector" else get_text_mapping()
        try:
            if es.indices.exists(index=index_name):
                print(f"Index {index_name} already exists")
                return False
            es.indices.create(index=index_name, body={"mappings": mapping})
            print(f"Index {index_name} created (with the ik_smart analyzer)")
            return True
        except Exception as e:
            print(f"Failed to create index: {str(e)}")
            raise
    elif action == "delete":
        try:
            if not es.indices.exists(index=index_name):
                print(f"Index {index_name} does not exist")
                return False
            es.indices.delete(index=index_name)
            print(f"Index {index_name} deleted")
            return True
        except Exception as e:
            print(f"Failed to delete index: {str(e)}")
            raise
    else:
        raise ValueError("action must be 'create' or 'delete'")


# Usage example
if __name__ == "__main__":
    # Delete the existing indexes first (if they exist)
    manage_index("delete", "vector")
    manage_index("delete", "text")

    # Create a fresh vector index
    manage_index("create", "vector", dims=200)
    # Create a fresh raw-text index
    manage_index("create", "text")
@@ -1,82 +0,0 @@
import os
import datetime
import logging

from elasticsearch import Elasticsearch

from Config.Config import ES_CONFIG
from Util.EmbeddingUtil import text_to_embedding

# Logger configuration at the top of the file
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

# Create a console handler and set its format
handler = logging.StreamHandler()
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
handler.setFormatter(formatter)
logger.addHandler(handler)


def split_paragraphs(text):
    """Split text into paragraphs."""
    # Split on double newlines
    paragraphs = [p.strip() for p in text.split('\n\n') if p.strip()]
    return paragraphs
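
# Illustrative only (not in the original file): blank-line-separated text
# becomes a list of trimmed, non-empty paragraphs, e.g.
#   split_paragraphs("first paragraph\n\n\nsecond paragraph\n")
#   -> ["first paragraph", "second paragraph"]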


def save_to_es(text):
    """Save the embedded vector and the raw text to ES."""
    vector = text_to_embedding(text)

    if vector is None:
        logger.warning(f"Skipping text that could not be embedded: {text}")
        return

    doc = {
        'content': text,  # key must match the "content" field in the vector-index mapping
        'vector': vector,
        'timestamp': datetime.datetime.now().isoformat()
    }

    try:
        es.index(index='knowledge_base', body=doc)
        es.index(index='raw_texts', body={'raw_text': text})
    except Exception as e:
        logger.error(f"Failed to save text to ES: {e}")


def process_directory(dir_path):
    """Process every text file in a directory."""
    total_paragraphs = 0

    # Collect all .txt files and sort them numerically by filename
    files = [f for f in os.listdir(dir_path) if f.endswith('.txt')]
    files.sort(key=lambda x: int(x.split('.')[0]))
    file_count = len(files)

    print(f"Found {file_count} text files to process")

    for i, filename in enumerate(files, 1):
        print(f"Processing file {i}/{file_count}: {filename}")

        file_path = os.path.join(dir_path, filename)
        with open(file_path, 'r', encoding='utf-8') as f:
            content = f.read()

        paragraphs = split_paragraphs(content)
        total_paragraphs += len(paragraphs)

        for paragraph in paragraphs:
            save_to_es(paragraph)

    print(f"\nDone: processed {file_count} files, saved {total_paragraphs} paragraphs")


if __name__ == '__main__':
    es = Elasticsearch(
        hosts=ES_CONFIG['hosts'],  # passed through as-is, matching the index-management script
        basic_auth=ES_CONFIG['basic_auth'],
        verify_certs=ES_CONFIG['verify_certs'],
        ssl_show_warn=ES_CONFIG['ssl_show_warn']
    )

    dir_path = '../Txt/processed_chunks'  # directory produced by T2_DocxProcessor.py
    process_directory(dir_path)