You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

68 lines
2.0 KiB

1 month ago
import datetime
import logging
1 month ago
from elasticsearch import Elasticsearch
from tqdm import tqdm
1 month ago
from Config.Config import ES_CONFIG
from Util.EmbeddingUtil import text_to_embedding
1 month ago
# Module-level logging setup: a console handler with a timestamped format,
# attached to this module's logger at INFO level.
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
handler = logging.StreamHandler()
handler.setFormatter(formatter)

# Logger named after this module; INFO and above are emitted.
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
logger.addHandler(handler)
1 month ago
1 month ago
def split_paragraphs(text):
    """Split text into paragraphs separated by blank lines.

    A paragraph break is a newline, optional whitespace, and another newline.
    This covers plain ``\\n\\n``, blank lines that contain only spaces or
    tabs, and CRLF-terminated blank lines.

    Args:
        text: The full text to split.

    Returns:
        A list of paragraphs with surrounding whitespace stripped. Empty
        chunks are dropped, so an empty or all-whitespace input gives ``[]``.
    """
    # Imported locally so this function does not depend on a file-level import.
    import re

    chunks = re.split(r'\n\s*\n', text)
    return [chunk.strip() for chunk in chunks if chunk.strip()]
1 month ago
def save_to_es(text):
    """Embed ``text`` and index it into Elasticsearch.

    Calls ``text_to_embedding(text)``. If that returns ``None``, a warning is
    logged and nothing is indexed. Otherwise one document (text, vector, a
    naive ``datetime.now()`` ISO timestamp and an ``analyzer`` field) is
    indexed into ``knowledge_base``, and the raw text into ``raw_texts``.

    Uses the module-level ``es`` client, which this file creates only in its
    ``__main__`` section.

    Args:
        text: The paragraph to store.

    Returns:
        None. Exceptions raised while indexing are logged, not propagated.
    """
    vector = text_to_embedding(text)
    if vector is None:
        logger.warning(f"跳过无法生成向量的文本: {text}")
        return

    doc = {
        'text': text,
        'vector': vector,
        'timestamp': datetime.datetime.now().isoformat(),
        'analyzer': 'ik_smart',
    }

    try:
        # `document=` is the documented keyword for the payload in the 8.x
        # client; the legacy `body=` keyword raised a DeprecationWarning in
        # earlier 8.x releases.
        es.index(index='knowledge_base', document=doc)
        es.index(index='raw_texts', document={'raw_text': text})
    except Exception as e:
        # Best-effort: a failed write is logged and the caller carries on.
        logger.error(f"保存文本到ES失败: {e}")
1 month ago
def process_file(file_path):
    """Read a UTF-8 text file, split it into paragraphs and store each one.

    Each paragraph is passed to ``save_to_es`` under a tqdm progress bar. The
    final message reports how many paragraphs were found, not how many were
    successfully saved.

    Args:
        file_path: Path of the text file to read.
    """
    with open(file_path, 'r', encoding='utf-8') as source:
        raw = source.read()

    paragraphs = split_paragraphs(raw)

    progress = tqdm(paragraphs, desc='处理进度', unit='')
    for item in progress:
        save_to_es(item)

    total = len(paragraphs)
    print(f"\n处理完成,共保存{total}个段落")
1 month ago
if __name__ == '__main__':
    # Connection settings are read from ES_CONFIG; the single configured
    # hosts value is wrapped in a list.
    client_options = {
        'hosts': [ES_CONFIG['hosts']],
        'basic_auth': ES_CONFIG['basic_auth'],
        'verify_certs': ES_CONFIG['verify_certs'],
        'ssl_show_warn': ES_CONFIG['ssl_show_warn'],
    }
    # `es` is bound at module scope because save_to_es() reads it as a global.
    es = Elasticsearch(**client_options)

    # Input text file to process.
    file_path = '../Txt/人口变化趋势对云南教育的影响.txt'
    process_file(file_path)