main
HuangHai 4 weeks ago
parent 61f9571ed2
commit bebee721d3

@ -1,11 +1,11 @@
import datetime
import logging
from Util.EmbeddingUtil import text_to_embedding # 修改导入
from Config.Config import ES_CONFIG
from elasticsearch import Elasticsearch
import re
from tqdm import tqdm
import datetime
import logging
from Config.Config import ES_CONFIG
from Util.EmbeddingUtil import text_to_embedding
# 在文件开头添加logger配置
logger = logging.getLogger(__name__)
@ -17,13 +17,14 @@ formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(messag
handler.setFormatter(formatter)
logger.addHandler(handler)
def split_sentences(text):
    """Split text into sentences.

    The text is cut at every character matched by the terminator regex
    below (including newlines) using ``re.split``; each piece is stripped
    and empty pieces are dropped.

    Args:
        text: The text to split.

    Returns:
        A list of non-empty, whitespace-stripped sentence strings.
    """
    # Regex-based splitting (no tokenizer library is involved).
    sentences = re.split(r'[。!?;\n]', text)
    return [s.strip() for s in sentences if s.strip()]


def split_paragraphs(text):
    """Split text into paragraphs.

    Paragraphs are separated by two consecutive newline characters. Each
    paragraph is stripped, and paragraphs that are empty after stripping
    are dropped.

    Args:
        text: The text to split.

    Returns:
        A list of non-empty, whitespace-stripped paragraph strings.
    """
    # Two consecutive newlines mark a paragraph boundary.
    paragraphs = [p.strip() for p in text.split('\n\n') if p.strip()]
    return paragraphs
# 修改process_file函数
# Move save_to_es function definition before process_file
def save_to_es(text):
"""保存向量化文本和原始文本到ES"""
vector = text_to_embedding(text)
@ -45,18 +46,18 @@ def save_to_es(text):
except Exception as e:
logger.error(f"保存文本到ES失败: {e}")
# Then define process_file function
def process_file(file_path):
    """Read a text file, split it into paragraphs, and save each one to ES.

    The file is read in full as UTF-8, split with ``split_paragraphs``, and
    every resulting paragraph is passed to ``save_to_es`` while a tqdm
    progress bar is shown. A summary line with the paragraph count is
    printed at the end.

    Args:
        file_path: Path of the text file to process.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        content = f.read()

    # Only the paragraph-based path is kept: running the sentence-based
    # loop as well would store the same content a second time.
    paragraphs = split_paragraphs(content)

    # Show a progress bar while saving.
    for paragraph in tqdm(paragraphs, desc='处理进度', unit=''):
        save_to_es(paragraph)

    print(f"\n处理完成,共保存{len(paragraphs)}个段落")
if __name__ == '__main__':
es = Elasticsearch(

@ -1,6 +1,6 @@
'''
"""
pip install openai
'''
"""
from elasticsearch import Elasticsearch
from openai import OpenAI
from Config import Config

Loading…
Cancel
Save