main
HuangHai 4 weeks ago
parent 61f9571ed2
commit bebee721d3

@ -1,11 +1,11 @@
import datetime
import logging
from Util.EmbeddingUtil import text_to_embedding # 修改导入
from Config.Config import ES_CONFIG
from elasticsearch import Elasticsearch from elasticsearch import Elasticsearch
import re
from tqdm import tqdm from tqdm import tqdm
import datetime
import logging from Config.Config import ES_CONFIG
from Util.EmbeddingUtil import text_to_embedding
# 在文件开头添加logger配置 # 在文件开头添加logger配置
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
@ -17,13 +17,14 @@ formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(messag
handler.setFormatter(formatter) handler.setFormatter(formatter)
logger.addHandler(handler) logger.addHandler(handler)
def split_paragraphs(text):
    """Split a document into paragraphs.

    Paragraphs are delimited by a blank line (two consecutive newlines).
    Each paragraph is stripped of surrounding whitespace and empty
    paragraphs are discarded.

    Args:
        text: Full document text.

    Returns:
        list[str]: Non-empty, stripped paragraphs in their original order.
    """
    # Split on blank lines; drop whitespace-only fragments.
    return [p.strip() for p in text.split('\n\n') if p.strip()]
# 修改process_file函数
# Move save_to_es function definition before process_file
def save_to_es(text): def save_to_es(text):
"""保存向量化文本和原始文本到ES""" """保存向量化文本和原始文本到ES"""
vector = text_to_embedding(text) vector = text_to_embedding(text)
@ -45,18 +46,18 @@ def save_to_es(text):
except Exception as e: except Exception as e:
logger.error(f"保存文本到ES失败: {e}") logger.error(f"保存文本到ES失败: {e}")
def process_file(file_path):
    """Read a UTF-8 text file, split it into paragraphs, and save each to ES.

    Each paragraph is passed to ``save_to_es`` (which embeds the text and
    indexes it in Elasticsearch). A tqdm progress bar is shown while saving,
    and a summary line is printed when done.

    Args:
        file_path: Path to the UTF-8 encoded text file to ingest.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        content = f.read()
    paragraphs = split_paragraphs(content)
    # Index each paragraph individually so every one gets its own embedding.
    for paragraph in tqdm(paragraphs, desc='处理进度', unit=''):
        save_to_es(paragraph)
    print(f"\n处理完成,共保存{len(paragraphs)}个段落")
if __name__ == '__main__': if __name__ == '__main__':
es = Elasticsearch( es = Elasticsearch(

@ -1,6 +1,6 @@
''' """
pip install openai pip install openai
''' """
from elasticsearch import Elasticsearch from elasticsearch import Elasticsearch
from openai import OpenAI from openai import OpenAI
from Config import Config from Config import Config

Loading…
Cancel
Save