diff --git a/dsRag/T2_ImportTxt.py b/dsRag/T2_ImportTxt.py index 6b9a2d11..ecf1ba92 100644 --- a/dsRag/T2_ImportTxt.py +++ b/dsRag/T2_ImportTxt.py @@ -1,11 +1,11 @@ +import datetime +import logging -from Util.EmbeddingUtil import text_to_embedding # 修改导入 -from Config.Config import ES_CONFIG from elasticsearch import Elasticsearch -import re from tqdm import tqdm -import datetime -import logging + +from Config.Config import ES_CONFIG +from Util.EmbeddingUtil import text_to_embedding # 在文件开头添加logger配置 logger = logging.getLogger(__name__) @@ -17,13 +17,14 @@ formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(messag handler.setFormatter(formatter) logger.addHandler(handler) -def split_sentences(text): - """按句分割文本""" +def split_paragraphs(text): + """按段落分割文本""" + # 按两个换行符分割段落 paragraphs = [p.strip() for p in text.split('\n\n') if p.strip()] - # 使用jieba进行分句 - sentences = re.split(r'[。!?;\n]', text) # 添加这行定义sentences - return [s.strip() for s in sentences if s.strip()] + return paragraphs +# 修改process_file函数 +# Move save_to_es function definition before process_file def save_to_es(text): """保存向量化文本和原始文本到ES""" vector = text_to_embedding(text) @@ -45,18 +46,18 @@ def save_to_es(text): except Exception as e: logger.error(f"保存文本到ES失败: {e}") +# Then define process_file function def process_file(file_path): """处理文本文件""" with open(file_path, 'r', encoding='utf-8') as f: content = f.read() - sentences = split_sentences(content) + paragraphs = split_paragraphs(content) - # 添加进度条 - for sentence in tqdm(sentences, desc='处理进度', unit='句'): - save_to_es(sentence) + for paragraph in tqdm(paragraphs, desc='处理进度', unit='段'): + save_to_es(paragraph) - print(f"\n处理完成,共保存{len(sentences)}个句子") + print(f"\n处理完成,共保存{len(paragraphs)}个段落") if __name__ == '__main__': es = Elasticsearch( diff --git a/dsRag/T3_DeepSeekRag.py b/dsRag/T3_DeepSeekRag.py index e34c184d..ffe7b814 100644 --- a/dsRag/T3_DeepSeekRag.py +++ b/dsRag/T3_DeepSeekRag.py @@ -1,6 +1,6 @@ -''' +""" pip install openai -''' +""" from elasticsearch import Elasticsearch from openai import OpenAI from Config import Config