From ec9cbbc4f7ada2bb8dd82c8f1d9fdaa5e24b69ed Mon Sep 17 00:00:00 2001 From: HuangHai <10402852@qq.com> Date: Thu, 26 Jun 2025 08:08:09 +0800 Subject: [PATCH] 'commit' --- .../{T2_DocxProcessor.py => T2_SplitTxt.py} | 0 dsRag/Tools/T3_ImportTxt.py | 33 ++++++++++--------- 2 files changed, 17 insertions(+), 16 deletions(-) rename dsRag/Tools/{T2_DocxProcessor.py => T2_SplitTxt.py} (100%) diff --git a/dsRag/Tools/T2_DocxProcessor.py b/dsRag/Tools/T2_SplitTxt.py similarity index 100% rename from dsRag/Tools/T2_DocxProcessor.py rename to dsRag/Tools/T2_SplitTxt.py diff --git a/dsRag/Tools/T3_ImportTxt.py b/dsRag/Tools/T3_ImportTxt.py index 54b07a07..3947bcce 100644 --- a/dsRag/Tools/T3_ImportTxt.py +++ b/dsRag/Tools/T3_ImportTxt.py @@ -47,25 +47,26 @@ def save_to_es(text): def process_directory(dir_path): """处理目录下所有文本文件""" total_paragraphs = 0 - file_count = len([f for f in os.listdir(dir_path) if f.endswith('.txt')]) - current_file = 0 + + # 获取所有txt文件并按数字排序 + files = [f for f in os.listdir(dir_path) if f.endswith('.txt')] + files.sort(key=lambda x: int(x.split('.')[0])) + file_count = len(files) print(f"共发现{file_count}个文本文件需要处理") - for filename in os.listdir(dir_path): - if filename.endswith('.txt'): - current_file += 1 - print(f"正在处理第{current_file}/{file_count}个文件: {filename}") - - file_path = os.path.join(dir_path, filename) - with open(file_path, 'r', encoding='utf-8') as f: - content = f.read() - - paragraphs = split_paragraphs(content) - total_paragraphs += len(paragraphs) - - for paragraph in paragraphs: - save_to_es(paragraph) + for i, filename in enumerate(files, 1): + print(f"正在处理第{i}/{file_count}个文件: {filename}") + + file_path = os.path.join(dir_path, filename) + with open(file_path, 'r', encoding='utf-8') as f: + content = f.read() + + paragraphs = split_paragraphs(content) + total_paragraphs += len(paragraphs) + + for paragraph in paragraphs: + save_to_es(paragraph) print(f"\n处理完成,共处理{file_count}个文件,保存{total_paragraphs}个段落")