|
|
|
@ -47,25 +47,26 @@ def save_to_es(text):
|
|
|
|
|
def process_directory(dir_path):
    """Process every ``.txt`` file in a directory and store its paragraphs.

    The directory is listed once. Files whose name starts with an integer
    (the text before the first ``.``) are handled in ascending numeric
    order; any other ``.txt`` files are handled afterwards in name order.
    Each file is read as UTF-8, split with ``split_paragraphs``, and every
    resulting paragraph is passed to ``save_to_es``. Progress and a final
    summary are printed to stdout.

    Args:
        dir_path: Path of the directory to scan (non-recursive).

    Raises:
        OSError: If the directory cannot be listed or a file cannot be opened.
        UnicodeDecodeError: If a file is not valid UTF-8.
    """

    def _numeric_first(name):
        # Sort "2.txt" before "10.txt" by comparing the leading number as an
        # int. Names without an integer prefix are placed after all numeric
        # ones instead of aborting the whole run with ValueError.
        prefix = name.split('.')[0]
        try:
            return (0, int(prefix), name)
        except ValueError:
            return (1, 0, name)

    total_paragraphs = 0

    # Single directory listing: the same list drives the count, the ordering
    # and the processing loop, so each file is handled exactly once.
    files = sorted(
        (f for f in os.listdir(dir_path) if f.endswith('.txt')),
        key=_numeric_first,
    )
    file_count = len(files)

    print(f"共发现{file_count}个文本文件需要处理")

    for i, filename in enumerate(files, 1):
        print(f"正在处理第{i}/{file_count}个文件: {filename}")

        file_path = os.path.join(dir_path, filename)
        with open(file_path, 'r', encoding='utf-8') as f:
            content = f.read()

        paragraphs = split_paragraphs(content)
        total_paragraphs += len(paragraphs)

        for paragraph in paragraphs:
            save_to_es(paragraph)

    print(f"\n处理完成,共处理{file_count}个文件,保存{total_paragraphs}个段落")
|
|
|
|
|
|
|
|
|
|