main
HuangHai 4 weeks ago
parent f720982ddc
commit ad732b1117

@ -43,9 +43,12 @@ def process_document(input_path, output_dir):
print(f"共分割出{len(chunks)}个段落块") print(f"共分割出{len(chunks)}个段落块")
saved_count = 0 saved_count = 0
# 从输入文件名中提取MATH_1部分
file_prefix = os.path.basename(input_path).split('.')[0].split('_')[-2] + '_' + os.path.basename(input_path).split('.')[0].split('_')[-1]
for chunk_num, chunk in chunks: for chunk_num, chunk in chunks:
chunk = chunk.strip() # 确保去除空白字符 chunk = chunk.strip() # 确保去除空白字符
output_file = os.path.join(output_dir, f"{chunk_num}.txt") output_file = os.path.join(output_dir, f"{file_prefix}_{chunk_num}.txt")
if save_to_txt(chunk, output_file, mode='w'): if save_to_txt(chunk, output_file, mode='w'):
saved_count += 1 saved_count += 1
@ -92,5 +95,5 @@ def process_directory(input_dir, output_dir):
if __name__ == "__main__": if __name__ == "__main__":
input_dir = '../static/Txt' input_dir = '../static/Txt'
output_dir = '../Txt/processed_chunks' output_dir = '../Txt'
process_directory(input_dir, output_dir) process_directory(input_dir, output_dir)

@ -64,8 +64,8 @@ def save_to_txt(content, file_path, mode='w'):
return False return False
if __name__ == "__main__": if __name__ == "__main__":
input_file = '../static/Txt/小学数学教学中的若干问题.docx' input_file = '../static/Txt/小学数学教学中的若干问题_MATH_1.docx'
#input_file = '../static/Txt/小学数学知识点.docx' #input_file = '../static/Txt/小学数学知识点_MATH_2.docx'
#input_file = '../static/Txt/高中文言文.docx' #input_file = '../static/Txt/高中文言文_CHINESE_1.docx'
output_dir = '../Txt/processed_chunks' output_dir = '../Txt/processed_chunks'
process_document(input_file, output_dir) process_document(input_file, output_dir)

@ -0,0 +1,2 @@
数量是什么?数量关系的本质是什么?
数量是对现实生活中事物量的抽象 / 数量关系的本质是多与少

@ -0,0 +1,3 @@
如何认识自然数?
数是对数量的抽象 / 数关系是对数量关系的抽象:大与小 / 可以有两种方法实现这种抽
象:对应的方法和定义的方法

@ -0,0 +1,2 @@
表示自然数的关键是什么?
十个符号和数位 / 数位法则是依次相差十倍 / 自然数集合

Some files were not shown because too many files have changed in this diff Show More

Loading…
Cancel
Save