You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
40 lines
1.2 KiB
40 lines
1.2 KiB
import os
|
|
|
|
from Util.SplitDocxUtil import SplitDocxUtil
|
|
|
|
|
|
def process_document(input_path, output_dir):
    """Split a .docx document into chunks and save each chunk as a text file.

    The document text is read with ``SplitDocxUtil.read_docx`` and split with
    ``SplitDocxUtil.split_text``. Every ``(chunk_num, chunk)`` pair produced by
    the split is written to ``<output_dir>/<chunk_num>.txt`` through
    ``save_to_txt``.

    Args:
        input_path: Path of the .docx file to read.
        output_dir: Directory that receives the chunk files; it is created
            when it does not exist yet.

    Returns:
        True when text was read and every chunk was saved; False when no text
        could be read or when at least one chunk file could not be written.
    """
    text = SplitDocxUtil.read_docx(input_path)
    if not text:
        print("无法读取输入文件内容")
        return False

    # Make sure the output directory exists before writing any chunk.
    os.makedirs(output_dir, exist_ok=True)

    chunks = SplitDocxUtil.split_text(text)
    print(f"共分割出{len(chunks)}个段落块")

    # save_to_txt signals failure through its return value instead of raising,
    # so failures are collected here rather than silently reported as success.
    failed_files = []
    for chunk_num, chunk in chunks:
        output_file = os.path.join(output_dir, f"{chunk_num}.txt")
        if not save_to_txt(chunk, output_file, mode='w'):
            failed_files.append(output_file)

    if failed_files:
        print(f"处理未完成,{len(failed_files)}个段落块保存失败")
        return False

    print(f"处理完成,结果已保存到目录: {output_dir}")
    return True
|
|
|
|
def save_to_txt(content, file_path, mode='w'):
    """Write ``content`` to ``file_path`` as UTF-8 text.

    Args:
        content: Text to write.
        file_path: Destination file path.
        mode: File mode passed to ``open`` (defaults to ``'w'``).

    Returns:
        True when the write succeeded; False when any exception occurred, in
        which case the error is printed instead of being raised.
    """
    try:
        with open(file_path, mode, encoding='utf-8') as handle:
            handle.write(content)
    except Exception as error:
        # Best-effort save: report the problem and let the caller decide.
        print(f"保存文件{file_path}时出错: {str(error)}")
        return False
    else:
        return True
|
|
|
|
if __name__ == "__main__":
    # Script entry point: split the sample document into per-chunk text files.
    source_docx = '../Txt/小学数学(史校长).docx'
    chunk_dir = '../Txt/processed_chunks'
    process_document(source_docx, chunk_dir)
|