import re
import os

from Util.SplitDocxUtil import SplitDocxUtil

# A block starts at a "问题<N>" / "话题<N>" heading and runs lazily up to the
# next such heading or the end of the text. `[\s\S]` already matches
# newlines, so no DOTALL flag is needed.
_BLOCK_PATTERN = re.compile(r'(问题\d+|话题\d+)([\s\S]+?)(?=问题\d+|话题\d+|$)')


def split_into_blocks(text):
    """Split *text* into blocks that each start with a question/topic heading.

    A heading is "问题" or "话题" followed by one or more digits. Text before
    the first heading is discarded.

    Args:
        text: The full document text.

    Returns:
        A list of ``(index, block_text)`` tuples. ``index`` is 1-based and
        follows the order of appearance; ``block_text`` is the heading
        concatenated with the content that follows it (not stripped).
    """
    return [
        (index, title + content)
        for index, (title, content) in enumerate(_BLOCK_PATTERN.findall(text), start=1)
    ]


def _clear_directory(directory):
    """Remove the regular files and symlinks directly inside *directory*.

    Does nothing when *directory* does not exist. Subdirectories are left
    untouched: ``os.remove`` cannot delete a directory and would raise.
    """
    if not os.path.exists(directory):
        return
    for entry in os.listdir(directory):
        entry_path = os.path.join(directory, entry)
        if os.path.isfile(entry_path) or os.path.islink(entry_path):
            os.remove(entry_path)


def process_document(input_path, output_dir):
    """Split a document into blocks and write each block to its own file.

    The text is obtained from ``SplitDocxUtil.read_docx(input_path)``
    (presumably the plain text of the .docx file; confirm against that
    helper). Existing files in *output_dir* are deleted first, then block
    number ``N`` is written to ``<output_dir>/N.txt``.

    Args:
        input_path: Path of the document to read.
        output_dir: Directory receiving the per-block text files. It is
            created if missing; files already in it are removed.

    Returns:
        True if at least one block file was saved, False otherwise
        (including when no text could be read, in which case *output_dir*
        is left untouched).
    """
    text = SplitDocxUtil.read_docx(input_path)
    if not text:
        print("无法读取输入文件内容")
        return False

    # Make sure the output directory exists and holds no stale files.
    _clear_directory(output_dir)
    os.makedirs(output_dir, exist_ok=True)

    chunks = split_into_blocks(text)
    print(f"共分割出{len(chunks)}个段落块")

    saved_count = 0
    for chunk_num, chunk in chunks:
        output_file = os.path.join(output_dir, f"{chunk_num}.txt")
        # Strip surrounding whitespace so each file starts at its heading.
        if save_to_txt(chunk.strip(), output_file, mode='w'):
            saved_count += 1

    print(f"处理完成,共保存{saved_count}个文件到目录: {output_dir}")
    return saved_count > 0


def save_to_txt(content, file_path, mode='w'):
    """Write *content* to *file_path* as UTF-8 text.

    Args:
        content: The text to write.
        file_path: Destination file path.
        mode: Mode passed to ``open`` (``'w'`` by default).

    Returns:
        True on success; False if any exception occurred, in which case the
        error is printed instead of raised.
    """
    try:
        with open(file_path, mode, encoding='utf-8') as f:
            f.write(content)
    except Exception as e:
        # Deliberate best-effort boundary: callers rely on the boolean result
        # rather than on exceptions.
        print(f"保存文件{file_path}时出错: {e}")
        return False
    return True


if __name__ == "__main__":
    input_file = '../Txt/小学数学(史校长).docx'
    output_dir = '../Txt/processed_chunks'
    process_document(input_file, output_dir)