import os

from Util.SplitDocxUtil import SplitDocxUtil

# Heading markers that open a new block ("问题" = question, "话题" = topic).
_BLOCK_PREFIXES = ('问题', '话题')


def _strip_heading_prefix(line):
    """Drop leading heading markers and digits from a heading line.

    Repeatedly removes a leading two-character marker from
    ``_BLOCK_PREFIXES`` or a single leading digit until neither is present.
    The result may be empty when the heading is nothing but marker and
    number (e.g. "问题1"); the caller then simply skips the line.
    """
    while line:
        if line.startswith(_BLOCK_PREFIXES):
            # Both markers are exactly two characters long.
            line = line[2:]
        elif line[0].isdigit():
            line = line[1:]
        else:
            break
    return line


def split_into_blocks(text):
    """Split text into numbered blocks at "问题N" / "话题N" heading lines.

    A line opens a new block when it starts with one of the heading markers
    and has a digit within its first five characters. The marker and number
    are removed from that line, but the rest of the line is kept as the
    first line of the block. Lines before the first heading are discarded.
    Every kept line is stripped of surrounding whitespace and blank lines
    are dropped.

    Args:
        text: The full document text, lines separated by newlines.

    Returns:
        A list of ``(index, block_text)`` tuples with 1-based indices, in
        document order. Blocks that end up with no content are omitted.
    """
    blocks = []
    current_block = []
    in_block = False

    for line in text.splitlines():
        is_heading = line.startswith(_BLOCK_PREFIXES) and any(
            ch.isdigit() for ch in line[:5]
        )
        if is_heading:
            if in_block:
                # Close the previous block before starting the next one.
                blocks.append('\n'.join(current_block))
                current_block = []
            in_block = True
            line = _strip_heading_prefix(line)

        line = line.strip()
        if in_block and line:  # keep non-empty lines only
            current_block.append(line)

    if current_block:
        blocks.append('\n'.join(current_block))

    return [(index, block) for index, block in enumerate(blocks, start=1)]


def process_document(input_path, output_dir):
    """Split one .docx file into blocks and save each block as a .txt file.

    Output files are named ``<prefix>_<n>.txt`` where ``<prefix>`` is the
    last two underscore-separated parts of the input file name (text before
    the first dot), e.g. "xxx_MATH_1.docx" -> "MATH_1". When the name has no
    underscore the whole name is used as the prefix.

    The output directory is not cleared here; ``process_directory`` does
    that once before processing any file.

    Args:
        input_path: Path of the .docx file to read.
        output_dir: Existing directory that receives the .txt files.

    Returns:
        True if at least one block was saved, otherwise False.
    """
    # NOTE(review): read_docx is presumed to return the document text as a
    # string and a falsy value on failure - confirm against SplitDocxUtil.
    text = SplitDocxUtil.read_docx(input_path)
    if not text:
        print("无法读取输入文件内容")
        return False

    chunks = split_into_blocks(text)
    print(f"共分割出{len(chunks)}个段落块")

    stem = os.path.basename(input_path).split('.')[0]
    # Joining the last (up to) two parts avoids the IndexError that
    # indexing [-2] raised for names without an underscore.
    file_prefix = '_'.join(stem.split('_')[-2:])

    saved_count = 0
    for chunk_num, chunk in chunks:
        output_file = os.path.join(output_dir, f"{file_prefix}_{chunk_num}.txt")
        if save_to_txt(chunk.strip(), output_file, mode='w'):
            saved_count += 1

    print(f"处理完成,共保存{saved_count}个文件到目录: {output_dir}")
    return saved_count > 0


def save_to_txt(content, file_path, mode='w'):
    """Write ``content`` to ``file_path`` as UTF-8 text.

    Args:
        content: The text to write.
        file_path: Destination file path.
        mode: File mode passed to ``open`` (default ``'w'``, overwrite).

    Returns:
        True on success; False if any error occurred (the error is printed).
    """
    try:
        with open(file_path, mode, encoding='utf-8') as f:
            f.write(content)
    except Exception as e:
        # Deliberately best-effort: report the failure and let the caller
        # continue with the remaining blocks.
        print(f"保存文件{file_path}时出错: {e}")
        return False
    return True


def process_directory(input_dir, output_dir):
    """Process every .docx file directly inside ``input_dir``.

    The output directory is created if needed and emptied of its files once
    before processing. Subdirectories inside it are left untouched.

    Args:
        input_dir: Directory that contains the .docx files.
        output_dir: Directory that receives the generated .txt files.

    Returns:
        True if at least one document produced output, otherwise False.
    """
    if not os.path.isdir(input_dir):
        print(f"输入目录不存在: {input_dir}")
        return False

    # Clearing the output directory would delete the inputs themselves.
    if os.path.realpath(input_dir) == os.path.realpath(output_dir):
        print(f"输入目录与输出目录不能相同: {input_dir}")
        return False

    # Make sure the output directory exists and is emptied (once only).
    if os.path.isdir(output_dir):
        for entry in os.listdir(output_dir):
            entry_path = os.path.join(output_dir, entry)
            # os.remove fails on directories, so only remove files/links.
            if os.path.isfile(entry_path) or os.path.islink(entry_path):
                os.remove(entry_path)
    os.makedirs(output_dir, exist_ok=True)

    # Sorted so the processing order does not depend on the file system.
    docx_files = sorted(
        name for name in os.listdir(input_dir) if name.lower().endswith('.docx')
    )
    if not docx_files:
        print(f"目录中没有找到docx文件: {input_dir}")
        return False

    success_count = 0
    for docx_file in docx_files:
        input_path = os.path.join(input_dir, docx_file)
        print(f"正在处理文件: {docx_file}")
        if process_document(input_path, output_dir):
            success_count += 1

    print(f"处理完成,共处理{success_count}/{len(docx_files)}个文件")
    return success_count > 0


if __name__ == "__main__":
    input_dir = '../static/Txt'
    output_dir = '../Txt'
    process_directory(input_dir, output_dir)