import os

from docx import Document


def _collect_sections(doc, split_keyword):
    """Group the document's paragraphs into sections.

    A new section starts at every paragraph whose style is named
    'Heading 1' and whose text contains *split_keyword*. Paragraphs that
    appear before the first such heading are discarded.

    Returns a list of sections, each a list of paragraph objects whose
    first element is the heading that opened the section.
    """
    sections = []
    current = None  # None until the first matching heading is seen
    for para in doc.paragraphs:
        if para.style.name == 'Heading 1' and split_keyword in para.text:
            if current is not None:
                sections.append(current)
            current = [para]
        elif current is not None:
            current.append(para)
    if current:
        sections.append(current)
    return sections


def _copy_paragraph(new_doc, para):
    """Append a copy of *para* (text, style, basic run formatting) to *new_doc*.

    The text is written exactly once. The previous implementation created
    the paragraph with ``para.text`` and then appended every run again,
    which duplicated the content of each paragraph.
    """
    runs = para.runs
    if not runs or "".join(run.text for run in runs) != para.text:
        # The runs do not reproduce the whole paragraph text (presumably
        # content that is not a direct run, such as hyperlink text), so
        # keep the complete plain text rather than drop part of it.
        new_para = new_doc.add_paragraph(para.text)
        new_para.style = para.style
        return new_para

    new_para = new_doc.add_paragraph()
    new_para.style = para.style
    for run in runs:
        new_run = new_para.add_run(run.text)
        new_run.bold = run.bold
        new_run.italic = run.italic
        new_run.underline = run.underline
        new_run.font.size = run.font.size
        new_run.font.name = run.font.name
    return new_para


def split_docx_by_heading(input_path, split_keyword="少年读史记"):
    """Split a .docx file into one file per matching level-1 heading.

    Each paragraph styled 'Heading 1' whose text contains *split_keyword*
    starts a new section. Every section is saved next to the input file
    as ``ShiJi_<n>.docx`` (numbered from 1), and a message is printed for
    each saved file.

    Only paragraph text, paragraph style and the bold / italic /
    underline / font size / font name of each run are copied; tables,
    images and other document content are not carried over.

    Args:
        input_path: Path of the .docx file to split.
        split_keyword: Text a level-1 heading must contain to start a
            new section.

    Returns:
        The number of section files written.
    """
    doc = Document(input_path)
    sections = _collect_sections(doc, split_keyword)

    output_dir = os.path.dirname(input_path)
    for i, section in enumerate(sections, 1):
        new_doc = Document()
        for para in section:
            _copy_paragraph(new_doc, para)

        output_filename = f"ShiJi_{i}.docx"
        output_path = os.path.join(output_dir, output_filename)
        new_doc.save(output_path)
        print(f"已保存拆分文档: {output_path}")

    return len(sections)


# Script entry point
if __name__ == "__main__":
    file = r'D:\dsWork\dsProject\dsLightRag\static\Txt\ShiJi.docx'
    if not os.path.exists(file):
        print(f"错误: 文件不存在 - {file}")
    else:
        section_count = split_docx_by_heading(file)
        print(f"文档拆分完成,共生成 {section_count} 个章节文件")