# dsProject/dsLightRag/Tools/T2_SpliteDocx.py

from docx import Document
import os

# Split a .docx file into one document per chapter, where a chapter starts at
# a "Heading 1" paragraph whose text contains the split keyword.
def _group_paragraphs_by_heading(paragraphs, split_keyword):
    """Group paragraphs into chapters that each start at a matching Heading 1.

    Paragraphs that appear before the first matching heading are discarded.
    """
    sections = []
    current_section = None
    for para in paragraphs:
        if para.style.name == 'Heading 1' and split_keyword in para.text:
            # A matching heading always opens a new chapter.
            current_section = [para]
            sections.append(current_section)
        elif current_section is not None:
            current_section.append(para)
    return sections


def _copy_paragraph(target_doc, para):
    """Append a copy of *para* (style, text and basic run formatting) to *target_doc*."""
    # Start from an EMPTY paragraph: add_paragraph(text) would already insert
    # the text as a run, and copying the runs afterwards would then write
    # every piece of text a second time.
    new_para = target_doc.add_paragraph()
    new_para.style = para.style

    runs = para.runs
    if "".join(run.text for run in runs) != para.text:
        # The runs do not account for all of the paragraph text (presumably
        # text nested in other inline elements such as hyperlinks -- depends on
        # the python-docx version). Keep the text rather than the formatting.
        new_para.add_run(para.text)
        return new_para

    for run in runs:
        new_run = new_para.add_run(run.text)
        new_run.bold = run.bold
        new_run.italic = run.italic
        new_run.underline = run.underline
        new_run.font.size = run.font.size
        new_run.font.name = run.font.name
    return new_para


def split_docx_by_heading(input_path, split_keyword="少年读史记", output_prefix="ShiJi"):
    """Split a .docx file into one file per chapter.

    A chapter begins at every paragraph styled 'Heading 1' whose text contains
    *split_keyword* and runs until the next such heading (or the end of the
    document). Content before the first matching heading is ignored.

    Each chapter is written next to the input file as
    ``<output_prefix>_<n>.docx`` with ``n`` starting at 1. Only paragraph text,
    paragraph style and basic run formatting (bold, italic, underline, font
    size, font name) are copied; paragraph text is written exactly once.

    Args:
        input_path: Path of the .docx file to split.
        split_keyword: Text a 'Heading 1' paragraph must contain to start a
            new chapter.
        output_prefix: File name prefix for the generated documents.

    Returns:
        The number of chapter files written.
    """
    doc = Document(input_path)
    sections = _group_paragraphs_by_heading(doc.paragraphs, split_keyword)

    # Output files are placed in the same directory as the input document.
    output_dir = os.path.dirname(input_path)
    for index, section in enumerate(sections, 1):
        new_doc = Document()
        for para in section:
            _copy_paragraph(new_doc, para)

        output_path = os.path.join(output_dir, f"{output_prefix}_{index}.docx")
        new_doc.save(output_path)
        print(f"已保存拆分文档: {output_path}")

    return len(sections)

# Script entry point: split the hard-coded source document if it exists.
if __name__ == "__main__":
    docx_path = r'D:\dsWork\dsProject\dsLightRag\static\Txt\ShiJi.docx'
    if os.path.exists(docx_path):
        # Run the split and report how many chapter files were produced.
        chapter_total = split_docx_by_heading(docx_path)
        print(f"文档拆分完成，共生成 {chapter_total} 个章节文件")
    else:
        # Nothing to do without the input file; report it and exit normally.
        print(f"错误: 文件不存在 - {docx_path}")