You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
65 lines
2.3 KiB
65 lines
2.3 KiB
from docx import Document
|
|
import os
|
|
|
|
def _group_paragraphs_by_heading(paragraphs, keyword, heading_style):
    """Group paragraphs into sections that each begin at a matching heading.

    A paragraph opens a new section when its style name equals
    ``heading_style`` and its text contains ``keyword``. Paragraphs that
    appear before the first matching heading are discarded.

    Returns a list of sections, each a list of paragraph objects.
    """
    sections = []
    current = None  # stays None until the first matching heading is seen
    for para in paragraphs:
        if para.style.name == heading_style and keyword in para.text:
            if current is not None:
                # Close the section in progress before starting the next one.
                sections.append(current)
            current = [para]
        elif current is not None:
            current.append(para)

    # Flush the trailing section, which no later heading closes.
    if current:
        sections.append(current)
    return sections


def _copy_paragraph(new_doc, para):
    """Append a copy of ``para`` (text, style, basic run formatting) to ``new_doc``."""
    # Start from an EMPTY paragraph: passing the text here and then adding
    # the runs as well would write every paragraph's text twice.
    new_para = new_doc.add_paragraph()
    new_para.style = para.style

    runs = para.runs
    if "".join(run.text for run in runs) != para.text:
        # The runs do not account for the whole paragraph text
        # (NOTE(review): presumably text nested in hyperlinks or similar
        # wrappers - confirm against the python-docx version in use).
        # Fall back to one plain run so that no content is lost.
        if para.text:
            new_para.add_run(para.text)
        return new_para

    for run in runs:
        new_run = new_para.add_run(run.text)
        new_run.bold = run.bold
        new_run.italic = run.italic
        new_run.underline = run.underline
        new_run.font.size = run.font.size
        # NOTE(review): only font.name is copied; an East Asian font
        # setting, if the source has one, may not carry over - verify.
        new_run.font.name = run.font.name
    return new_para


def split_docx_by_heading(input_path: str, split_keyword: str = "少年读史记",
                          heading_style: str = 'Heading 1') -> int:
    """Split a .docx file into one file per matching level-one heading.

    Each section starts at a paragraph whose style name is ``heading_style``
    and whose text contains ``split_keyword``, and runs up to the next such
    heading or the end of the document. Content before the first matching
    heading is dropped. Only the paragraphs exposed by ``doc.paragraphs``
    are copied.

    Every section is saved next to the input file as ``ShiJi_<n>.docx``,
    numbered from 1, and the saved path is printed.

    Args:
        input_path: Path of the source .docx document.
        split_keyword: Text a heading must contain to start a new section.
        heading_style: Style name that identifies a splitting heading.

    Returns:
        The number of section files written.
    """
    doc = Document(input_path)
    sections = _group_paragraphs_by_heading(
        doc.paragraphs, split_keyword, heading_style
    )

    # Write the split documents into the same directory as the input file.
    output_dir = os.path.dirname(input_path)
    for i, section in enumerate(sections, 1):
        new_doc = Document()
        for para in section:
            _copy_paragraph(new_doc, para)

        output_filename = f"ShiJi_{i}.docx"
        output_path = os.path.join(output_dir, output_filename)
        new_doc.save(output_path)
        print(f"已保存拆分文档: {output_path}")

    return len(sections)
|
|
|
|
# Script entry point
if __name__ == "__main__":
    import sys

    # The source document defaults to the original fixed location; pass a
    # path as the first command-line argument to split a different file.
    default_file = r'D:\dsWork\dsProject\dsLightRag\static\Txt\ShiJi.docx'
    file = sys.argv[1] if len(sys.argv) > 1 else default_file

    if not os.path.exists(file):
        print(f"错误: 文件不存在 - {file}")
    else:
        section_count = split_docx_by_heading(file)
        print(f"文档拆分完成,共生成 {section_count} 个章节文件")
|