parent
1ac102f688
commit
daed5693f3
@ -0,0 +1,64 @@
|
|||||||
|
from docx import Document
|
||||||
|
import os
|
||||||
|
|
||||||
|
# 读取文档并按一级标题中的"少年读史记"拆分文档
|
||||||
|
def split_docx_by_heading(input_path, *, split_keyword="少年读史记", output_prefix="ShiJi"):
    """Split a .docx file into one file per matching level-1 heading.

    A new section starts at every paragraph whose style is named
    ``Heading 1`` and whose text contains ``split_keyword``.  Paragraphs
    that appear before the first such heading are discarded.  Each section
    is written next to the input file as ``<output_prefix>_<n>.docx``,
    numbered from 1.

    Only paragraphs are copied (``doc.paragraphs``), together with the
    paragraph style and the bold / italic / underline / font size / font
    name of every run.

    Args:
        input_path: Path of the source .docx file.
        split_keyword: Text a ``Heading 1`` paragraph must contain to start
            a new section.
        output_prefix: Filename prefix for the generated documents.

    Returns:
        The number of section files written.
    """
    doc = Document(input_path)

    # Group paragraphs into sections, opening a new one at each split heading.
    sections = []
    current_section = []
    started = False
    for para in doc.paragraphs:
        if para.style.name == 'Heading 1' and split_keyword in para.text:
            if started:
                # Close the section that was being collected.
                sections.append(current_section)
            started = True
            current_section = [para]
        elif started:
            current_section.append(para)

    # Flush the trailing section (non-empty whenever a heading was found).
    if started:
        sections.append(current_section)

    # Write each section to its own document beside the input file.
    output_dir = os.path.dirname(input_path)
    for index, section in enumerate(sections, 1):
        new_doc = Document()
        for para in section:
            # Start from an EMPTY paragraph: seeding it with para.text and
            # then re-adding every run would write the text twice.
            new_para = new_doc.add_paragraph()
            new_para.style = para.style

            runs = para.runs
            if not runs:
                # No runs to copy formatting from; keep any plain text.
                if para.text:
                    new_para.add_run(para.text)
                continue

            # NOTE(review): text that lives outside runs (e.g. inside
            # hyperlinks) is not copied here - confirm the source document
            # has none, or extend this loop.
            for run in runs:
                new_run = new_para.add_run(run.text)
                new_run.bold = run.bold
                new_run.italic = run.italic
                new_run.underline = run.underline
                new_run.font.size = run.font.size
                new_run.font.name = run.font.name

        output_filename = f"{output_prefix}_{index}.docx"
        output_path = os.path.join(output_dir, output_filename)
        new_doc.save(output_path)
        print(f"已保存拆分文档: {output_path}")

    return len(sections)
|
||||||
|
|
||||||
|
# 主执行逻辑
|
||||||
|
if __name__ == "__main__":
    # Script entry point: split the hard-coded source document, or report
    # that it is missing.
    file = r'D:\dsWork\dsProject\dsLightRag\static\Txt\ShiJi.docx'
    if os.path.exists(file):
        section_count = split_docx_by_heading(file)
        print(f"文档拆分完成,共生成 {section_count} 个章节文件")
    else:
        print(f"错误: 文件不存在 - {file}")
|
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
File diff suppressed because it is too large
Load Diff
Loading…
Reference in new issue