You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

38 lines
1.3 KiB

1 month ago
from Util.WordUtil import *
import re
# Start of a numbered section heading: "一、" through "二十、" at the
# beginning of a line.
_SECTION_HEADING_RE = re.compile(
    r'^(?:[一二三四五六七八九十]|十[一二三四五六七八九]|二十)、',
    flags=re.MULTILINE,
)

# Whitespace that directly follows sentence-ending punctuation (the Chinese
# full stop plus both the ASCII and the full-width "!" / "?").
_SENTENCE_GAP_RE = re.compile(r'(?<=[。!?!?])\s+')


def split_text_blocks(text: str) -> list:
    """Split *text* into blocks using the first rule that applies.

    1. If any line starts with a Chinese ordinal heading ('一、' up to
       '二十、'), return one block per heading. Each block runs from the
       start of its heading line up to the start of the next heading (or
       the end of the text). Text before the first heading is not included
       in the result.
    2. Otherwise, if the text contains blank-line separators ('\\n\\n') that
       yield more than one non-blank paragraph, return those paragraphs.
    3. Otherwise, split into sentences at whitespace that follows
       sentence-ending punctuation.

    Args:
        text: The full text to split.

    Returns:
        A list of non-empty strings. Empty or whitespace-only input
        (including ``None``) yields an empty list.
    """
    if not text or not text.strip():
        return []

    # Rule 1: slice between heading offsets rather than re.split on whole
    # heading lines, so a heading on the final line without a trailing
    # newline still opens its own block.
    heading_starts = [m.start() for m in _SECTION_HEADING_RE.finditer(text)]
    if heading_starts:
        bounds = heading_starts + [len(text)]
        return [text[begin:end] for begin, end in zip(bounds, bounds[1:])]

    # Rule 2: blank-line separated paragraphs. Whitespace-only pieces (from
    # runs of blank lines or a trailing separator) are dropped.
    paragraphs = [part for part in text.split('\n\n') if part.strip()]
    if len(paragraphs) > 1:
        return paragraphs

    # Rule 3: sentence split. Drop the empty tail produced when the text
    # ends with punctuation followed by whitespace.
    return [piece for piece in _SENTENCE_GAP_RE.split(text) if piece]
# Manual smoke test: load a Word document, split it, and print each block
# preceded by its 1-based position.
if __name__ == "__main__":
    source_docx = "../Txt/小学数学知识点.docx"
    chunks = split_text_blocks(read_word_file(source_docx))
    for chunk_no, chunk in enumerate(chunks, start=1):
        print(f"发现新的分块:{chunk_no}")
        print(chunk)