|
|
|
@ -0,0 +1,37 @@
|
|
|
|
|
from Util.WordUtil import *
|
|
|
|
|
import re
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Numbered section headings "一、" through "二十、" at the start of a line.
_HEADING_RE = re.compile(
    r'^(?:二十|十[一二三四五六七八九]|[一二三四五六七八九十])、',
    flags=re.MULTILINE,
)
# A blank line between paragraphs; it may contain stray spaces or tabs.
_BLANK_LINE_RE = re.compile(r'\n\s*\n')
# Whitespace that follows sentence-ending punctuation (half- or full-width).
_SENTENCE_GAP_RE = re.compile(r'(?<=[。!?！？])\s+')


def split_text_blocks(text: str) -> list:
    """Split *text* into blocks using the first rule that applies.

    1. If any line starts with a numbered heading ("一、" .. "二十、"), the
       text is cut at every such heading. Each block starts with its heading
       line and runs up to the next heading (or the end of the text). Text
       that precedes the first heading is not returned.
    2. Otherwise, if the text contains two or more non-blank paragraphs
       separated by blank lines, those paragraphs are returned.
    3. Otherwise the text is split into sentences at whitespace that follows
       one of "。", "!", "?", "！" or "？".

    Args:
        text: The full text to split.

    Returns:
        A list of block strings. Whitespace-only paragraphs and sentences are
        omitted, so an empty or blank *text* yields an empty list.
    """
    # Rule 1: slice at heading offsets instead of splitting on the whole
    # heading line, so a heading on the final line (with no trailing newline)
    # still opens its own block.
    heading_starts = [match.start() for match in _HEADING_RE.finditer(text)]
    if heading_starts:
        bounds = heading_starts + [len(text)]
        return [text[begin:end] for begin, end in zip(bounds, bounds[1:])]

    # Rule 2: blank-line separated paragraphs. Blank pieces are dropped so a
    # trailing blank line does not make a single paragraph look like several.
    paragraphs = [part for part in _BLANK_LINE_RE.split(text) if part.strip()]
    if len(paragraphs) > 1:
        return paragraphs

    # Rule 3: a single paragraph is split sentence by sentence.
    return [piece for piece in _SENTENCE_GAP_RE.split(text) if piece.strip()]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Manual smoke test: read a sample Word document, split it, and print each block.
if __name__ == "__main__":
    doc_path = "../Txt/小学数学知识点.docx"
    text = read_word_file(doc_path)
    knowledge = split_text_blocks(text)
    # enumerate(start=1) numbers the blocks from 1 without a hand-kept counter.
    for idx, x in enumerate(knowledge, start=1):
        print(f"发现新的分块:{idx}")
        print(x)
|