main
HuangHai 4 weeks ago
parent af999ae927
commit 14c5906c3f

@ -0,0 +1,37 @@
from Util.WordUtil import *
import re
def split_text_blocks(text):
    """
    Split text into blocks, using the first of three rules that applies.

    1. If any line starts with a Chinese ordinal heading ('一、' through
       '二十、'), split at every heading. Each block starts at its heading
       line and runs up to (not including) the next heading.
    2. Otherwise, if the text contains an empty line between paragraphs,
       split on the empty lines.
    3. Otherwise, treat the text as a single paragraph and split it into
       sentences at whitespace that follows sentence-ending punctuation.

    Args:
        text: The text to split.

    Returns:
        A list of strings, one per block, in document order. For rule 1,
        non-blank text that precedes the first heading is returned as the
        first block instead of being dropped.
    """
    # Rule 1: ordinal headings from one (一) to twenty (二十) at line start.
    heading_re = re.compile(
        r'^(?:二十|十[一二三四五六七八九]|[一二三四五六七八九十])、',
        flags=re.MULTILINE,
    )
    starts = [m.start() for m in heading_re.finditer(text)]
    if starts:
        # Slice between consecutive heading offsets. Unlike splitting on
        # "heading line + newline", this also handles a heading that sits
        # on the last line of the text without a trailing newline.
        bounds = starts + [len(text)]
        blocks = [text[begin:end] for begin, end in zip(bounds, bounds[1:])]
        # Keep any real content that appears before the first heading.
        preamble = text[:starts[0]]
        if preamble.strip():
            blocks.insert(0, preamble)
        return blocks

    # Rule 2: several paragraphs separated by a completely empty line.
    paragraphs = text.split('\n\n')
    if len(paragraphs) > 1:
        return paragraphs

    # Rule 3: a single paragraph; split after sentence-ending punctuation.
    return re.split(r'(?<=[。!?])\s+', text)
# Demo: read the sample Word document and print every block found in it.
if __name__ == "__main__":
    sample_path = "../Txt/小学数学知识点.docx"
    found_blocks = split_text_blocks(read_word_file(sample_path))
    # Number the blocks starting from 1 in the printed output.
    for number, block in enumerate(found_blocks, start=1):
        print(f"发现新的分块:{number}")
        print(block)

@ -6,7 +6,7 @@ from Util.WordUtil import *
# 使用示例
if __name__ == "__main__":
word_file = R"D:\办公\【曲靖】技术参数_教育数据基座20250525(1).docx"
word_file = r"../Txt/小学数学知识点.docx"
content = read_word_file(word_file)
if content:
# 逐行打印,空行不打印

Loading…
Cancel
Save