from Util.WordUtil import *
import re

# A numbered section heading at the start of a line: a Chinese numeral
# followed by the enumeration comma, e.g. "一、", "十二、", "二十、", "二十一、".
_HEADING_RE = re.compile(r'^[零一二三四五六七八九十百]+、', flags=re.MULTILINE)

# One or more blank lines between paragraphs. A blank line may hold spaces or
# tabs, and line endings may be LF or CRLF.
_BLANK_LINE_RE = re.compile(r'\r?\n(?:[ \t]*\r?\n)+')

# Whitespace directly after sentence-ending punctuation (ASCII or full-width).
_SENTENCE_END_RE = re.compile(r'(?<=[。!?!?])\s+')


def split_text_blocks(text):
    """Split *text* into blocks using the first rule that applies.

    1. If any line starts with a Chinese-numeral heading ("一、", "二、", ...),
       return one block per heading. Each block runs from the start of its
       heading line up to the next heading (or the end of the text). Text
       before the first heading is discarded.
    2. Otherwise, if blank lines separate more than one non-empty paragraph,
       return those paragraphs.
    3. Otherwise, split after sentence-ending punctuation that is followed
       by whitespace.

    Args:
        text: The text to split. ``None`` or an empty string yields ``[]``.

    Returns:
        A list of non-empty strings. Block content is not stripped.
    """
    if not text:
        return []

    # Rule 1: slice from each heading start to the next one. Slicing by
    # position (rather than splitting on "heading line + newline") keeps a
    # heading that sits on the final line without a trailing newline.
    heading_starts = [match.start() for match in _HEADING_RE.finditer(text)]
    if heading_starts:
        bounds = heading_starts + [len(text)]
        return [text[begin:end] for begin, end in zip(bounds, bounds[1:])]

    # Rule 2: paragraphs separated by blank lines; whitespace-only pieces
    # (e.g. from leading/trailing blank lines) are not real paragraphs.
    paragraphs = [part for part in _BLANK_LINE_RE.split(text) if part.strip()]
    if len(paragraphs) > 1:
        return paragraphs

    # Rule 3: a single paragraph is split into sentences.
    return [part for part in _SENTENCE_END_RE.split(text) if part.strip()]


def main():
    """Read the sample Word document and print every block that was found."""
    doc_path = "../Txt/小学数学知识点.docx"
    text = read_word_file(doc_path)
    knowledge = split_text_blocks(text)
    for idx, block in enumerate(knowledge, start=1):
        print(f"发现新的分块:{idx}")
        print(block)


# Manual test entry point
if __name__ == "__main__":
    main()