# Reconstructed from a git diff that had been collapsed onto a single line.
# This module is the new file dsRag/Test/T10_ReadMathWord.py from that diff.
# The same diff also:
#   * changed `word_file` in dsRag/Test/T8_TestReadWord.py from
#     R"D:\办公\【曲靖】技术参数_教育数据基座20250525(1).docx"
#     to r"../Txt/小学数学知识点.docx"
#   * added the binary file dsRag/Txt/小学数学知识点.docx
import re

# A line that starts with a Chinese ordinal from "一、" to "二十、".
# Longer alternatives come first so "二十、" / "十一、" are tried before the
# single-character form.
_HEADING_RE = re.compile(
    r'^(?:二十|十[一二三四五六七八九]|[一二三四五六七八九十])、',
    flags=re.MULTILINE,
)

# Whitespace that directly follows sentence-ending punctuation
# (both ASCII and full-width exclamation/question marks are accepted).
_SENTENCE_END_RE = re.compile(r'(?<=[。!?！？])\s+')


def split_text_blocks(text):
    """Split *text* into blocks using the first rule that applies.

    Rules, in order of precedence:

    1. If any line starts with a numbered heading ('一、' through '二十、'),
       split at each heading; every block is a heading line plus the text
       up to the next heading. Non-blank text that precedes the first
       heading is kept as its own leading block.
    2. Otherwise, if the text contains a blank line (``'\\n\\n'``), split
       on blank lines.
    3. Otherwise, split into sentences at whitespace that follows
       sentence-ending punctuation.

    Args:
        text: The text to split. ``None`` or an empty string is accepted.

    Returns:
        A list of string blocks; an empty list when *text* is empty/None.
    """
    if not text:
        return []

    # Rule 1: slice between heading start offsets. Slicing (instead of
    # re.split on whole heading lines) keeps a final heading that has no
    # trailing newline and never loses the text around the headings.
    starts = [match.start() for match in _HEADING_RE.finditer(text)]
    if starts:
        blocks = []
        preamble = text[:starts[0]]
        if preamble.strip():
            blocks.append(preamble)
        bounds = starts + [len(text)]
        blocks.extend(text[begin:end] for begin, end in zip(bounds, bounds[1:]))
        return blocks

    # Rule 2: multiple paragraphs separated by a blank line.
    paragraphs = text.split('\n\n')
    if len(paragraphs) > 1:
        return paragraphs

    # Rule 3: a single paragraph, split by sentence.
    return _SENTENCE_END_RE.split(text)


def main():
    """Read the sample Word document and print each block it splits into."""
    # Imported here so that importing this module for split_text_blocks
    # does not require the project's Util package.
    from Util.WordUtil import read_word_file

    doc_path = "../Txt/小学数学知识点.docx"
    text = read_word_file(doc_path)
    for idx, block in enumerate(split_text_blocks(text), start=1):
        print(f"发现新的分块:{idx}")
        print(block)


# Manual test entry point.
if __name__ == "__main__":
    main()