'commit'

4 weeks ago · 14c5906c3f
parent af999ae927
commit 14c5906c3f
3 changed files with 38 additions and 1 deletion
--- a/dsRag/Test/T10_ReadMathWord.py
+++ b/dsRag/Test/T10_ReadMathWord.py
@ -0,0 +1,37 @@
+from Util.WordUtil import *
+import re
+
+
+def split_text_blocks(text):
+    """
+    Split text into blocks using the first rule that applies:
+    1. Lines starting with a numbered heading '一、' .. '二十、': one block per heading
+       (non-blank text before the first heading is kept as its own leading block).
+    2. Otherwise, blank-line-separated paragraphs: one block per paragraph.
+    3. Otherwise, a single paragraph: split after sentence-ending 。！？ + whitespace.
+    """
+    # Rule 1: a zero-width lookahead split keeps each heading attached to its content and,
+    # unlike matching '.*\n', still works when the last heading has no trailing newline.
+    heading = r'^(?=(?:二十|十[一二三四五六七八九]|[一二三四五六七八九十])、)'
+    if re.search(heading, text, flags=re.MULTILINE):
+        return [b for b in re.split(heading, text, flags=re.MULTILINE) if b.strip()]
+
+    # Rule 2: paragraphs separated by a completely empty line
+    paragraphs = text.split('\n\n')
+    if len(paragraphs) > 1:
+        return paragraphs
+
+    # Rule 3: sentence-level split for a single paragraph
+    return re.split(r'(?<=[。！？])\s+', text)
+
+
+# Manual check: print every block found in the sample Word document
+if __name__ == "__main__":
+    doc_path = "../Txt/小学数学知识点.docx"
+    text = read_word_file(doc_path)
+    # The sibling T8 script guards read_word_file's result with a truthiness check, so it
+    # may be empty/None; fall back to no blocks instead of crashing inside re.search.
+    knowledge = split_text_blocks(text) if text else []
+    for idx, x in enumerate(knowledge, start=1):
+        print(f"发现新的分块：{idx}")
+        print(x)
--- a/dsRag/Test/T8_TestReadWord.py
+++ b/dsRag/Test/T8_TestReadWord.py
@ -6,7 +6,7 @@ from Util.WordUtil import *

 # 使用示例
 if __name__ == "__main__":
-    word_file = R"D:\办公\【曲靖】技术参数_教育数据基座20250525(1).docx"
+    word_file = r"../Txt/小学数学知识点.docx"
    content = read_word_file(word_file)
    if content:
        # 逐行打印，空行不打印
--- a/dsRag/Txt/小学数学知识点.docx
+++ b/dsRag/Txt/小学数学知识点.docx