From d2ed3cb840c584e05e133f9a31d173d370587017 Mon Sep 17 00:00:00 2001
From: HuangHai <10402852@qq.com>
Date: Sat, 28 Jun 2025 10:00:22 +0800
Subject: [PATCH] 'commit'

---
 dsRag/Test/T2_read_word_content.py | 40 +++++++++++++++++++++++++++---
 1 file changed, 36 insertions(+), 4 deletions(-)

diff --git a/dsRag/Test/T2_read_word_content.py b/dsRag/Test/T2_read_word_content.py
index 06bf627a..0e32002b 100644
--- a/dsRag/Test/T2_read_word_content.py
+++ b/dsRag/Test/T2_read_word_content.py
@@ -2,6 +2,7 @@ import docx
 
 
 def read_word_content(docx_path):
+    res = ""
     idx = 0
     """遍历Word文档的每个段落，输出文字或图片标识"""
     try:
@@ -21,14 +22,45 @@ def read_word_content(docx_path):
 
             if has_image:
                 idx = idx + 1
-                print("【图片" + str(idx) + "】")
+                res = res + "\n" + "【图片" + str(idx) + "】"
             elif paragraph.text.strip():
-                print(paragraph.text.strip())
-
+                res = res + "\n" + paragraph.text.strip()
+        return res
     except Exception as e:
         print(f"处理Word文档时出错: {str(e)}")
 
 
+def split_into_blocks(text):
+    """Split text into blocks; a line starting with '问题'/'话题' with a digit in its first 5 chars opens one.
+
+    Returns (1-based index, block) tuples; lines before the first heading and empty lines are dropped."""
+    blocks = []
+    current_block = []
+    in_block = False
+
+    for line in text.splitlines():
+        if line.startswith(('问题', '话题')) and any(c.isdigit() for c in line[:5]):
+            if in_block and current_block:
+                blocks.append('\n'.join(current_block))
+                current_block = []
+            in_block = True
+            # Strip heading markers and digits. The remainder may become empty
+            # (e.g. a bare "问题1"); that ends the loop and the line is skipped below.
+            while line and (line.startswith(('问题', '话题')) or line[0].isdigit()):
+                line = line[2:] if line.startswith(('问题', '话题')) else line[1:]
+                line = line.strip()
+        if in_block and line:  # keep only non-empty lines
+            current_block.append(line)
+
+    if current_block:
+        blocks.append('\n'.join(current_block))
+
+    return [(i + 1, block) for i, block in enumerate(blocks)]
+
+
 if __name__ == "__main__":
     word_document_path = "d:\\dsWork\\dsProject\\dsRag\\static\\Test\\带图的WORD文档_MATH_3.docx"
-    read_word_content(word_document_path)
+    res = read_word_content(word_document_path)
+    # read_word_content returns None after a handled error; fall back to "" so nothing is split.
+    for _, block in split_into_blocks(res or ""):
+        print(block)