# Markers that open a new block when followed closely by a number.
_HEADING_PREFIXES = ('问题', '话题')


def _strip_heading_marker(line):
    """Return *line* without its leading '问题'/'话题' marker and number."""
    for prefix in _HEADING_PREFIXES:
        if line.startswith(prefix):
            line = line[len(prefix):]
            break
    # Skip the run of digits that numbers the heading.  Walking an index
    # always terminates, even when nothing follows the number.
    pos = 0
    while pos < len(line) and line[pos].isdigit():
        pos += 1
    return line[pos:].strip()


def split_into_blocks(text):
    """Split *text* into numbered blocks at '问题N' / '话题N' heading lines.

    A line is a heading when it starts with '问题' or '话题' and one of its
    first five characters is a digit.  The marker and the digits directly
    after it are removed once; the rest of the heading line is kept as the
    first line of the new block.  Lines before the first heading are
    ignored, and empty lines are never stored.

    Args:
        text: The text to split.  ``None`` and ``""`` are accepted and give
            an empty result, so the output of a failed document read can be
            passed straight in.

    Returns:
        A list of ``(index, block)`` tuples with ``index`` starting at 1 and
        ``block`` the block's lines joined by ``"\\n"``.  A heading that is
        immediately followed by another heading produces an empty block; a
        final heading with no content produces none.
    """
    if not text:
        return []

    blocks = []
    current_block = []
    in_block = False

    for line in text.splitlines():
        is_heading = line.startswith(_HEADING_PREFIXES) and any(
            c.isdigit() for c in line[:5]
        )
        if is_heading:
            if in_block:
                # Close the block that the previous heading opened.
                blocks.append('\n'.join(current_block))
                current_block = []
            in_block = True
            line = _strip_heading_marker(line)
        if in_block and line:
            current_block.append(line)

    if current_block:
        blocks.append('\n'.join(current_block))

    return list(enumerate(blocks, start=1))


if __name__ == "__main__":
    # read_word_content is the Word reader defined in
    # dsRag/Test/T2_read_word_content.py; only part of it is visible in this
    # patch, so it is not reproduced here.
    word_document_path = "d:\\dsWork\\dsProject\\dsRag\\static\\Test\\带图的WORD文档_MATH_3.docx"
    res = read_word_content(word_document_path)
    q = split_into_blocks(res)
    for x in q:
        print(x[1])