"""Extract text and image placeholders from a Word file and split it into blocks."""

try:
    import docx
except ImportError:
    # python-docx is only needed by read_word_content(); keep the pure-text
    # helpers importable without it and fail at the point of use instead.
    docx = None

# Line prefixes ("question" / "topic") that mark the start of a new block.
# Both are exactly two characters long.
_BLOCK_PREFIXES = ('问题', '话题')


def read_word_content(docx_path):
    """Return the document's paragraphs as text, one per line.

    Each paragraph that contains a drawing element is replaced by a numbered
    image placeholder ("【图片N】", numbered from 1 in document order); the
    paragraph's own text is not emitted in that case. Other paragraphs
    contribute their stripped text, and paragraphs that are empty after
    stripping are skipped. Every emitted entry is preceded by a newline, so a
    non-empty result starts with "\\n".

    Args:
        docx_path: Path of the .docx file, passed to ``docx.Document``.

    Returns:
        The assembled text, or ``None`` if the document could not be
        processed (the error is printed, not raised).

    Raises:
        ImportError: If python-docx is not installed.
    """
    if docx is None:
        raise ImportError("python-docx is required to read Word documents")

    try:
        doc = docx.Document(docx_path)
        parts = []
        image_count = 0
        for paragraph in doc.paragraphs:
            # A run whose XML has a child tag ending in 'drawing' holds an image.
            has_image = any(
                element.tag.endswith('drawing')
                for run in paragraph.runs
                for element in run._element
            )
            if has_image:
                image_count += 1
                parts.append("【图片" + str(image_count) + "】")
                continue
            text = paragraph.text.strip()
            if text:
                parts.append(text)
        return "".join("\n" + part for part in parts)
    except Exception as e:
        # Best effort: report the failure and signal it with None.
        print(f"处理Word文档时出错: {e}")
        return None


def _is_block_heading(line):
    """Return True if the line starts with a block prefix and has a digit in its first five characters."""
    return line.startswith(_BLOCK_PREFIXES) and any(c.isdigit() for c in line[:5])


def _strip_heading_prefix(line):
    """Remove the leading prefix and its number from a heading line, keeping the rest."""
    rest = line[2:].lstrip()
    digits = 0
    while digits < len(rest) and rest[digits].isdigit():
        digits += 1
    return rest[digits:].strip()


def split_into_blocks(text):
    """Split text into numbered blocks at "问题N" / "话题N" heading lines.

    A heading line starts a new block. Its prefix and number are removed
    exactly once and whatever follows on the same line is kept as the block's
    first line (a heading with nothing after the number contributes no line).
    Lines before the first heading are ignored, as are empty lines.

    Args:
        text: The text to split; ``None`` or an empty string yields ``[]``.

    Returns:
        A list of ``(index, block_text)`` tuples, with ``index`` counting
        from 1 and the block's lines joined by newlines.
    """
    if not text:
        # read_word_content() returns None on failure; treat that as no blocks.
        return []

    blocks = []
    current_block = []
    in_block = False

    for line in text.splitlines():
        if _is_block_heading(line):
            if in_block:
                blocks.append('\n'.join(current_block))
                current_block = []
            in_block = True
            # Strip the prefix once. The previous loop-based stripping never
            # terminated for a bare heading such as "问题1" and also removed
            # prefixes/digits that belonged to the content itself.
            line = _strip_heading_prefix(line)

        if in_block and line:  # only keep non-empty lines
            current_block.append(line)

    if current_block:
        blocks.append('\n'.join(current_block))

    return [(i + 1, block) for i, block in enumerate(blocks)]


if __name__ == "__main__":
    word_document_path = "d:\\dsWork\\dsProject\\dsRag\\static\\Test\\带图的WORD文档_MATH_3.docx"
    res = read_word_content(word_document_path)
    q = split_into_blocks(res)
    for x in q:
        print(x[1])