import os
from typing import List, Optional, Tuple

# Line prefixes that mark the start of a new block. Both prefixes are exactly
# two characters long, which _strip_heading_prefix relies on.
_HEADING_PREFIXES = ('问题', '话题')


def _paragraph_has_image(paragraph) -> bool:
    """Return True if any run in *paragraph* has a child element whose tag ends with 'drawing'."""
    return any(
        element.tag.endswith('drawing')
        for run in paragraph.runs
        for element in run._element
    )


def read_word_content(docx_path: str) -> Optional[str]:
    """Walk every paragraph of a Word document and emit its text or an image marker.

    A paragraph containing a drawing element is replaced by a numbered
    placeholder ("【图片N】", numbered from 1 in document order). Any other
    paragraph contributes its stripped text; paragraphs that are empty after
    stripping are skipped.

    Args:
        docx_path: Path of the .docx file to read.

    Returns:
        The collected pieces, each one preceded by a newline (so a non-empty
        result starts with "\\n"), or None if the document could not be
        processed (the error is printed).
    """
    # Imported lazily so the text-splitting and file-saving helpers stay usable
    # without python-docx installed. Kept outside the try block so a missing
    # dependency is raised loudly instead of being reported as a document error.
    import docx

    try:
        doc = docx.Document(docx_path)
        pieces = []
        image_count = 0
        for paragraph in doc.paragraphs:
            if _paragraph_has_image(paragraph):
                image_count += 1
                pieces.append(f"【图片{image_count}】")
                continue
            text = paragraph.text.strip()
            if text:
                pieces.append(text)
        # Every piece is prefixed (not separated) by a newline, matching the
        # output format callers already consume.
        return "".join("\n" + piece for piece in pieces)
    except Exception as e:
        print(f"处理Word文档时出错: {str(e)}")
        return None


def _strip_heading_prefix(line: str) -> str:
    """Remove leading heading prefixes and digits from *line*.

    Repeatedly drops a leading '问题'/'话题' or a single leading digit and
    strips surrounding whitespace. Every iteration shortens the line, so the
    loop always terminates; the result may be an empty string when the line
    consisted only of a prefix and a number.
    """
    while line:
        if line.startswith(_HEADING_PREFIXES):
            line = line[2:]
        elif line[0].isdigit():
            line = line[1:]
        else:
            break
        line = line.strip()
    return line


def split_into_blocks(text: Optional[str]) -> List[Tuple[int, str]]:
    """Split text into blocks that start at '问题X' / '话题X' heading lines.

    A line is a heading when it starts with '问题' or '话题' and has a digit
    within its first five characters. The heading's prefix and number are
    removed, and whatever remains of that line becomes the first line of the
    new block. Lines before the first heading are discarded. Non-heading lines
    are kept unchanged; empty lines are dropped.

    Args:
        text: The text to split. None or an empty string yields no blocks.

    Returns:
        A list of (block_number, block_text) tuples, numbered from 1.
    """
    if not text:
        return []

    blocks = []
    current_block = []
    in_block = False

    for line in text.splitlines():
        is_heading = (
            line.startswith(_HEADING_PREFIXES)
            and any(c.isdigit() for c in line[:5])
        )
        if is_heading:
            if in_block:
                # Close the previous block, even if it collected no lines.
                blocks.append('\n'.join(current_block))
                current_block = []
            in_block = True
            line = _strip_heading_prefix(line)

        if in_block and line:  # only keep non-empty lines
            current_block.append(line)

    if current_block:
        blocks.append('\n'.join(current_block))

    return [(i + 1, block) for i, block in enumerate(blocks)]


def save_to_txt(content: str, file_path: str, mode: str = 'w') -> bool:
    """Write *content* to a UTF-8 text file.

    Args:
        content: Text to write.
        file_path: Destination file path.
        mode: Mode passed to open(); defaults to 'w' (overwrite).

    Returns:
        True on success; False if any error occurred (the error is printed).
    """
    try:
        with open(file_path, mode, encoding='utf-8') as f:
            f.write(content)
        return True
    except Exception as e:
        print(f"保存文件{file_path}时出错: {str(e)}")
        return False


def main() -> None:
    """Read the sample document, print each block, and save one .txt file per block."""
    word_document_path = "d:\\dsWork\\dsProject\\dsRag\\static\\Test\\带图的WORD文档_MATH_3.docx"
    output_dir = "d:\\dsWork\\dsProject\\dsRag\\static\\Test\\"

    res = read_word_content(word_document_path)
    if res is None:
        # read_word_content has already printed the error; nothing to split.
        return

    chunks = split_into_blocks(res)

    for _, block in chunks:
        print("===段落开始:===")
        # The first line of a block is its title; the rest is the body.
        head, _sep, rest = block.partition("\n")
        first_line = head.strip()
        content = rest.strip()
        print("firstLine=" + first_line)
        print("content=" + content)
        print("===段落结束:===\n")

    saved_count = 0
    for chunk_num, chunk in chunks:
        chunk = chunk.strip()  # make sure surrounding whitespace is removed
        output_file = os.path.join(output_dir, f"{chunk_num}.txt")
        if save_to_txt(chunk, output_file, mode='w'):
            saved_count += 1

    print(f"处理完成,共保存{saved_count}个文件到目录: {output_dir}")


if __name__ == "__main__":
    main()