try:
    import docx
except ImportError:
    # python-docx is only needed by read_docx; keep the module importable so
    # that the pure-text split_text helper still works without it.
    docx = None


class SplitDocxUtil:
    """Helpers for reading a .docx file and splitting text into paragraph chunks."""

    @staticmethod
    def read_docx(file_path):
        """Read the text content of a .docx file.

        Args:
            file_path: Value handed to ``docx.Document``.

        Returns:
            The text of every non-empty paragraph joined with a single
            newline. On any failure (including python-docx not being
            installed) the error is printed and an empty string is returned.
        """
        try:
            if docx is None:
                # Routed through the handler below so callers get the same
                # best-effort "print and return empty" behaviour.
                raise ImportError("python-docx is not installed")
            doc = docx.Document(file_path)
            return "\n".join(para.text for para in doc.paragraphs if para.text)
        except Exception as e:
            # Deliberately best-effort: report the problem, never raise.
            print(f"读取docx文件出错: {e}")
            return ""

    @staticmethod
    def split_text(text, context_size=200):
        """Split text into paragraphs, each wrapped with neighbouring context.

        Lines are separated on ``\\n`` (``\\r\\n`` is normalised first), so
        single and repeated line breaks are treated alike. Each line is
        stripped and blank lines are discarded.

        Args:
            text: The text to split.
            context_size: Maximum number of characters taken from the end of
                the previous paragraph and from the start of the next one.
                Zero or a negative value yields no context.

        Returns:
            A list of ``(index, chunk)`` tuples where ``index`` is the
            1-based paragraph number and ``chunk`` is
            ``"<prev context>\\n\\n<paragraph>\\n\\n<next context>"``. The
            context parts are empty strings at the document edges.
        """
        # Clamp so that a negative size cannot turn into a nonsensical slice.
        window = max(context_size, 0)

        stripped_lines = (
            line.strip() for line in text.replace("\r\n", "\n").split("\n")
        )
        paragraphs = [line for line in stripped_lines if line]

        last_index = len(paragraphs) - 1
        chunks = []
        for i, para in enumerate(paragraphs):
            # s[-0:] is the whole string, so a zero window has to be
            # special-cased for the preceding context.
            if window and i > 0:
                prev_context = paragraphs[i - 1][-window:]
            else:
                prev_context = ""
            next_context = paragraphs[i + 1][:window] if i < last_index else ""
            chunks.append((i + 1, f"{prev_context}\n\n{para}\n\n{next_context}"))
        return chunks