You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

36 lines
1.3 KiB

import docx
class SplitDocxUtil:
    """Helpers for reading a .docx file and splitting text into paragraph chunks."""

    @staticmethod
    def read_docx(file_path) -> str:
        """Read the paragraph text of a .docx file.

        Args:
            file_path: Passed unchanged to ``docx.Document``.

        Returns:
            The text of every non-empty paragraph in ``doc.paragraphs``,
            joined with a single newline. If anything raises while opening
            or reading the document, the error is printed and an empty
            string is returned instead.
        """
        try:
            doc = docx.Document(file_path)
            # Read each paragraph's text once, then drop the empty ones.
            texts = (para.text for para in doc.paragraphs)
            return "\n".join(t for t in texts if t)
        except Exception as e:
            # Deliberate best-effort: report the failure and fall back to "".
            print(f"读取docx文件出错: {str(e)}")
            return ""

    @staticmethod
    def split_text(text: str, context_size: int = 200) -> list:
        """Split text into paragraphs, each wrapped with neighbouring context.

        Every non-blank line is treated as one paragraph; runs of consecutive
        line breaks (LF, CRLF or a lone CR) count as a single separator, and
        surrounding whitespace is stripped from each paragraph.

        Args:
            text: The text to split. ``None`` or an empty string yields ``[]``.
            context_size: Maximum number of characters taken from the end of
                the previous paragraph and from the start of the next one.
                Zero or a negative value disables the context entirely.

        Returns:
            A list of ``(index, chunk)`` tuples, where ``index`` is the
            1-based paragraph number and ``chunk`` is the previous context,
            the paragraph and the next context joined by blank lines. The
            first paragraph has an empty previous context and the last one
            an empty next context.
        """
        if not text:
            return []

        # Normalise every line-break flavour to "\n". Blank lines produced by
        # consecutive breaks are discarded by the filter below.
        normalized_text = text.replace("\r\n", "\n").replace("\r", "\n")
        stripped_lines = (line.strip() for line in normalized_text.split("\n"))
        paragraphs = [line for line in stripped_lines if line]

        # Clamp to zero: a slice such as s[-0:] would return the WHOLE string,
        # so a zero-sized window has to be special-cased for the previous side.
        window = max(context_size, 0)
        last_index = len(paragraphs) - 1

        chunks = []
        for i, para in enumerate(paragraphs):
            prev_context = paragraphs[i - 1][-window:] if window and i > 0 else ""
            next_context = paragraphs[i + 1][:window] if i < last_index else ""
            chunks.append((i + 1, f"{prev_context}\n\n{para}\n\n{next_context}"))
        return chunks