You cannot select more than 25 topics.
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
|
|
|
|
import docx
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class SplitDocxUtil:
    """Utilities for reading a docx file and splitting text into chunks."""

    @staticmethod
    def read_docx(file_path) -> str:
        """Read the text content of a docx file.

        Paragraphs with empty text are skipped; the remaining paragraph
        texts are joined with a single newline.

        Args:
            file_path: Passed unchanged to ``docx.Document``.

        Returns:
            The joined paragraph text, or ``""`` if any error occurs while
            opening or reading the document (the error is printed, not
            raised).
        """
        try:
            doc = docx.Document(file_path)
            return "\n".join(para.text for para in doc.paragraphs if para.text)
        except Exception as e:
            # Deliberate best-effort: report the problem and fall back to an
            # empty string so callers can continue.
            print(f"读取docx文件出错: {str(e)}")
            return ""

    @staticmethod
    def split_text(text: str, context_size: int = 200) -> list:
        """Split text into paragraphs, each wrapped with neighbouring context.

        Every newline (``\\n`` or ``\\r\\n``) is a paragraph boundary, so
        single and double line breaks are handled the same way. Paragraphs
        are stripped of surrounding whitespace and blank ones are dropped.

        Args:
            text: The text to split. ``None`` or an empty string yields an
                empty list.
            context_size: Maximum number of characters taken from the end of
                the previous paragraph and from the start of the next one.
                Zero or negative values mean "no context".

        Returns:
            A list of ``(index, chunk)`` tuples where ``index`` is the
            1-based paragraph number and ``chunk`` is
            ``"{prev_context}\\n\\n{paragraph}\\n\\n{next_context}"``. The
            separators are always present, even when a context is empty.
        """
        if not text:
            return []

        # Negative sizes would turn the slices below into "drop N chars"
        # instead of "keep N chars"; treat them as no context.
        context_size = max(context_size, 0)

        # Splitting on '\n' and discarding blank lines already copes with
        # consecutive line breaks, so no further normalisation is needed.
        stripped_lines = (
            line.strip() for line in text.replace('\r\n', '\n').split('\n')
        )
        paragraphs = [line for line in stripped_lines if line]

        chunks = []
        last_index = len(paragraphs) - 1
        for i, para in enumerate(paragraphs):
            # s[-0:] is the whole string, not an empty one, so the previous
            # context must be guarded explicitly when context_size is 0.
            if i > 0 and context_size:
                prev_context = paragraphs[i - 1][-context_size:]
            else:
                prev_context = ""
            next_context = paragraphs[i + 1][:context_size] if i < last_index else ""

            chunks.append((i + 1, f"{prev_context}\n\n{para}\n\n{next_context}"))

        return chunks
|