You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
72 lines
2.5 KiB
72 lines
2.5 KiB
4 weeks ago
|
import os
|
||
|
|
||
|
from Util.SplitDocxUtil import SplitDocxUtil
|
||
|
|
||
4 weeks ago
|
|
||
4 weeks ago
|
def split_into_blocks(text):
    """Split *text* into numbered blocks, one per '问题N' / '话题N' heading.

    The text is scanned line by line. A line that starts with '问题' or
    '话题' directly followed by a number (optionally separated by
    whitespace) opens a new block. The heading prefix, together with any
    separator right after the number (whitespace, colon, enumeration comma,
    period), is removed; whatever remains of the heading line becomes the
    first line of the block. Lines before the first heading are ignored and
    empty lines are never stored.

    Args:
        text: The full document text.

    Returns:
        A list of ``(index, block_text)`` tuples. ``index`` starts at 1 and
        ``block_text`` is the block's non-empty lines joined with newlines.
        Headings that have no content at all produce no block.
    """
    import re  # function-scope import keeps this block self-contained

    # The number must follow the keyword directly, so ordinary sentences that
    # merely start with '问题' and contain a digit nearby are not headings.
    heading_re = re.compile(r'(?:问题|话题)\s*\d+[\s::、..]*')

    blocks = []
    current_block = None  # None until the first heading has been seen

    for line in text.splitlines():
        heading = heading_re.match(line)
        if heading:
            # Close the previous block, skipping headings without content so
            # that mid-stream and trailing empty blocks are treated alike.
            if current_block:
                blocks.append('\n'.join(current_block))
            current_block = []
            # Keep the title text that follows the heading prefix.
            line = line[heading.end():]

        if current_block is not None and line:  # only store non-empty lines
            current_block.append(line)

    if current_block:
        blocks.append('\n'.join(current_block))

    return list(enumerate(blocks, start=1))
|
||
4 weeks ago
|
|
||
|
def process_document(input_path, output_dir):
    """Read a .docx file, split it into blocks and save each block as text.

    The document text is obtained from ``SplitDocxUtil.read_docx`` and split
    with ``split_into_blocks``. Each block is stripped and written to
    ``<output_dir>/<block number>.txt`` via ``save_to_txt``. Files already
    present in ``output_dir`` are deleted first.

    Args:
        input_path: Path of the .docx file to read.
        output_dir: Directory that receives the numbered ``.txt`` files; it
            is created when missing.

    Returns:
        True when at least one block file was saved, otherwise False (also
        when the reader returned no text).
    """
    text = SplitDocxUtil.read_docx(input_path)
    if not text:
        print("无法读取输入文件内容")
        return False

    # Make sure the output directory exists and holds no stale files.
    if os.path.isdir(output_dir):
        for name in os.listdir(output_dir):
            entry_path = os.path.join(output_dir, name)
            # os.remove() raises on directories, so only files and symlinks
            # are deleted; sub-directories are left untouched.
            if os.path.isfile(entry_path) or os.path.islink(entry_path):
                os.remove(entry_path)
    os.makedirs(output_dir, exist_ok=True)

    chunks = split_into_blocks(text)
    print(f"共分割出{len(chunks)}个段落块")

    saved_count = 0
    for chunk_num, chunk in chunks:
        chunk = chunk.strip()  # drop surrounding whitespace before saving
        output_file = os.path.join(output_dir, f"{chunk_num}.txt")
        if save_to_txt(chunk, output_file, mode='w'):
            saved_count += 1

    print(f"处理完成,共保存{saved_count}个文件到目录: {output_dir}")
    return saved_count > 0
|
||
4 weeks ago
|
|
||
4 weeks ago
|
# 保留原有的save_to_txt函数
|
||
4 weeks ago
|
def save_to_txt(content, file_path, mode='w'):
    """Write *content* to *file_path* as UTF-8 text.

    Args:
        content: The text to write.
        file_path: Destination file path.
        mode: Mode string passed to ``open`` (default ``'w'``).

    Returns:
        True when the write succeeded. On any exception an error message is
        printed and False is returned instead of raising.
    """
    try:
        with open(file_path, mode, encoding='utf-8') as handle:
            handle.write(content)
    except Exception as err:
        # Best-effort save: report the problem and let the caller carry on.
        print(f"保存文件{file_path}时出错: {str(err)}")
        return False
    return True
|
||
|
|
||
|
if __name__ == "__main__":
    # Sample run: split one document into per-topic text files.
    # Other sample inputs that were used with this script:
    #   '../static/Txt/小学数学知识点_MATH_2.docx'
    #   '../static/Txt/高中文言文_CHINESE_1.docx'
    input_file = '../../static/Txt/小学数学教学中的若干问题_MATH_1.docx'
    output_dir = '../Txt/processed_chunks'
    process_document(input_file, output_dir)
|