You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

53 lines
1.8 KiB

1 month ago
import re
1 month ago
import os
from Util.SplitDocxUtil import SplitDocxUtil
1 month ago
def split_into_blocks(text):
    """Cut ``text`` into numbered blocks headed by "问题<N>" or "话题<N>".

    A block starts at a heading ("问题" or "话题" followed by digits) and
    runs up to the next such heading or the end of the text. Anything that
    precedes the first heading is not part of any block.

    Returns:
        A list of ``(index, block_text)`` tuples. ``index`` is the 1-based
        position of the block in the text (not the number in its heading);
        ``block_text`` is the heading followed by its content.
    """
    heading_re = r'(问题\d+|话题\d+)([\s\S]+?)(?=问题\d+|话题\d+|$)'
    numbered = []
    matches = re.finditer(heading_re, text, re.DOTALL)
    for position, found in enumerate(matches, start=1):
        heading, body = found.groups()
        numbered.append((position, heading + body))
    return numbered
1 month ago
def process_document(input_path, output_dir):
    """Split a document into numbered text files, one file per block.

    The document text is read with ``SplitDocxUtil.read_docx``, cut into
    blocks by ``split_into_blocks``, and every block is written to
    ``<output_dir>/<n>.txt`` through ``save_to_txt``.

    Args:
        input_path: Path handed to ``SplitDocxUtil.read_docx``.
        output_dir: Directory that receives the ``<n>.txt`` files. Entries
            directly inside it (other than real subdirectories) are deleted
            first; the directory is created if it does not exist.

    Returns:
        ``True`` if at least one block file was saved, ``False`` if the
        reader returned no text or no file could be saved.
    """
    text = SplitDocxUtil.read_docx(input_path)
    if not text:
        print("无法读取输入文件内容")
        return False

    # Empty the output directory, then make sure it exists.
    if os.path.isdir(output_dir):
        for entry in os.listdir(output_dir):
            entry_path = os.path.join(output_dir, entry)
            # os.remove() cannot delete a directory; leave real
            # subdirectories alone instead of aborting the whole run.
            if os.path.isdir(entry_path) and not os.path.islink(entry_path):
                continue
            os.remove(entry_path)
    os.makedirs(output_dir, exist_ok=True)

    chunks = split_into_blocks(text)
    print(f"共分割出{len(chunks)}个段落块")

    saved_count = 0
    for chunk_num, chunk in chunks:
        output_file = os.path.join(output_dir, f"{chunk_num}.txt")
        # Strip surrounding whitespace before writing the block.
        if save_to_txt(chunk.strip(), output_file, mode='w'):
            saved_count += 1

    print(f"处理完成,共保存{saved_count}个文件到目录: {output_dir}")
    return saved_count > 0
1 month ago
1 month ago
# Keep the original save_to_txt function
1 month ago
def save_to_txt(content, file_path, mode='w'):
    """Write ``content`` to ``file_path`` as UTF-8 text.

    Args:
        content: Text to write.
        file_path: Destination file path.
        mode: Mode string passed to ``open`` (default ``'w'``).

    Returns:
        ``True`` when the write succeeded. Any exception is caught, reported
        on stdout, and turned into a ``False`` return value.
    """
    try:
        with open(file_path, mode, encoding='utf-8') as handle:
            handle.write(content)
    except Exception as e:
        print(f"保存文件{file_path}时出错: {str(e)}")
        return False
    else:
        return True
if __name__ == "__main__":
    # Script entry point: split the sample document into per-block text
    # files. Both paths are relative to the current working directory.
    process_document(
        input_path='../Txt/小学数学(史校长).docx',
        output_dir='../Txt/processed_chunks',
    )