|
|
|
@ -1,3 +1,5 @@
|
|
|
|
|
import os
|
|
|
|
|
|
|
|
|
|
import docx
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@ -58,14 +60,35 @@ def split_into_blocks(text):
|
|
|
|
|
return [(i + 1, block) for i, block in enumerate(blocks)]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# 保留原有的save_to_txt函数
|
|
|
|
|
def save_to_txt(content, file_path, mode='w'):
    """Write text to a file as UTF-8, reporting success as a boolean.

    Args:
        content: Text to write.
        file_path: Destination path of the text file.
        mode: Mode string passed to ``open`` ('w' overwrites, 'a' appends).

    Returns:
        True if the content was written; False if any error occurred
        (the error is printed instead of being raised).
    """
    # Best-effort helper: callers count successes, so failures are reported
    # on stdout and signalled through the return value rather than raised.
    try:
        with open(file_path, mode, encoding='utf-8') as f:
            f.write(content)
    except Exception as e:
        print(f"保存文件{file_path}时出错: {e}")
        return False
    return True
|
|
|
|
|
|
|
|
|
|
if __name__ == "__main__":
    # Script entry point: read a Word document, split it into numbered
    # blocks, print each block's title line and body, then save every block
    # to "<number>.txt" in the output directory.
    word_document_path = "d:\\dsWork\\dsProject\\dsRag\\static\\Test\\带图的WORD文档_MATH_3.docx"
    output_dir = "d:\\dsWork\\dsProject\\dsRag\\static\\Test\\"

    res = read_word_content(word_document_path)
    chunks = split_into_blocks(res)

    for _, block_text in chunks:
        print("===段落开始:===")
        # Split on the first newline instead of slicing by the length of the
        # stripped first line: slicing mis-cuts the body whenever the block
        # starts with whitespace.
        head, _, tail = block_text.partition("\n")
        first_line = head.strip()
        content = tail.strip()
        print("firstLine=" + first_line)
        print("content=" + content)
        print("===段落结束:===\n")

    saved_count = 0
    for chunk_num, chunk in chunks:
        chunk = chunk.strip()  # drop surrounding whitespace before saving
        output_file = os.path.join(output_dir, f"{chunk_num}.txt")
        if save_to_txt(chunk, output_file, mode='w'):
            saved_count += 1

    print(f"处理完成,共保存{saved_count}个文件到目录: {output_dir}")
|
|
|
|
|