|
|
|
@ -1,6 +1,106 @@
|
|
|
|
|
import re
|
|
|
|
|
import docx
|
|
|
|
|
|
|
|
|
|
import os
|
|
|
|
|
import shutil
|
|
|
|
|
import uuid
|
|
|
|
|
import zipfile
|
|
|
|
|
|
|
|
|
|
from docx import Document
|
|
|
|
|
from docx.oxml.ns import nsmap
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def extract_images_from_docx(docx_path, output_folder):
    """Copy the images referenced by a .docx body into ``output_folder``.

    The paragraphs of the document are walked in order; for every
    ``drawing`` element whose ``a:blip`` carries an ``r:embed`` relationship
    id, the related image is read directly from the .docx archive and
    written to ``output_folder`` under a fresh ``uuid4`` hex name that keeps
    the original file extension.

    :param docx_path: path of the Word document
    :param output_folder: directory that receives the images (created if
        it does not exist)
    :return: list of the generated image file names, in document order
    :raises FileNotFoundError: if ``docx_path`` does not exist or the
        archive has no ``word/_rels/document.xml.rels`` member
    """
    # Local imports: the visible top-of-file import block does not provide
    # these names.
    import posixpath
    import xml.etree.ElementTree as ElementTree

    rels_member = 'word/_rels/document.xml.rels'
    rel_tag = ('{http://schemas.openxmlformats.org/package/2006/'
               'relationships}Relationship')

    image_data = []
    os.makedirs(output_folder, exist_ok=True)

    # Members are read straight from the archive, so no temporary
    # extraction directory is needed and nothing is left behind on errors.
    with zipfile.ZipFile(docx_path, 'r') as archive:
        members = set(archive.namelist())
        try:
            rels_xml = archive.read(rels_member)
        except KeyError as exc:
            raise FileNotFoundError(
                f"{rels_member} not found in {docx_path}") from exc

        # Parsing the bytes lets the XML parser honour the declared
        # encoding and makes the lookup independent of attribute order.
        targets = {
            rel.get('Id'): rel.get('Target')
            for rel in ElementTree.fromstring(rels_xml).iter(rel_tag)
            if rel.get('TargetMode') != 'External'
        }

        doc = Document(docx_path)
        embed_attr = '{%s}embed' % nsmap['r']

        for paragraph in doc.paragraphs:
            for run in paragraph.runs:
                for element in run._element:
                    if not element.tag.endswith('drawing'):
                        continue
                    blip = element.find('.//a:blip', namespaces=nsmap)
                    if blip is None:
                        continue
                    target = targets.get(blip.get(embed_attr))
                    if not target:
                        continue

                    # Targets are relative to the "word/" part unless they
                    # start with "/", in which case they are archive-rooted.
                    if target.startswith('/'):
                        member = target.lstrip('/')
                    else:
                        member = posixpath.normpath(
                            posixpath.join('word', target))
                    if member not in members:
                        continue

                    ext = os.path.splitext(member)[1]
                    img_name = f"{uuid.uuid4().hex}{ext}"
                    dest_path = os.path.join(output_folder, img_name)
                    with archive.open(member) as src, \
                            open(dest_path, 'wb') as dst:
                        shutil.copyfileobj(src, dst)
                    image_data.append(img_name)

    return image_data
|
|
|
|
|
|
|
|
|
|
def read_word_content(docx_path):
    """Return the document text with image paragraphs replaced by markers.

    Paragraphs are visited in order. A paragraph containing at least one
    ``drawing`` element contributes the placeholder ``【图片N】`` (N counts
    such paragraphs from 1) and its own text is dropped; any other
    paragraph contributes its stripped text when that text is non-empty.
    Every contributed entry is preceded by a newline.

    :param docx_path: path of the Word document
    :return: the assembled text, or ``None`` when the document could not
        be processed (the error is printed)
    """
    try:
        doc = docx.Document(docx_path)

        entries = []
        image_count = 0
        for paragraph in doc.paragraphs:
            has_image = any(
                element.tag.endswith('drawing')
                for run in paragraph.runs
                for element in run._element
            )
            if has_image:
                image_count += 1
                entries.append("【图片" + str(image_count) + "】")
                continue
            text = paragraph.text.strip()
            if text:
                entries.append(text)

        # Each entry keeps its leading newline, as callers split on lines.
        return "".join("\n" + entry for entry in entries)
    except Exception as e:
        print(f"处理Word文档时出错: {str(e)}")
        return None
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def split_into_blocks(text):
|
|
|
|
@ -8,7 +108,7 @@ def split_into_blocks(text):
|
|
|
|
|
blocks = []
|
|
|
|
|
current_block = []
|
|
|
|
|
in_block = False
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
for line in text.splitlines():
|
|
|
|
|
if line.startswith(('问题', '话题')) and any(c.isdigit() for c in line[:5]):
|
|
|
|
|
if in_block:
|
|
|
|
@ -24,36 +124,12 @@ def split_into_blocks(text):
|
|
|
|
|
line = line.strip()
|
|
|
|
|
if in_block and line: # 只添加非空行
|
|
|
|
|
current_block.append(line)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if current_block:
|
|
|
|
|
blocks.append('\n'.join(current_block))
|
|
|
|
|
|
|
|
|
|
return [(i+1, block) for i, block in enumerate(blocks)]
|
|
|
|
|
|
|
|
|
|
def process_document(input_path, output_dir):
    """Split one document into blocks and save each block as a text file.

    The text is read through ``SplitDocxUtil.read_docx``, split with
    ``split_into_blocks`` and every block is written to
    ``<output_dir>/<prefix>_<n>.txt`` via ``save_to_txt``. The prefix is the
    last two underscore-separated parts of the file name before its first
    dot (for example ``MATH_1``).

    :param input_path: path of the source document
    :param output_dir: directory that receives the text files
    :return: ``True`` if at least one file was saved, otherwise ``False``
    """
    # NOTE(review): no module-level import of SplitDocxUtil is visible in
    # this file; the module path below is the one used elsewhere in it.
    from Util.SplitDocxUtil import SplitDocxUtil

    text = SplitDocxUtil.read_docx(input_path)
    if not text:
        print("无法读取输入文件内容")
        return False

    # Clearing the output directory is handled by process_directory.
    chunks = split_into_blocks(text)
    print(f"共分割出{len(chunks)}个段落块")

    # Last two "_"-separated parts of the base name; a name with fewer
    # parts is used as-is instead of raising IndexError.
    stem = os.path.basename(input_path).split('.')[0]
    file_prefix = '_'.join(stem.split('_')[-2:])

    saved_count = 0
    for chunk_num, chunk in chunks:
        output_file = os.path.join(output_dir, f"{file_prefix}_{chunk_num}.txt")
        if save_to_txt(chunk.strip(), output_file, mode='w'):
            saved_count += 1

    print(f"处理完成,共保存{saved_count}个文件到目录: {output_dir}")
    return saved_count > 0
|
|
|
|
|
|
|
|
|
|
# 保留原有的save_to_txt函数
|
|
|
|
|
def save_to_txt(content, file_path, mode='w'):
|
|
|
|
@ -66,34 +142,44 @@ def save_to_txt(content, file_path, mode='w'):
|
|
|
|
|
print(f"保存文件{file_path}时出错: {str(e)}")
|
|
|
|
|
return False
|
|
|
|
|
|
|
|
|
|
def process_directory(input_dir, output_dir):
    """Run ``process_document`` on every .docx file of a directory.

    The output directory is emptied of its files once up front (and
    created when missing); sub-directories inside it are left untouched.

    :param input_dir: directory that is scanned (non-recursively) for
        files whose name ends with ``.docx``, case-insensitively
    :param output_dir: directory that receives the generated files
    :return: ``True`` if at least one document was processed successfully,
        ``False`` if the input directory is missing, holds no .docx file,
        or every document failed
    """
    if not os.path.isdir(input_dir):
        print(f"输入目录不存在: {input_dir}")
        return False

    # Remove stale files only: os.remove raises on directories, which
    # would abort the whole run.
    if os.path.isdir(output_dir):
        for name in os.listdir(output_dir):
            stale = os.path.join(output_dir, name)
            if os.path.isfile(stale) or os.path.islink(stale):
                os.remove(stale)
    os.makedirs(output_dir, exist_ok=True)

    # Sorted so the processing order does not depend on the file system.
    docx_files = sorted(
        f for f in os.listdir(input_dir) if f.lower().endswith('.docx')
    )
    if not docx_files:
        print(f"目录中没有找到docx文件: {input_dir}")
        return False

    success_count = 0
    for docx_file in docx_files:
        print(f"正在处理文件: {docx_file}")
        if process_document(os.path.join(input_dir, docx_file), output_dir):
            success_count += 1

    print(f"处理完成,共处理{success_count}/{len(docx_files)}个文件")
    return success_count > 0
|
|
|
|
|
|
|
|
|
|
if __name__ == "__main__":
    # Batch mode: split every .docx of the input directory into text files.
    input_dir = '../static/Txt'
    output_dir = '../Txt'
    process_directory(input_dir, output_dir)

    # Single-document mode: text blocks with inline <img> tags.
    word_document_path = "/static/Test/带图的WORD文档_MATH_3.docx"
    txt_output_dir = "/Txt/"
    img_output_dir = "/static/Images/"

    # Extract the images, read the text (with image placeholders) and
    # split it into blocks. read_word_content returns None on failure.
    listImage = extract_images_from_docx(word_document_path, img_output_dir)
    res = read_word_content(word_document_path)
    chunks = split_into_blocks(res or "")
    saved_count = 0

    # The placeholder number is captured so each marker maps to its own
    # image; a per-chunk counter would restart at the first image for
    # every block.
    # NOTE(review): assumes placeholder N corresponds to listImage[N - 1],
    # i.e. one extracted image per image paragraph — confirm.
    pattern = re.compile(r'【图片(\d+)】')

    def replacer(match):
        """Return the <img> tag for a placeholder, or the placeholder itself."""
        position = int(match.group(1)) - 1
        if 0 <= position < len(listImage):
            return f"<img src=\"./static/Images/{listImage[position]}\">"
        return match.group()

    for chunk_num, block in chunks:
        # The first line is the block heading; the rest is its content.
        heading, _, remainder = block.partition("\n")
        firstLine = heading.strip()
        content = pattern.sub(replacer, remainder.strip())

        output_file = os.path.join(txt_output_dir, f"MATH_3_{chunk_num}.txt")
        full_content = f"{firstLine}\n{content}"
        if save_to_txt(full_content, output_file, mode='w'):
            saved_count += 1

    print(f"处理完成,共保存{saved_count}个文件到目录: {txt_output_dir}")
|
|
|
|
|