You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

192 lines
7.3 KiB

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

import os
import re
import shutil
import warnings
import zipfile
from docx import Document
from docx.oxml.ns import nsmap
from Util import DocxUtil
# Suppress the warnings emitted for unverified / insecure HTTPS connections.
warnings.filterwarnings(
    action='ignore',
    message='Connecting to .* using TLS with verify_certs=False is insecure',
)
warnings.filterwarnings(
    action='ignore',
    message='Unverified HTTPS request is being made to host',
)
def extract_images_from_docx(docx_path, output_folder):
    """
    Extract the embedded images of a .docx file into ``output_folder``.

    Only drawings that are direct children of runs in the document's
    top-level paragraphs are considered, in document order. Each image is
    written as ``<PREFIX>_<n><ext>`` where ``<PREFIX>`` is the last two
    underscore-separated parts of the file name (for example
    ``小学数学教学中的若干问题_MATH_1.docx`` gives ``MATH_1``) and ``<n>`` is a
    1-based running counter.

    :param docx_path: path of the Word document
    :param output_folder: directory that receives the image files
                          (created if it does not exist)
    :return: list of the written image file names (names only, not paths),
             in document order
    """
    # Derive the prefix from the file name only, so underscores in the
    # directory part (or extra ones in the title) cannot shift the fields.
    stem = os.path.splitext(os.path.basename(docx_path))[0]
    prefix = '_'.join(stem.split('_')[-2:])

    os.makedirs(output_folder, exist_ok=True)

    doc = Document(docx_path)
    # rId -> package part, as resolved by python-docx. Reading the image
    # bytes from here avoids unzipping the archive and hand-parsing the
    # relationships XML.
    related_parts = doc.part.related_parts
    embed_attr = '{%s}embed' % nsmap['r']

    image_data = []
    for paragraph in doc.paragraphs:
        for run in paragraph.runs:
            for element in run._element:
                if not element.tag.endswith('drawing'):
                    continue
                blip = element.find('.//a:blip', namespaces=nsmap)
                if blip is None:
                    continue
                # Linked (non-embedded) pictures have no r:embed attribute
                # and therefore no related part; they are skipped.
                image_part = related_parts.get(blip.get(embed_attr))
                if image_part is None:
                    continue
                ext = os.path.splitext(str(image_part.partname))[1]
                img_name = f"{prefix}_{len(image_data) + 1}{ext}"
                with open(os.path.join(output_folder, img_name), 'wb') as img_file:
                    img_file.write(image_part.blob)
                image_data.append(img_name)
    return image_data
def split_into_blocks(text):
    """
    Split ``text`` into blocks, one per '问题N' / '话题N' heading line.

    A heading is a line that starts with '问题' or '话题' and has a digit
    within its first five characters. Lines before the first heading are
    dropped. On a heading line the leading run of '问题' / '话题' markers and
    digits is removed and the remainder is stripped; it becomes the first
    line of the block when non-empty. Empty lines are never stored.

    :param text: full document text
    :return: list of ``(number, block_text)`` tuples, numbered from 1
    """
    markers = ('问题', '话题')
    blocks = []
    current_block = []
    in_block = False

    for line in text.splitlines():
        if line.startswith(markers) and any(ch.isdigit() for ch in line[:5]):
            # A new heading closes the block collected so far.
            if in_block:
                blocks.append('\n'.join(current_block))
            current_block = []
            in_block = True

            # Strip the leading markers and digits. The marker is always
            # removed in full, so a remainder of exactly '问题' / '话题'
            # shrinks to '' instead of looping forever.
            while line:
                if line.startswith(markers):
                    line = line[2:]
                elif line[0].isdigit():
                    line = line[1:]
                else:
                    break
            line = line.strip()

        if in_block and line:
            current_block.append(line)

    if current_block:
        blocks.append('\n'.join(current_block))
    return [(number, block) for number, block in enumerate(blocks, start=1)]
def save_to_txt(content, file_path, mode='w'):
    """
    Write ``content`` to ``file_path`` as UTF-8 text.

    :param content: text to write
    :param file_path: destination file
    :param mode: mode passed to ``open`` (``'w'`` by default)
    :return: True on success; False if any exception occurred (the error
             is printed, not raised)
    """
    try:
        with open(file_path, mode, encoding='utf-8') as handle:
            handle.write(content)
    except Exception as exc:
        print(f"保存文件{file_path}时出错: {exc!s}")
        return False
    return True
class ImageReplacer:
    """Callable-style helper for ``re.sub`` that hands out image links in order.

    Each call to :meth:`replace` consumes the next entry of ``image_list``;
    once the list is exhausted the matched text is returned unchanged.
    """

    def __init__(self, image_list):
        # Image file names, consumed front to back.
        self.image_list = image_list
        # Index of the next image to hand out.
        self.current_idx = 0

    def replace(self, match):
        """Return a Markdown image link for the next image, or the match itself."""
        position = self.current_idx
        if position >= len(self.image_list):
            return match.group()
        self.current_idx = position + 1
        return f"![](./Images/{self.image_list[position]})"
def process_document(docx_file, txt_output_dir, img_output_dir):
    """
    Split one .docx file into per-question text files and extract its images.

    Images are written to ``img_output_dir`` by ``extract_images_from_docx``.
    The document text is split by ``split_into_blocks``; each block is saved
    as ``<SUBJECT>_<N>_<block number>.txt`` in ``txt_output_dir``, where
    ``<SUBJECT>_<N>`` is the last two underscore-separated parts of the
    docx file name (for example ``CHINESE_1``). Within each block body,
    ``【图片N】`` placeholders are replaced, in order of appearance, with
    Markdown links to the extracted images. The first line of a block (its
    heading) is written unchanged.

    :param docx_file: path of the Word document
    :param txt_output_dir: directory for the text files (created if missing)
    :param img_output_dir: directory for the extracted images
    """
    image_names = extract_images_from_docx(docx_file, img_output_dir)
    print(f"图片数量为:{len(image_names)}")

    # NOTE(review): presumably returns the document text with 【图片N】
    # placeholders where images occur -- confirm against DocxUtil.
    text = DocxUtil.get_docx_content_by_pandoc(docx_file)
    chunks = split_into_blocks(text)

    # The file stem does not change per block, so compute it once. splitext
    # removes only the extension, so a dot inside the title is harmless.
    stem = os.path.splitext(os.path.basename(docx_file))[0]
    subject_part = '_'.join(stem.split('_')[-2:])

    placeholder = re.compile(r'【图片\d+】')
    # One replacer for the whole document: images are consumed across blocks.
    replacer = ImageReplacer(image_names)

    os.makedirs(txt_output_dir, exist_ok=True)

    saved_count = 0
    for number, block in chunks:
        # Split at the first newline rather than slicing by the stripped
        # heading length, which cut into the text when the raw first line
        # carried leading whitespace.
        heading, _, body = block.partition("\n")
        heading = heading.strip()
        body = placeholder.sub(replacer.replace, body.strip())

        output_file = os.path.join(txt_output_dir, f"{subject_part}_{number}.txt")
        if save_to_txt(f"{heading}\n{body}", output_file, mode='w'):
            saved_count += 1

    print(f"处理完成,共保存{saved_count}个文件到目录: {txt_output_dir}")
if __name__ == "__main__":
    # All paths are relative to the current working directory.
    docx_input_dir = "../static/Txt/"
    txt_output_dir = "../Txt/"
    img_output_dir = "../static/Images/"

    # Start every run from empty output directories.
    for out_dir in (txt_output_dir, img_output_dir):
        if os.path.exists(out_dir):
            shutil.rmtree(out_dir)
        os.makedirs(out_dir, exist_ok=True)

    # Process every .docx in the input directory, in a stable order.
    for filename in sorted(os.listdir(docx_input_dir)):
        # Skip anything that is not a Word document, plus the "~$" lock
        # files Word leaves next to open documents; they cannot be parsed
        # and would abort the whole batch.
        if filename.startswith("~$") or not filename.lower().endswith(".docx"):
            continue
        print("正在处理文件:" + filename)
        # process_document needs the full path, not just the entry name.
        process_document(os.path.join(docx_input_dir, filename), txt_output_dir, img_output_dir)