import os import re import shutil import warnings import zipfile from docx import Document from docx.oxml.ns import nsmap from Util import DocxUtil # 抑制HTTPS相关警告 warnings.filterwarnings('ignore', message='Connecting to .* using TLS with verify_certs=False is insecure') warnings.filterwarnings('ignore', message='Unverified HTTPS request is being made to host') def extract_images_from_docx(docx_path, output_folder): """ 从docx提取图片并记录位置 :param docx_path: Word文档路径 :param output_folder: 图片输出文件夹 :return: 包含图片路径和位置的列表 """ # 从docx_path 的名称示例:小学数学教学中的若干问题_MATH_1.docx # 则图片的前缀统一为 MATH_1_?.docx ,其中 ? 为数字,表示图片的序号 # 先获取到前缀 a = docx_path.split("_") prefix = a[1] + "_" + a[2].split(".")[0] # print(f"图片前缀为:{prefix}") # 创建一个List 记录每个图片的名称和序号 image_data = [] # 创建临时解压目录 temp_dir = os.path.join(output_folder, "temp_docx") os.makedirs(temp_dir, exist_ok=True) # 解压docx文件 with zipfile.ZipFile(docx_path, 'r') as zip_ref: zip_ref.extractall(temp_dir) # 读取主文档关系 with open(os.path.join(temp_dir, 'word', '_rels', 'document.xml.rels'), 'r') as rels_file: rels_content = rels_file.read() # 加载主文档 doc = Document(docx_path) img_counter = 1 # 遍历所有段落 for para_idx, paragraph in enumerate(doc.paragraphs): for run_idx, run in enumerate(paragraph.runs): # 检查运行中的图形 for element in run._element: if element.tag.endswith('drawing'): # 提取图片关系ID blip = element.find('.//a:blip', namespaces=nsmap) if blip is not None: embed_id = blip.get('{%s}embed' % nsmap['r']) # 从关系文件中获取图片文件名 rel_entry = f' 2 else line elif line and line[0].isdigit(): line = line[1:] line = line.strip() if in_block and line: # 只添加非空行 current_block.append(line) if current_block: blocks.append('\n'.join(current_block)) return [(i + 1, block) for i, block in enumerate(blocks)] def save_to_txt(content, file_path, mode='w'): """将内容保存到文本文件""" try: with open(file_path, mode, encoding='utf-8') as f: f.write(content) return True except Exception as e: print(f"保存文件{file_path}时出错: {str(e)}") return False class ImageReplacer: def __init__(self, image_list): self.image_list = image_list self.current_idx = 0 def replace(self, match): if self.current_idx < len(self.image_list): result = f"![](./Images/{self.image_list[self.current_idx]})" self.current_idx += 1 return result return match.group() def process_document(docx_file, txt_output_dir, img_output_dir): # 提取图片 listImage = extract_images_from_docx(docx_file, img_output_dir) print(f"图片数量为:{len(listImage)}") # 读取内容 res = DocxUtil.get_docx_content_by_pandoc(docx_file) # 分块 chunks = split_into_blocks(res) saved_count = 0 # 使用原来的正则表达式 pattern = re.compile(r'【图片\d+】') # 创建图片替换器 replacer = ImageReplacer(listImage) for x in chunks: firstLine = x[1].split("\n")[0].strip() content = x[1][len(firstLine):].strip() # 使用类方法替换图片 content = pattern.sub(replacer.replace, content) # 保存文本文件 # 从docx文件名提取学科和编号 docx_name = os.path.basename(docx_file).split('.')[0] subject_part = '_'.join(docx_name.split('_')[-2:]) # 获取最后两部分如CHINESE_1 output_file = os.path.join(txt_output_dir, f"{subject_part}_{x[0]}.txt") full_content = f"{firstLine}\n{content}" if save_to_txt(full_content, output_file, mode='w'): saved_count += 1 print(f"处理完成,共保存{saved_count}个文件到目录: {txt_output_dir}") if __name__ == "__main__": txt_output_dir = "../Txt/" img_output_dir = "../static/Images/" # 清空上面的两个输出目录,用os进行删除,在Windows环境中进行 if os.path.exists(txt_output_dir): shutil.rmtree(txt_output_dir) if os.path.exists(img_output_dir): shutil.rmtree(img_output_dir) # 创建输出目录 os.makedirs(txt_output_dir, exist_ok=True) os.makedirs(img_output_dir, exist_ok=True) # 遍历static/Txt/下所有的docx for filename in os.listdir("../static/Txt/"): print("正在处理文件:" + filename) # 这里需要文件的全称路径 filename = os.path.join("../static/Txt/", filename) process_document(filename, txt_output_dir, img_output_dir)