You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

204 lines
7.5 KiB

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

import re
import warnings
import docx
import os
import shutil
import uuid
import zipfile
from docx import Document
from docx.oxml.ns import nsmap
# Suppress HTTPS-related warnings (insecure TLS / unverified HTTPS request).
# NOTE(review): nothing visible in this file issues HTTPS requests; presumably
# these filters serve a client library used elsewhere -- confirm before removing.
warnings.filterwarnings('ignore', message='Connecting to .* using TLS with verify_certs=False is insecure')
warnings.filterwarnings('ignore', message='Unverified HTTPS request is being made to host')
def extract_images_from_docx(docx_path, output_folder):
    """
    Extract the embedded pictures of a .docx file in document order.

    Every run of every paragraph returned by ``doc.paragraphs`` is scanned for
    ``drawing`` elements.  For each drawing whose first ``a:blip`` references
    an embedded picture, that picture is copied into ``output_folder`` under a
    fresh UUID-based name (the original file extension is kept).

    The pictures are read straight from the .docx zip archive, so no
    temporary extraction directory is created and nothing is left behind if
    an error occurs part-way through.

    :param docx_path: path of the Word document
    :param output_folder: directory receiving the pictures (created if missing)
    :return: list of generated picture file names, in encounter order
    :raises FileNotFoundError: if the archive has no
        ``word/_rels/document.xml.rels`` member
    """
    # Function-scope imports: only this function needs them.
    import posixpath
    import xml.etree.ElementTree as ET

    rels_member = 'word/_rels/document.xml.rels'
    relationship_tag = (
        '{http://schemas.openxmlformats.org/package/2006/relationships}Relationship'
    )
    embed_attr = '{%s}embed' % nsmap['r']

    os.makedirs(output_folder, exist_ok=True)
    image_data = []
    doc = Document(docx_path)

    with zipfile.ZipFile(docx_path, 'r') as archive:
        try:
            rels_xml = archive.read(rels_member)
        except KeyError as exc:
            raise FileNotFoundError(
                f"{rels_member} not found in {docx_path}"
            ) from exc

        # Parse the relationships as XML instead of searching the raw text,
        # so the result does not depend on attribute order or file encoding.
        targets = {
            rel.get('Id'): rel.get('Target')
            for rel in ET.fromstring(rels_xml).iter(relationship_tag)
            if rel.get('TargetMode') != 'External'
        }
        members = set(archive.namelist())

        for paragraph in doc.paragraphs:
            for run in paragraph.runs:
                for element in run._element:
                    if not element.tag.endswith('drawing'):
                        continue
                    blip = element.find('.//a:blip', namespaces=nsmap)
                    if blip is None:
                        continue
                    target = targets.get(blip.get(embed_attr))
                    if not target:
                        continue

                    # Targets are relative to the document part's folder
                    # ("word/") unless they start with "/", in which case
                    # they are relative to the package root.
                    member = posixpath.normpath(
                        posixpath.join('/word', target)
                    ).lstrip('/')
                    if member not in members:
                        continue

                    ext = os.path.splitext(member)[1]
                    img_name = f"{uuid.uuid4().hex}{ext}"
                    dest_path = os.path.join(output_folder, img_name)
                    with archive.open(member) as src, open(dest_path, 'wb') as dst:
                        shutil.copyfileobj(src, dst)
                    image_data.append(img_name)

    return image_data
def read_word_content(docx_path):
    """Walk the paragraphs of a Word document and emit text or image markers.

    For each paragraph returned by ``doc.paragraphs``:

    * if any run contains a ``drawing`` element, the marker ``【图片N】`` is
      emitted, where N counts such paragraphs starting from 1 (one marker per
      paragraph, regardless of how many drawings it holds);
    * otherwise, if the stripped paragraph text is non-empty, that text is
      emitted.

    :param docx_path: path of the Word document
    :return: the emitted pieces, each preceded by a newline; an empty string
        if the document has no content or could not be processed (in which
        case an error message is printed)
    """
    pieces = []
    image_count = 0
    try:
        doc = docx.Document(docx_path)
        for paragraph in doc.paragraphs:
            has_image = any(
                element.tag.endswith('drawing')
                for run in paragraph.runs
                for element in run._element
            )
            if has_image:
                image_count += 1
                # The closing bracket is required: consumers look for the
                # complete marker "【图片N】".
                pieces.append(f"【图片{image_count}】")
                continue
            text = paragraph.text.strip()
            if text:
                pieces.append(text)
    except Exception as e:
        # Best-effort: report the problem and hand back an empty string so
        # callers always receive a str.
        print(f"处理Word文档时出错: {str(e)}")
        return ""
    return "".join("\n" + piece for piece in pieces)
def split_into_blocks(text):
    """Split text into numbered blocks at '问题X' / '话题X' header lines.

    A header is a line that starts with '问题' or '话题' and has a digit within
    its first five characters.  Each header opens a new block.  The leading
    run of prefixes and digits is removed from the header line and whatever
    remains of it is kept as the first line of the block.  Lines before the
    first header and blank lines are dropped.

    :param text: the text to split; ``None`` or empty yields no blocks
    :return: list of ``(number, block_text)`` tuples, numbered from 1 in
        order of appearance
    """
    if not text:
        return []

    prefixes = ('问题', '话题')
    blocks = []
    current_block = []
    in_block = False

    for line in text.splitlines():
        if line.startswith(prefixes) and any(c.isdigit() for c in line[:5]):
            if in_block:
                blocks.append('\n'.join(current_block))
                current_block = []
            in_block = True
            # Peel off leading prefixes/digits.  Every pass shortens the
            # line, so the loop always terminates -- even for a header such
            # as "问题1" that consists of nothing but prefix and digits.
            while line.startswith(prefixes) or line[:1].isdigit():
                line = line[2:] if line.startswith(prefixes) else line[1:]
        line = line.strip()
        if in_block and line:  # keep non-empty lines only
            current_block.append(line)

    if current_block:
        blocks.append('\n'.join(current_block))
    return list(enumerate(blocks, start=1))
# Keep the original save_to_txt function
def save_to_txt(content, file_path, mode='w'):
    """Write ``content`` to ``file_path`` as UTF-8 text.

    :param content: the text to write
    :param file_path: destination file path
    :param mode: mode passed to ``open`` (default ``'w'``)
    :return: True when the write succeeded; False when any exception was
        raised (a message is printed in that case)
    """
    succeeded = False
    try:
        with open(file_path, mode, encoding='utf-8') as out:
            out.write(content)
    except Exception as err:
        print(f"保存文件{file_path}时出错: {str(err)}")
    else:
        succeeded = True
    return succeeded
def process_document(word_document_path,txt_output_dir,img_output_dir):
    """Split one Word document into per-block text files with image tags.

    Steps:

    1. extract the embedded pictures into ``img_output_dir``;
    2. read the document text (pictures appear as ``【图片N】`` markers);
    3. split the text into blocks;
    4. for every block, keep its first line as-is, replace the image markers
       in the rest with ``<img src="./Images/...">`` tags, and save the
       result as ``<subject>_<block number>.txt`` in ``txt_output_dir``,
       where ``<subject>`` is the last two ``_``-separated parts of the
       document's base name (e.g. ``CHINESE_1``).

    A marker's number N selects the N-th extracted picture, so every block
    gets its own pictures rather than restarting from the first one.
    NOTE(review): this assumes marker N corresponds to extracted picture N,
    which only holds when each picture paragraph contains exactly one
    embedded picture -- confirm against the documents being processed.

    :param word_document_path: path of the .docx file
    :param txt_output_dir: directory receiving the text files
    :param img_output_dir: directory receiving the extracted pictures
    """
    image_names = extract_images_from_docx(word_document_path, img_output_dir)
    text = read_word_content(word_document_path)
    # Nothing readable -> no blocks (also covers a failed read).
    chunks = split_into_blocks(text) if text else []

    # The closing bracket is optional so that markers emitted without it are
    # still recognised.
    marker = re.compile(r'【图片(\d+)】?')

    def replace_marker(match):
        position = int(match.group(1)) - 1
        if 0 <= position < len(image_names):
            return f'<img src="./Images/{image_names[position]}">'
        # No picture for this marker: leave the text untouched.
        return match.group()

    # Derive the subject and number from the docx file name, e.g. CHINESE_1.
    docx_name = os.path.basename(word_document_path).split('.')[0]
    subject_part = '_'.join(docx_name.split('_')[-2:])

    saved_count = 0
    for number, block in chunks:
        first_line, _, remainder = block.partition("\n")
        first_line = first_line.strip()
        content = marker.sub(replace_marker, remainder.strip())
        output_file = os.path.join(txt_output_dir, f"{subject_part}_{number}.txt")
        full_content = f"{first_line}\n{content}"
        if save_to_txt(full_content, output_file, mode='w'):
            saved_count += 1

    print(f"处理完成,共保存{saved_count}个文件到目录: {txt_output_dir}")
if __name__ == "__main__":
    source_dir = "../static/Txt/"
    txt_output_dir = "../Txt/"
    img_output_dir = "../static/Images/"
    # Process every .docx file found in source_dir.
    for filename in sorted(os.listdir(source_dir)):
        # Skip anything that is not a Word document, as well as the "~$..."
        # lock files Word creates next to an open document (they are not
        # valid zip archives).
        if not filename.lower().endswith(".docx") or filename.startswith("~$"):
            continue
        print("正在处理文件:" + filename)
        # process_document needs the full path of the file.
        docx_path = os.path.join(source_dir, filename)
        process_document(docx_path, txt_output_dir, img_output_dir)