You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

192 lines
7.3 KiB

4 weeks ago
import os
3 weeks ago
import re
4 weeks ago
import shutil
3 weeks ago
import warnings
4 weeks ago
import zipfile
from docx import Document
from docx.oxml.ns import nsmap
3 weeks ago
from Util import DocxUtil
4 weeks ago
# Silence the warnings raised for unverified HTTPS/TLS connections.
warnings.filterwarnings(
    action='ignore',
    message='Connecting to .* using TLS with verify_certs=False is insecure',
)
warnings.filterwarnings(
    action='ignore',
    message='Unverified HTTPS request is being made to host',
)
4 weeks ago
def extract_images_from_docx(docx_path, output_folder):
    """
    Extract the images embedded in a .docx file into ``output_folder``.

    Drawings are visited in document order (paragraph by paragraph, run by
    run). Each image is written as ``<prefix>_<n><ext>``, where ``<prefix>``
    is the last two underscore-separated parts of the file name without its
    extension (``..._MATH_1.docx`` gives ``MATH_1``), ``<n>`` counts from 1
    and ``<ext>`` is the extension of the image inside the package.

    :param docx_path: path of the Word document
    :param output_folder: directory that receives the images (created if missing)
    :return: list of the image file names written, in document order
    """
    # Function-scope imports keep this block self-contained.
    import posixpath
    import xml.etree.ElementTree as ET

    # Use the base name only, so underscores in the directory part of the
    # path cannot shift the pieces; this is the same derivation that
    # process_document uses for the text file names.
    docx_name = os.path.basename(docx_path).split('.')[0]
    prefix = '_'.join(docx_name.split('_')[-2:])

    os.makedirs(output_folder, exist_ok=True)
    image_data = []
    img_counter = 1

    # Read straight from the archive: no temporary extraction directory to
    # clean up, and nothing is left behind if an error occurs midway.
    with zipfile.ZipFile(docx_path, 'r') as archive:
        members = set(archive.namelist())

        # Relationship id -> target, parsed as XML so the lookup does not
        # depend on the order in which attributes were serialized.
        rels_root = ET.fromstring(archive.read('word/_rels/document.xml.rels'))
        targets = {rel.get('Id'): rel.get('Target') for rel in rels_root}

        doc = Document(docx_path)
        for paragraph in doc.paragraphs:
            for run in paragraph.runs:
                for element in run._element:
                    if not element.tag.endswith('drawing'):
                        continue
                    blip = element.find('.//a:blip', namespaces=nsmap)
                    if blip is None:
                        continue
                    embed_id = blip.get('{%s}embed' % nsmap['r'])
                    target = targets.get(embed_id)
                    if not target:
                        continue

                    # Targets are relative to the "word/" part unless they
                    # start with "/", which means relative to the package root.
                    if target.startswith('/'):
                        member = target.lstrip('/')
                    else:
                        member = posixpath.normpath(posixpath.join('word', target))
                    if member not in members:
                        # Not stored inside the package; nothing to copy.
                        continue

                    ext = os.path.splitext(member)[1]
                    img_name = f"{prefix}_{img_counter}{ext}"
                    image_data.append(img_name)
                    with open(os.path.join(output_folder, img_name), 'wb') as out:
                        out.write(archive.read(member))
                    img_counter += 1

    return image_data
4 weeks ago
4 weeks ago
def split_into_blocks(text):
    """
    Split text into numbered blocks at header lines.

    A header is a line that starts with '问题' or '话题' and has a digit among
    its first five characters. Lines before the first header are dropped.
    On a header line the leading run of '问题'/'话题' markers and digits is
    removed and the remainder is stripped; what is left (if anything) becomes
    the first line of the new block. Other lines are appended unchanged when
    they are non-empty.

    :param text: full document text
    :return: list of ``(index, block_text)`` tuples, index starting at 1
    """
    markers = ('问题', '话题')
    blocks = []
    current_block = []
    in_block = False

    for line in text.splitlines():
        if line.startswith(markers) and any(c.isdigit() for c in line[:5]):
            if in_block:
                blocks.append('\n'.join(current_block))
                current_block = []
            in_block = True

            # Peel off markers and digits one piece at a time. Every pass
            # shortens the line, so the loop always terminates, including
            # when the remainder is exactly a bare marker.
            while True:
                if line.startswith(markers):
                    line = line[2:]
                elif line and line[0].isdigit():
                    line = line[1:]
                else:
                    break
            line = line.strip()

        if in_block and line:  # keep only non-empty lines
            current_block.append(line)

    if current_block:
        blocks.append('\n'.join(current_block))

    return list(enumerate(blocks, start=1))
4 weeks ago
def save_to_txt(content, file_path, mode='w'):
    """
    Write ``content`` to ``file_path`` as UTF-8 text.

    :param content: text to write
    :param file_path: destination path
    :param mode: file open mode, ``'w'`` by default
    :return: True when the write succeeded; False when any exception was
        raised (the error is printed, not propagated)
    """
    try:
        with open(file_path, mode, encoding='utf-8') as out:
            out.write(content)
    except Exception as e:
        print(f"保存文件{file_path}时出错: {str(e)}")
        return False
    else:
        return True
4 weeks ago
class ImageReplacer:
    """
    Stateful replacement callback for ``re.sub``.

    Each call to :meth:`replace` consumes the next name from ``image_list``
    and returns a Markdown image link for it; once the list is exhausted the
    matched text is returned unchanged.
    """

    def __init__(self, image_list):
        self.image_list = image_list
        # Position of the next image name to hand out.
        self.current_idx = 0

    def replace(self, match):
        """Return the Markdown link for the next image, or the match itself."""
        if self.current_idx >= len(self.image_list):
            return match.group()
        image_name = self.image_list[self.current_idx]
        self.current_idx += 1
        return f"![](./Images/{image_name})"
3 weeks ago
3 weeks ago
def process_document(docx_file, txt_output_dir, img_output_dir):
    """
    Convert one .docx file into per-block text files plus extracted images.

    Images are extracted first, the document text is read via
    ``DocxUtil.get_docx_content_by_pandoc`` and split with
    ``split_into_blocks``. In each block's body (everything after its first
    line) the ``【图片N】`` placeholders are replaced, in order across the
    whole document, by Markdown links to the extracted images. Each block is
    saved as ``<SUBJECT>_<NO>_<block index>.txt`` in ``txt_output_dir``.

    :param docx_file: path of the Word document
    :param txt_output_dir: directory that receives the text files
    :param img_output_dir: directory that receives the extracted images
    """
    # Extract the images.
    listImage = extract_images_from_docx(docx_file, img_output_dir)
    print(f"图片数量为:{len(listImage)}")

    # Read the text and split it into blocks.
    res = DocxUtil.get_docx_content_by_pandoc(docx_file)
    chunks = split_into_blocks(res)
    saved_count = 0

    pattern = re.compile(r'【图片\d+】')
    # One replacer for the whole document, so images are consumed in order
    # across blocks.
    replacer = ImageReplacer(listImage)

    # Subject and number come from the file name (last two parts, e.g.
    # CHINESE_1); they are the same for every block, so compute them once.
    docx_name = os.path.basename(docx_file).split('.')[0]
    subject_part = '_'.join(docx_name.split('_')[-2:])

    for index, block in chunks:
        # Split on the first newline rather than slicing by the length of
        # the stripped first line: slicing is off by the amount of leading
        # whitespace and would leak the tail of the title into the body.
        first_raw, _, rest = block.partition("\n")
        firstLine = first_raw.strip()
        content = pattern.sub(replacer.replace, rest.strip())

        output_file = os.path.join(txt_output_dir, f"{subject_part}_{index}.txt")
        full_content = f"{firstLine}\n{content}"
        if save_to_txt(full_content, output_file, mode='w'):
            saved_count += 1

    print(f"处理完成,共保存{saved_count}个文件到目录: {txt_output_dir}")
4 weeks ago
if __name__ == "__main__":
    txt_output_dir = "../Txt/"
    img_output_dir = "../static/Images/"
    input_dir = "../static/Txt/"

    # Start from empty output directories: remove whatever a previous run
    # left behind, then recreate them.
    for out_dir in (txt_output_dir, img_output_dir):
        if os.path.exists(out_dir):
            shutil.rmtree(out_dir)
        os.makedirs(out_dir, exist_ok=True)

    # Process every .docx in the input directory.
    for filename in os.listdir(input_dir):
        # Skip anything that is not a Word document, and the "~$..." lock
        # files Word creates while a document is open; neither is a valid
        # docx archive and would abort the whole run.
        if filename.startswith("~$") or not filename.lower().endswith(".docx"):
            continue
        print("正在处理文件:" + filename)
        # process_document needs the full path of the file.
        docx_path = os.path.join(input_dir, filename)
        process_document(docx_path, txt_output_dir, img_output_dir)