You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

130 lines
4.9 KiB

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

import logging
import os
import subprocess
import uuid
from PIL import Image
import os
# Set up a dedicated 'DocxUtil' logger for finer-grained control over log output:
# INFO level, with its own stream handler whose format includes the timestamp,
# logger name and level.
logger = logging.getLogger('DocxUtil')
logger.setLevel(logging.INFO)
handler = logging.StreamHandler()
handler.setFormatter(logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s'))
logger.addHandler(handler)
# Also configure the root logger with a shorter "LEVEL:message" format.
# NOTE(review): propagation is not disabled on `logger`, so its records presumably
# reach both the handler above and the root handler and are printed twice —
# confirm whether that is intended.
logging.basicConfig(format="%(levelname)s:%(message)s", level=logging.INFO)
def resize_images_in_directory(directory_path, max_width=640, max_height=480):
    """
    Shrink every image under a directory tree so it fits inside a bounding box.

    Walks ``directory_path`` recursively. Each file whose name ends with one of
    the supported extensions is opened with Pillow; if it is larger than
    ``max_width`` x ``max_height`` it is scaled down (aspect ratio preserved)
    and saved over the original file. Images that already fit are left
    untouched. A failure on one file is logged and does not stop the walk.

    :param directory_path: root directory to scan for images
    :param max_width: maximum width in pixels
    :param max_height: maximum height in pixels
    """
    # Supported image file extensions (matched case-insensitively).
    valid_extensions = ('.jpg', '.jpeg', '.png', '.bmp', '.gif')
    for root, _, files in os.walk(directory_path):
        for filename in files:
            if not filename.lower().endswith(valid_extensions):
                continue
            file_path = os.path.join(root, filename)
            try:
                with Image.open(file_path) as img:
                    # Largest scale factor that satisfies both limits.
                    width, height = img.size
                    ratio = min(max_width / width, max_height / height)
                    # Already within the target size: nothing to do.
                    if ratio >= 1:
                        continue
                    # Clamp to 1 px: for extreme aspect ratios int() would
                    # truncate the short side to 0, which resize() rejects.
                    new_size = (
                        max(1, int(width * ratio)),
                        max(1, int(height * ratio)),
                    )
                    resized_img = img.resize(new_size, Image.Resampling.LANCZOS)
                    # Overwrite the original file with the scaled image.
                    resized_img.save(file_path)
                    logger.info(f"已缩放: {file_path} -> {new_size}")
            except Exception as e:
                # Best effort: report the problem and carry on with the next file.
                logger.error(f"处理 {file_path} 时出错: {str(e)}")
import hashlib
def calculate_docx_md5(docx_file_path):
    """
    Return the MD5 digest of a file's raw bytes.

    :param docx_file_path: path of the docx file to hash
    :return: MD5 digest as a hexadecimal string
    """
    digest = hashlib.md5()
    # Binary mode: the hash is taken over the bytes exactly as stored on disk.
    with open(docx_file_path, 'rb') as stream:
        # Feeding the digest chunk by chunk yields the same value as hashing
        # the whole content in a single update() call.
        for chunk in iter(lambda: stream.read(65536), b''):
            digest.update(chunk)
    return digest.hexdigest()
def _clean_pandoc_markdown_line(raw_line):
    """
    Normalise a single line of the Markdown file produced by pandoc.

    :param raw_line: one line as read from the Markdown file
    :return: the cleaned line (without a trailing newline), or None when the
        line is blank or is an image-height attribute line and must be dropped
    """
    line = raw_line.strip()
    if not line:
        return None
    # Second half of a wrapped image attribute block, e.g.
    #   height="1.91044072615923in"}
    if line.startswith('height=') and line.endswith(('in"}', 'in"')):
        return None
    # NOTE(review): only .png/.jpg/.jpeg references are treated as images here;
    # other formats fall through to the plain-text branch — confirm intended.
    img_start = line.find("![](")
    if img_start >= 0 and any(ext in line for ext in (".png", ".jpg", ".jpeg")):
        # Example input:
        #   ![](./static/Images/<md5>/media/image3.png){width="3.1251607611548557in"
        # Keep everything up to the parenthesis that closes the image target and
        # drop the trailing {width=...} attributes. The search starts at the
        # image marker so a ")" earlier in the line cannot cut the image off.
        img_end = line.find(")", img_start)
        image_ref = line[:img_end + 1] if img_end >= 0 else line
        return image_ref.replace("./static", ".")
    # Plain text: remove bold markers.
    return line.replace("**", "")


def get_docx_content_by_pandoc(docx_file):
    """
    Convert a docx file to simplified Markdown text using pandoc.

    Steps: hash the file, run pandoc to write ``./static/markdown/<name>.md``
    and extract embedded media under ``./static/Images/<md5>``, shrink the
    images in that directory's ``media`` sub-directory, then clean the Markdown
    line by line (blank lines and image-height attribute lines are dropped,
    image references are trimmed to ``![](path)`` with ``./static`` rewritten to
    ``.``, bold markers are removed). The cleaned text, with ``\\phantom``
    removed, is written back to the Markdown file.

    :param docx_file: path of the docx file to convert
    :return: the cleaned Markdown text, with doubled newlines collapsed once
        and all backslashes removed
    :raises subprocess.CalledProcessError: if pandoc exits with a non-zero status
    """
    # The content hash names the per-document media directory.
    md5_value = calculate_docx_md5(docx_file)
    # File name without directory or extension; splitting on "." and "/" by
    # hand breaks for paths such as "./docs/a.docx" or "dir.v1/a.docx".
    prefix = os.path.splitext(os.path.basename(docx_file))[0]
    markdown_dir = './static/markdown/'
    temp_markdown = os.path.join(markdown_dir, prefix + '.md')
    media_root = "./static/Images/" + md5_value
    # Create the output directories, including missing parents.
    os.makedirs(markdown_dir, exist_ok=True)
    os.makedirs(media_root, exist_ok=True)
    # check=True: fail here rather than read a missing or stale Markdown file.
    subprocess.run(
        ['pandoc', docx_file, '-f', 'docx', '-t', 'markdown', '-o', temp_markdown,
         '--extract-media=' + media_root],
        check=True,
    )
    # Scale the extracted images down to at most 640x480 (the helper's defaults).
    resize_images_in_directory(media_root + '/media')
    # Read the generated Markdown and clean it line by line.
    cleaned_lines = []
    with open(temp_markdown, 'r', encoding='utf-8') as f:
        for raw_line in f:
            cleaned = _clean_pandoc_markdown_line(raw_line)
            if cleaned is not None:
                cleaned_lines.append(cleaned + "\n")
    content = "".join(cleaned_lines).replace("\\phantom", "")
    # Write the cleaned content back to the Markdown file (it is kept on disk).
    with open(temp_markdown, 'w', encoding='utf-8') as f:
        f.write(content)
    return content.replace("\n\n", "\n").replace("\\", "")