Files
dsProject/dsAiTeachingModel/utils/DocxUtil.py
2025-08-14 15:45:08 +08:00

120 lines
5.1 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import logging
import os
import subprocess
import uuid
from PIL import Image
import os
from networkx.algorithms.bipartite.centrality import betweenness_centrality
# Configure the root logger once, at import time: INFO threshold and
# timestamped records. basicConfig() installs a single StreamHandler on the
# root logger and is a no-op whenever the root logger already has handlers,
# so it is called exactly once here.
logging.basicConfig(
    level=logging.INFO,  # root logger threshold
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
# Module logger. Its records propagate to the root logger's handler, which
# already applies the format above. No handler is attached to this logger
# itself: a handler here plus the root handler would emit every message twice.
logger = logging.getLogger('DocxUtil')
logger.setLevel(logging.INFO)
def resize_images_in_directory(directory_path, max_width=640, max_height=480):
    """
    Shrink every image below a directory so that it fits a bounding box.

    The directory tree is walked recursively. Each file whose name ends with
    a supported image extension is opened; if it is larger than the bounding
    box it is scaled down with its aspect ratio preserved and saved over the
    original file. Images that already fit are left untouched. A failure on
    one file is logged and does not stop the walk. A directory that does not
    exist simply yields no files.

    :param directory_path: root directory to scan for images
    :param max_width: maximum width, in pixels, of a processed image
    :param max_height: maximum height, in pixels, of a processed image
    """
    # Supported image extensions (compared case-insensitively).
    valid_extensions = ('.jpg', '.jpeg', '.png', '.bmp', '.gif')
    for root, _, files in os.walk(directory_path):
        for filename in files:
            if not filename.lower().endswith(valid_extensions):
                continue
            file_path = os.path.join(root, filename)
            try:
                with Image.open(file_path) as img:
                    # Scale factor that makes both sides fit the box.
                    width, height = img.size
                    ratio = min(max_width / width, max_height / height)
                    # Already within the target size: nothing to do.
                    if ratio >= 1:
                        continue
                    # int() truncates, so a very elongated image could end
                    # up with a 0-pixel side, which resize() rejects; clamp
                    # each side to at least one pixel.
                    new_size = (
                        max(1, int(width * ratio)),
                        max(1, int(height * ratio)),
                    )
                    resized_img = img.resize(new_size, Image.Resampling.LANCZOS)
                    # Overwrite the original file.
                    resized_img.save(file_path)
                    logger.info(f"已缩放: {file_path} -> {new_size}")
            except Exception as e:
                # Best effort: report the failure and move on to the next file.
                logger.error(f"处理 {file_path} 时出错: {str(e)}")
def _is_image_line(line):
    """Return True when *line* contains ``![](`` and a known image extension."""
    return line.find("![](") >= 0 and any(
        line.find(ext) > 0 for ext in (".png", ".jpg", ".jpeg")
    )


def _rewrite_image_line(line):
    """Cut an image line down to its ``![](...)`` part and rewrite the path."""
    # Keep everything up to the first ")"; this drops the trailing
    # {width="..." attribute text that follows the image reference.
    q = line[:line.find(")") + 1]
    q = q.replace("./static", ".")
    # Modified by Kalman.CHENG: insert a separator between "(" and "static".
    # NOTE(review): both indices are measured on the original line but applied
    # to q, which the substitution above may already have shortened. For
    # "![](./static/Images/<id>/media/x.png)" the outcome is
    # "![](\Images/<id>/media/x.png)". Kept exactly as-is because the
    # resulting paths may be relied on elsewhere - confirm before changing.
    left_idx = line.find("(")
    static_idx = line.find("static")
    if left_idx == -1 or static_idx == -1 or left_idx > static_idx:
        logger.warning("路径中不包含(+~+static的已知格式")
        return q
    between_content = q[left_idx + 1:static_idx].strip()
    if between_content:
        # Replace whatever sits between "(" and the static index.
        return q[:left_idx + 1] + '\\' + q[static_idx:]
    # Nothing in between: just insert the separator.
    return q[:static_idx] + '\\' + q[static_idx:]


def get_docx_content_by_pandoc(docx_file):
    """
    Convert a .docx file to markdown with pandoc and return the cleaned text.

    pandoc writes the markdown to ``./static/markdown/<name>.md``, where
    ``<name>`` is the base name of ``docx_file`` without its extension, and
    extracts embedded media into ``./static/Images/<uuid>/``. Extracted images
    are shrunk with ``resize_images_in_directory``. The markdown is then
    cleaned line by line: blank lines and ``height=...in"`` attribute lines
    are dropped, image lines are reduced to their ``![](...)`` part with a
    rewritten path, ``**`` markers are removed from all other lines, and
    ``\\phantom`` is stripped. The cleaned text is written back to the
    markdown file, which is kept on disk.

    :param docx_file: path of the .docx file to convert
    :return: the cleaned markdown with doubled newlines collapsed and every
        backslash replaced by a forward slash
    :raises subprocess.CalledProcessError: if pandoc exits with an error
    :raises FileNotFoundError: if the pandoc executable cannot be found
    """
    # Unique directory name for the media extracted from this document.
    file_name = uuid.uuid4().hex
    # Base name without directory or extension. basename/splitext also cope
    # with paths that contain dots, such as "./static/upload/a.docx".
    prefix = os.path.splitext(os.path.basename(docx_file))[0]
    temp_markdown = os.path.join('./static/markdown/', prefix + '.md')
    media_dir = './static/Images/' + file_name
    # Create the output locations, including missing parent directories.
    os.makedirs('./static/markdown/', exist_ok=True)
    os.makedirs(media_dir)
    # Convert with pandoc. check=True surfaces a failed conversion instead of
    # silently reading a markdown file left over from an earlier run.
    subprocess.run(['pandoc', docx_file, '-f', 'docx', '-t', 'markdown', '-o', temp_markdown,
                    '--extract-media=' + media_dir], check=True)
    # Shrink the extracted images so that they fit within 640x480.
    resize_images_in_directory(media_dir + '/media')
    # Read the generated markdown and collect the cleaned lines.
    parts = []
    with open(temp_markdown, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            # Skip image-height attribute lines such as
            # height="1.91044072615923in"
            if line.startswith('height=') and line.endswith(('in"}', 'in"')):
                continue
            if _is_image_line(line):
                q = _rewrite_image_line(line)
                logger.debug(f"q3{q}")
                parts.append(q + "\n")
            else:
                parts.append(line.replace("**", "") + "\n")
    content = "".join(parts).replace("\\phantom", "")
    # Write the cleaned content back to the markdown file.
    with open(temp_markdown, 'w', encoding='utf-8') as f:
        f.write(content)
    return content.replace("\n\n", "\n").replace("\\", "/")