# baseService/ETL/Mars/Utils/OfficeUtil.py

from docx.shared import Cm
from pptx.enum.shapes import MSO_SHAPE_TYPE
import hashlib
from docx import Document
from pptx import Presentation


def WordReplaceText(doc, tag, pv):
    """Replace ``tag`` with ``pv`` in every body paragraph of ``doc``.

    A placeholder may be split across several runs, so run texts are
    accumulated until the accumulated text contains ``tag``. The run that
    completes the placeholder receives the accumulated text with ``tag``
    replaced; the runs accumulated before it are emptied. Text left over
    after the last match ends up in the paragraph's final run.

    Args:
        doc: Document-like object exposing ``paragraphs``.
        tag: Placeholder text to look for. An empty tag is ignored.
        pv: Replacement value; converted with ``str()`` so numbers work too.
    """
    if not tag:
        # An empty tag is "contained" in every string and would splice pv
        # between all characters, so treat it as nothing to do.
        return
    replacement = str(pv)

    for paragraph in doc.paragraphs:
        if tag not in paragraph.text:
            continue
        pending = ''  # text of the runs merged since the last match
        runs = paragraph.runs
        last_index = len(runs) - 1
        for i, run in enumerate(runs):
            pending += run.text
            if tag in pending:
                # The placeholder is complete: this run takes over the merged
                # text with the placeholder substituted.
                run.text = pending.replace(tag, replacement)
                pending = ''
            elif i == last_index:
                # No further match is possible; keep the remaining text in
                # the final run. Assigning ``text`` (rather than add_text)
                # lets the library re-create tabs and line breaks.
                run.text = pending
            else:
                # Text has been carried over into ``pending``; blank the run.
                run.text = ''


def WordReplacePic(doc, tag, pic, width=14.63):
    """Replace each body paragraph containing ``tag`` with a picture.

    Every run of a matching paragraph is cleared, and the picture is added
    to the first run only, so the whole paragraph content (not just the
    placeholder) is replaced by the image.

    Args:
        doc: Document-like object exposing ``paragraphs``.
        tag: Placeholder text identifying the paragraphs to replace.
        pic: Image handed to ``run.add_picture``.
        width: Picture width in centimetres.
    """
    tagged_paragraphs = (p for p in doc.paragraphs if tag in p.text)
    for paragraph in tagged_paragraphs:
        for position, run in enumerate(paragraph.runs):
            run.clear()
            if position == 0:
                # Only the first run carries the picture.
                run.add_picture(pic, width=Cm(width))

def WordReplaceTextInTable(doc, tag, pv):
    """Fill table-cell paragraphs that contain ``tag`` with ``pv``.

    Each paragraph of each cell is checked; when its text contains ``tag``
    all of its runs are cleared and ``pv`` is written into the first run.
    The whole paragraph text is therefore replaced, not only the
    placeholder. Per the original author's note, runs are edited (instead
    of assigning ``cell.text``) in order to keep the cell's formatting.

    Args:
        doc: Document-like object exposing ``tables``.
        tag: Placeholder text to look for. An empty tag is ignored.
        pv: Replacement value; converted with ``str()`` so numbers work too.
    """
    if not tag:
        # An empty tag is contained in every paragraph and would overwrite
        # every cell, so treat it as nothing to do.
        return
    replacement = str(pv)

    for table in doc.tables:
        for row in table.rows:
            for cell in row.cells:
                for paragraph in cell.paragraphs:
                    if tag not in paragraph.text:
                        continue
                    for position, run in enumerate(paragraph.runs):
                        run.clear()
                        if position == 0:
                            # Only the first run receives the new text.
                            run.add_text(replacement)


def PptReplaceText(prs, search_str, repl_str):
    """Replace ``search_str`` with ``repl_str`` in the text of every slide.

    Shapes placed directly on a slide are edited run by run, in every
    paragraph of their text frame; a placeholder split across several runs
    is not matched there. Shapes inside a group shape are edited paragraph
    by paragraph by assigning the paragraph text. Groups nested inside
    another group are not searched.

    Args:
        prs: Presentation-like object exposing ``slides``.
        search_str: Text to look for; converted with ``str()``. ``None`` or
            an empty string means there is nothing to do.
        repl_str: Replacement value; converted with ``str()``.
    """
    search = "" if search_str is None else str(search_str)
    if not search:
        # An empty needle matches everywhere and would splice the
        # replacement between all characters.
        return
    replacement = str(repl_str)

    for slide in prs.slides:
        for shape in slide.shapes:
            if not hasattr(shape, "text") or search not in shape.text:
                continue
            # Walk every paragraph: the match reported by shape.text may sit
            # in any of them, not just the first.
            for paragraph in shape.text_frame.paragraphs:
                for run in paragraph.runs:
                    if search in run.text:
                        run.text = run.text.replace(search, replacement)

        for group_shape in slide.shapes:
            if group_shape.shape_type != MSO_SHAPE_TYPE.GROUP:
                continue
            for shape in group_shape.shapes:
                if not shape.has_text_frame or search not in shape.text:
                    continue
                for paragraph in shape.text_frame.paragraphs:
                    current = paragraph.text
                    if search in current:
                        paragraph.text = current.replace(search, replacement)


def PptReplacePic(prs, newpic, oldpic):
    """Swap every slide picture whose bytes match ``oldpic`` for ``newpic``.

    Pictures are compared by the MD5 digest of their image bytes. For each
    match a new picture is added to the slide with the old shape's left,
    top, width and height, and the old shape's element is removed from its
    parent. Only shapes placed directly on a slide are examined.

    Args:
        prs: Presentation-like object exposing ``slides``.
        newpic: Image handed to ``slide.shapes.add_picture``.
        oldpic: Path of the sample image file to search for.
    """
    # Fingerprint of the sample image; the context manager closes the file
    # as soon as its bytes have been read.
    with open(oldpic, "rb") as image_file:
        target_digest = hashlib.md5(image_file.read()).hexdigest()

    for slide in prs.slides:
        # Iterate over a snapshot: shapes are added and removed in the loop.
        for shape in list(slide.shapes):
            try:
                digest = hashlib.md5(shape.image.blob).hexdigest()
            except Exception:
                # Best effort: a shape that cannot supply image bytes is not
                # a candidate and is skipped.
                continue
            if digest != target_digest:
                continue
            slide.shapes.add_picture(
                newpic, shape.left, shape.top, shape.width, shape.height
            )
            element = shape.element
            element.getparent().remove(element)


def ReplaceTxtInTable(ppt, oldStr, newStr):
    """Overwrite table cells whose text equals ``oldStr`` with ``newStr``.

    Every shape on every slide is checked; for shapes that hold a table,
    each cell is compared by exact equality (not substring) and matching
    cells get their text assigned to ``newStr``.

    Args:
        ppt: Presentation-like object exposing ``slides``.
        oldStr: Cell text to look for.
        newStr: Text assigned to each matching cell.
    """
    for slide in ppt.slides:
        for shape in slide.shapes:
            if not shape.has_table:
                continue
            table = shape.table
            for row_index in range(len(table.rows)):
                for col_index in range(len(table.columns)):
                    cell = table.cell(row_index, col_index)
                    if cell.text == oldStr:
                        cell.text = newStr