|
|
|
@ -0,0 +1,51 @@
|
|
|
|
|
# pip install pywin32
|
|
|
|
|
# https://blog.csdn.net/weixin_42927998/article/details/115086797
|
|
|
|
|
import os
|
|
|
|
|
|
|
|
|
|
import win32com
|
|
|
|
|
from win32com.client import Dispatch
|
|
|
|
|
|
|
|
|
|
# Working directory: the folder whose .docx files are scanned and repaired
|
|
|
|
|
workingPath = r'D:\dsWork\YunNanDsBase\Doc\全省及州市县区人口与教育报告集20241023\16个州市报告2022\分析报告20240510'
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Value of WdInlineShapeType.wdInlineShapeChart in the Word object model.
# It is hard-coded because win32com.client.constants is only populated after
# makepy/gencache support has been generated for Word, which a plain
# Dispatch() call does not do.
_WD_INLINE_SHAPE_CHART = 12


def repairWord(docPath):
    """Open a Word document through COM and touch every inline chart's data sheet.

    Background (from the original author's testing): for some charts embedded
    in Word documents, Apache POI cannot read the chart data correctly -- the
    worksheet is really "Sheet1", but POI only sees a "Sheet0". Opening the
    document through Word automation, reading each chart's "Sheet1" worksheet
    and closing the document again was observed to make the file readable.

    Args:
        docPath: Path of the .docx file to process. It is converted to an
            absolute path before being handed to Word.

    Raises:
        Any COM error raised while opening the document or reading a chart's
        worksheet propagates to the caller; the document is still closed and
        Word is still shut down first.
    """
    docApp = win32com.client.Dispatch('Word.Application')
    try:
        # Run Word hidden and suppress its dialogs (0 == wdAlertsNone) so the
        # batch never blocks on a prompt.
        docApp.Visible = False
        docApp.DisplayAlerts = 0

        # Word resolves relative paths against its own working directory, not
        # this process's, so always pass an absolute path.
        doc = docApp.Documents.Open(os.path.abspath(docPath))
        try:
            for inline_shape in doc.InlineShapes:
                # Only inline charts carry an embedded workbook.
                if inline_shape.Type != _WD_INLINE_SHAPE_CHART:
                    continue
                # The lookup itself is the point: reading the chart's
                # "Sheet1" through Word is what the repair relies on. The
                # returned worksheet object is not needed afterwards.
                inline_shape.Chart.ChartData.Workbook.Worksheets("Sheet1")
        finally:
            # NOTE(review): Close() is called without an explicit SaveChanges
            # argument, exactly as before; the repair is described as taking
            # effect on save -- confirm Word persists the file here when
            # DisplayAlerts is 0.
            doc.Close()
    finally:
        # Always shut Word down, otherwise a hidden WINWORD.EXE process keeps
        # running and keeps the document locked.
        docApp.Quit()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if __name__ == '__main__':
    # Process every .docx file located directly inside the working directory.
    for file in os.listdir(workingPath):
        if not file.endswith('.docx'):
            continue
        # Word creates owner/lock files named "~$<name>.docx" while a document
        # is open; they are not real documents and cannot be opened.
        if file.startswith('~$'):
            continue

        # Full path of the document
        docPath = os.path.join(workingPath, file)

        # A doubled ".docx.docx" extension is corrected on disk first.
        if docPath.endswith('.docx.docx'):
            # Strip only the trailing duplicate suffix; the rename must go
            # from the existing (wrong) name to the corrected one.
            # os.rename raises if the corrected name already exists on
            # Windows, so an existing file is never overwritten silently.
            fixedPath = docPath[:-len('.docx')]
            os.rename(docPath, fixedPath)
            docPath = fixedPath
            print("文件名有误,已修复:" + docPath)

        print("正在修复文档:" + file)
        # Run the actual repair
        repairWord(docPath)

    print("修复完成")
|