You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

129 lines
6.6 KiB

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

import os
import win32com
from win32com.client import Dispatch
import re
import logging
# pip install pywin32 openpyxl
# pip install pywin32
# Send all log records (DEBUG and above) to app.log.
# filemode 'w' truncates the file on every run; use 'a' to append instead.
logging.basicConfig(
    format='%(name)s - %(levelname)s - %(message)s',  # record layout
    filemode='w',  # overwrite the previous run's log
    filename='app.log',  # log file, relative to the current directory
    level=logging.DEBUG,  # lowest severity that is recorded
)
# Root folder that holds the per-county Word reports (one sub-folder per group).
working_dir = r"D:\dsWork\YunNanDsBase\Doc\全省及州市县区人口与教育报告集20241023\133个县区报告2022\县区研究报告"
import openpyxl
# Start a Word instance through COM; it is reused for every document.
docApp = win32com.client.Dispatch('Word.Application')
# Keep Word hidden and suppress its dialogs so the batch run is unattended.
docApp.Visible = False
docApp.DisplayAlerts = 0
# Output folder for the exported Excel files.
excel_dir = r'D:\dsWork\YunNanDsBase\Doc\全省及州市县区人口与教育报告集20241023\133个县区报告2022\Excel'
# makedirs(exist_ok=True) also creates missing parent folders and does not
# fail when the folder already exists (no check-then-create race).
os.makedirs(excel_dir, exist_ok=True)
# Walk every sub-folder of working_dir and, for each county report (.docx),
# export the data behind every embedded chart to its own .xlsx file under
# excel_dir/<city>/<county>/.


def _load_name_rules():
    """Read the county-name clean-up rules once (not once per document).

    Returns a pair ``(blank_words, replacements)``:
    * ``blank_words`` - strings from replaceBlank.txt that are removed from
      the county name;
    * ``replacements`` - ``(old, new)`` pairs from replaceText.txt, where each
      line holds two fields separated by a single space.
    """
    with open('replaceBlank.txt', 'r', encoding='utf-8') as f:
        # Empty lines are dropped: replacing '' with '' changes nothing.
        blank_words = [word for word in (line.strip() for line in f) if word]
    replacements = []
    with open('replaceText.txt', 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            parts = line.split(' ')
            if len(parts) < 2:
                # A blank or one-field line used to raise IndexError and
                # abort the whole run; skip it instead.
                if line:
                    logging.warning("replaceText.txt: ignoring malformed line %r", line)
                continue
            replacements.append((parts[0], parts[1]))
    return blank_words, replacements


def _chart_file_name(caption, chart_no):
    """Build the .xlsx file name for chart number ``chart_no`` from its caption."""
    # Drop the first character of the caption (the caption prefix).
    caption = caption[1:]
    # If the caption still contains a space, keep the part after the first one.
    if ' ' in caption:
        caption = caption.split(" ")[1]
    # Keep only Chinese characters, ASCII letters and digits.
    filtered_string = re.sub(r'[^\u4e00-\u9fa5a-zA-Z0-9]', '', caption)
    # NOTE(review): the two literals around the number are empty in this copy
    # of the file; they look like lost delimiter characters - confirm.
    return '' + str(chart_no) + '' + filtered_string + ".xlsx"


def _export_chart(inline_shape, target_path):
    """Copy the first worksheet of a chart's embedded workbook to ``target_path``."""
    # NOTE(review): Microsoft's ChartData documentation says Activate() must
    # be called before Workbook is referenced; the original code does not do
    # so and this is left unchanged - confirm against the real documents.
    sheet = inline_shape.Chart.ChartData.Workbook.Worksheets(1)
    # Read the extent once; each access is a COM round trip.
    used_range = sheet.UsedRange
    row_count = used_range.Rows.Count
    col_count = used_range.Columns.Count
    wb = openpyxl.Workbook()
    ws = wb.active
    for row in range(1, row_count + 1):
        for col in range(1, col_count + 1):
            ws.cell(row=row, column=col, value=sheet.Cells(row, col).Value)
    wb.save(target_path)
    wb.close()


# The constants table is only populated when a makepy cache for Word exists;
# with a late-bound Dispatch it may be empty, so fall back to the documented
# value of WdInlineShapeType.wdInlineShapeChart (12).
chart_shape_type = getattr(win32com.client.constants, 'wdInlineShapeChart', 12)

try:
    blank_words, replacements = _load_name_rules()
    for root, dirs, _files in os.walk(working_dir):
        for dir_name in dirs:
            county_dir = os.path.join(root, dir_name)
            for file in os.listdir(county_dir):
                # Only Word documents; names starting with '~' are Word's
                # temporary lock files.
                if not file.endswith('.docx') or file.startswith('~'):
                    continue
                file_path = os.path.join(county_dir, file)
                # City name = the path relative to working_dir, up to "各县".
                cityName = file_path.replace(working_dir, '')[1:].split("各县")[0]
                # County name = the Chinese characters of the file name.
                areaName = re.sub(r'[^\u4e00-\u9fa5]', '', file)
                # NOTE(review): the three literals below are empty in this copy
                # of the file, so this filter never skips anything ('' is in
                # every string). They look like lost marker characters -
                # restore them from the original source.
                if '' not in areaName and '' not in areaName and '' not in areaName:
                    continue
                for word in blank_words:
                    areaName = areaName.replace(word, '')
                for old, new in replacements:
                    areaName = areaName.replace(old, new)
                # One folder per city, one sub-folder per county.
                city_dir = os.path.join(excel_dir, cityName)
                os.makedirs(city_dir, exist_ok=True)
                county_sub_dir = os.path.join(city_dir, areaName)
                if os.path.exists(county_sub_dir):
                    # An existing county folder means it was handled already.
                    print(areaName + "已经存在,跳过")
                    continue
                os.mkdir(county_sub_dir)

                doc = docApp.Documents.Open(file_path)
                try:
                    # Collect the chart captions in document order.
                    # NOTE(review): the replace() arguments and the prefix
                    # passed to startswith() are empty/plain spaces in this
                    # copy, so EVERY paragraph is collected. The original
                    # comment says captions start with "图" plus a number;
                    # restore the lost characters before trusting the names.
                    captions = []
                    for para in doc.Paragraphs:
                        text = para.Range.Text.strip().replace("", "").replace(" ", " ")
                        if text.startswith(""):
                            captions.append(text)

                    # The n-th chart is paired with the n-th caption. The
                    # counter advances for every chart, even one that fails,
                    # so later charts keep their own caption and number.
                    chart_no = 0
                    for inline_shape in doc.InlineShapes:
                        if inline_shape.Type != chart_shape_type:
                            continue
                        chart_no += 1
                        if chart_no > len(captions):
                            # No caption left for this chart: skip it.
                            logging.warning("%s %s: no caption for chart %d, skipped",
                                            cityName, areaName, chart_no)
                            continue
                        fileName = _chart_file_name(captions[chart_no - 1], chart_no)
                        try:
                            # Use the shape being iterated; indexing
                            # doc.InlineShapes with the chart counter picks the
                            # wrong shape when the document also has pictures.
                            _export_chart(inline_shape, county_sub_dir + '/' + fileName)
                        except Exception as err:
                            # Best effort: log and go on with the next chart.
                            # Broken chart links in WPS-authored documents are
                            # a known cause, see
                            # https://blog.csdn.net/Cornergrass/article/details/129960822
                            logging.exception(cityName + " " + areaName + "发生异常:" + str(err))
                            continue
                        print("保存文件:" + fileName)
                finally:
                    # Always release the document, even if processing failed.
                    doc.Close()
                print(f"县区处理完成:{cityName}{areaName}")
finally:
    # Always shut Word down so no hidden WINWORD process is left behind.
    docApp.Quit()
print("恭喜,所有县区数据整理工作成功完成!")