|
|
import os
|
|
|
import time
|
|
|
|
|
|
import win32com
|
|
|
from win32com.client import Dispatch
|
|
|
import re
|
|
|
import logging
|
|
|
|
|
|
# Dependencies: pip install pywin32 openpyxl
|
|
|
|
|
|
# Root logger setup: every record from DEBUG upwards goes to app.log.
logging.basicConfig(
    filename='app.log',  # log file name
    filemode='w',  # 'w' overwrites the file on each run; 'a' would append
    level=logging.DEBUG,  # lowest level that gets recorded
    format='%(name)s - %(levelname)s - %(message)s',  # record layout
)
|
|
|
|
|
|
# Root folder that is walked for the county-level Word reports (.docx).
working_dir = r"D:\dsWork\YunNanDsBase\Doc\全省及州市县区人口与教育报告集20241023\133个县区报告2022\县区研究报告"
|
|
|
import openpyxl
|
|
|
|
|
|
# Word application object (COM automation) shared by the whole script.
docApp = Dispatch('Word.Application')
# Suppress Word's alert dialogs (0) while keeping the window visible.
docApp.DisplayAlerts = 0
docApp.Visible = True
|
|
|
|
|
|
# Names of the problematic counties, one per line in ErrorArea.txt.
# A report is skipped when its cleaned area name contains one of these entries.
# Blank lines are dropped on purpose: an empty string is a substring of every
# name, so a single empty entry would cause every county to be skipped.
with open('ErrorArea.txt', 'r', encoding='utf-8') as f:
    errorArea = [name for name in (raw.strip() for raw in f) if name]
|
|
|
|
|
|
# Output root for the exported workbooks. makedirs(exist_ok=True) creates any
# missing parent folders as well and is a no-op when the folder already exists.
excel_dir = r'D:\dsWork\YunNanDsBase\Doc\全省及州市县区人口与教育报告集20241023\133个县区报告2022\Excel'
os.makedirs(excel_dir, exist_ok=True)
|
|
|
# Walk every sub-folder of working_dir, open each county report (.docx) in
# Word and export the data sheet behind every embedded chart to
# excel_dir/<city>/<county>/【n】<caption>.xlsx.

# Value of wdInlineShapeChart in Word's WdInlineShapeType enumeration. A literal
# is used because win32com.client.constants only knows this name once makepy
# support for Word has been generated, which a plain Dispatch does not guarantee.
WD_INLINE_SHAPE_CHART = 12
# Value of wdDoNotSaveChanges in Word's WdSaveOptions enumeration.
WD_DO_NOT_SAVE_CHANGES = 0

# Everything except CJK ideographs (used to clean a report file name).
_NON_CHINESE = re.compile(r'[^\u4e00-\u9fa5]')
# Everything except CJK ideographs, ASCII letters and digits (used for file names).
_NON_FILENAME = re.compile(r'[^\u4e00-\u9fa5a-zA-Z0-9]')


def _read_stripped_lines(path):
    """Return the non-blank lines of the UTF-8 text file *path*, stripped."""
    with open(path, 'r', encoding='utf-8') as handle:
        return [text for text in (raw.strip() for raw in handle) if text]


def _read_replacement_pairs(path):
    """Return (old, new) pairs from *path*; each line is 'old new'.

    Lines without a space-separated second field are logged and ignored
    instead of raising IndexError.
    """
    pairs = []
    for text in _read_stripped_lines(path):
        fields = text.split(' ')
        if len(fields) < 2:
            logging.warning("Ignoring malformed line in %s: %r", path, text)
            continue
        pairs.append((fields[0], fields[1]))
    return pairs


def _clean_area_name(file_name, blank_words, replacement_pairs):
    """Derive the county name from a report file name.

    Returns None when the Chinese part of the name contains none of
    '市', '县' or '区', i.e. the file is not treated as a county report.
    """
    name = _NON_CHINESE.sub('', file_name)
    if '市' not in name and '县' not in name and '区' not in name:
        return None
    for word in blank_words:
        name = name.replace(word, '')
    for old, new in replacement_pairs:
        name = name.replace(old, new)
    return name


def _chart_file_name(index, caption):
    """Build the workbook file name for chart number *index* from its caption."""
    text = caption[1:]  # drop the leading '图'
    if ' ' in text:
        # Keep the part after the first space (the text following the number).
        text = text.split(" ")[1]
    return '【' + str(index) + '】' + _NON_FILENAME.sub('', text) + ".xlsx"


def _export_chart(inline_shape, target_path):
    """Copy the first data sheet of a chart into a new workbook at *target_path*.

    Returns True when the workbook was saved, False when saving failed (the
    failure is logged). COM errors while reading the chart propagate.
    """
    chart_data = inline_shape.Chart.ChartData
    # The chart's workbook must be activated before ChartData.Workbook is used.
    chart_data.Activate()
    source_book = chart_data.Workbook
    copy = openpyxl.Workbook()
    try:
        sheet = source_book.Worksheets(1)
        used = sheet.UsedRange
        row_count = used.Rows.Count
        col_count = used.Columns.Count
        target = copy.active
        for row in range(1, row_count + 1):
            for col in range(1, col_count + 1):
                target.cell(row=row, column=col, value=sheet.Cells(row, col).Value)
        try:
            copy.save(target_path)
        except Exception:
            # Saving stays best-effort, but the failure is recorded.
            logging.exception("Could not save %s", target_path)
            return False
        return True
    finally:
        copy.close()
        # Close the chart's workbook without saving so they do not pile up.
        source_book.Close(False)


def _export_document_charts(doc, out_dir):
    """Export every inline chart of the open Word document *doc* into *out_dir*."""
    # Captions: every paragraph whose text starts with '图', in document order.
    # NOTE(review): the second replace maps a space to a space as written;
    # presumably one side was meant to be a non-breaking or full-width space — confirm.
    captions = []
    for para in doc.Paragraphs:
        text = para.Range.Text.strip().replace("图 ", "图").replace(" ", " ")
        if text.startswith("图"):
            captions.append(text)

    chart_no = 0
    for inline_shape in doc.InlineShapes:
        if inline_shape.Type != WD_INLINE_SHAPE_CHART:
            continue
        # The n-th chart is paired with the n-th caption.
        chart_no += 1
        if chart_no > len(captions):
            logging.warning("No caption for chart %d in %s; chart skipped",
                            chart_no, out_dir)
            continue
        file_name = _chart_file_name(chart_no, captions[chart_no - 1])
        if _export_chart(inline_shape, os.path.join(out_dir, file_name)):
            print("保存文件:" + file_name)


# The replacement tables do not change during the run, so read them once.
blank_words = _read_stripped_lines('replaceBlank.txt')
replacement_pairs = _read_replacement_pairs('replaceText.txt')

try:
    for root, dirs, _files in os.walk(working_dir):
        for dir_name in dirs:
            county_dir = os.path.join(root, dir_name)
            for file_name in os.listdir(county_dir):
                # Only Word documents; names starting with '~' are skipped.
                if not file_name.endswith('.docx') or file_name.startswith('~'):
                    continue
                file_path = os.path.join(county_dir, file_name)

                area_name = _clean_area_name(file_name, blank_words, replacement_pairs)
                if area_name is None:
                    continue
                # City name: the path relative to working_dir, up to "各县".
                city_name = file_path.replace(working_dir, '')[1:].split("各县")[0]

                # Make sure excel_dir/<city>/<county> exists.
                city_dir = os.path.join(excel_dir, city_name)
                os.makedirs(city_dir, exist_ok=True)
                county_sub_dir = os.path.join(city_dir, area_name)
                if os.path.exists(county_sub_dir):
                    # A folder that already holds more than 10 files is skipped.
                    if len(os.listdir(county_sub_dir)) > 10:
                        print(county_sub_dir + " 文件夹下有超过10个文件,跳过")
                        continue
                else:
                    os.makedirs(county_sub_dir)

                # Skip counties on the error list; empty entries are ignored
                # because '' is a substring of every name.
                if any(bad in area_name for bad in errorArea if bad):
                    print(county_sub_dir + " 跳过")
                    continue

                print("正在处理" + city_name + "-" + area_name + "...")
                # Pause 3 seconds so Word is not asked to open documents too quickly.
                time.sleep(3)
                doc = docApp.Documents.Open(file_path)
                try:
                    _export_document_charts(doc, county_sub_dir)
                finally:
                    # Close the report without saving, even when the export failed.
                    doc.Close(WD_DO_NOT_SAVE_CHANGES)
                print(f"县区处理完成:{city_name}{area_name}")
finally:
    # Shut Word down whether or not the run succeeded.
    docApp.Quit()

print("恭喜,所有县区数据整理工作成功完成!")
|