Files
YunNanProject/Tools/T5_ZhaoShengCount.py
2025-09-10 16:14:35 +08:00

238 lines
9.3 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import os
import traceback
from typing import List, Dict, Any, Tuple
import openpyxl
from Config.Config import EXCEL_PATH
from Util.DataUtil import (
init_directories, process_value, print_conversion_stats,
convert_area_name, save_to_json, load_workbook_sheet
)
# ======================== 配置常量 ========================
# 数据目录和JSON路径
DATA_DIR = os.path.join(os.path.dirname(os.path.dirname(__file__)), 'Data')
JSON_PATH = os.path.join(DATA_DIR, 'ZhaoShengCount.json')
# 工作表名称
SHEET_NAME = '招生数'
# 起始行跳过前3行表头
START_ROW = 4
# 年份范围
YEAR_RANGE = [2015, 2016, 2017, 2018, 2019, 2020, 2021, 2022, 2023, 2024]
# 教育阶段配置
EDUCATION_STAGES = [
{
'name': 'preschool',
'chinese_name': '学前教育',
'categories': ['urban', 'town', 'rural', 'total'],
'columns': [
{'year': 2015, 'urban': 'D', 'town': 'E', 'rural': 'F', 'total': 'G'},
{'year': 2016, 'urban': 'H', 'town': 'I', 'rural': 'J', 'total': 'K'},
{'year': 2017, 'urban': 'L', 'town': 'M', 'rural': 'N', 'total': 'O'},
{'year': 2018, 'urban': 'P', 'town': 'Q', 'rural': 'R', 'total': 'S'},
{'year': 2019, 'urban': 'T', 'town': 'U', 'rural': 'V', 'total': 'W'},
{'year': 2020, 'urban': 'X', 'town': 'Y', 'rural': 'Z', 'total': 'AA'},
{'year': 2021, 'urban': 'AB', 'town': 'AC', 'rural': 'AD', 'total': 'AE'},
{'year': 2022, 'urban': 'AF', 'town': 'AG', 'rural': 'AH', 'total': 'AI'},
{'year': 2023, 'urban': 'AJ', 'town': 'AK', 'rural': 'AL', 'total': 'AM'},
{'year': 2024, 'urban': 'AN', 'town': 'AO', 'rural': 'AP', 'total': 'AQ'}
]
},
{
'name': 'primary',
'chinese_name': '小学教育',
'categories': ['urban', 'town', 'rural', 'total'],
'columns': [
{'year': 2015, 'urban': 'AR', 'town': 'AS', 'rural': 'AT', 'total': 'AU'},
{'year': 2016, 'urban': 'AV', 'town': 'AW', 'rural': 'AX', 'total': 'AY'},
{'year': 2017, 'urban': 'AZ', 'town': 'BA', 'rural': 'BB', 'total': 'BC'},
{'year': 2018, 'urban': 'BD', 'town': 'BE', 'rural': 'BF', 'total': 'BG'},
{'year': 2019, 'urban': 'BH', 'town': 'BI', 'rural': 'BJ', 'total': 'BK'},
{'year': 2020, 'urban': 'BL', 'town': 'BM', 'rural': 'BN', 'total': 'BO'},
{'year': 2021, 'urban': 'BP', 'town': 'BQ', 'rural': 'BR', 'total': 'BS'},
{'year': 2022, 'urban': 'BT', 'town': 'BU', 'rural': 'BV', 'total': 'BW'},
{'year': 2023, 'urban': 'BX', 'town': 'BY', 'rural': 'BZ', 'total': 'CA'},
{'year': 2024, 'urban': 'CB', 'town': 'CC', 'rural': 'CD', 'total': 'CE'}
]
},
{
'name': 'junior',
'chinese_name': '初中教育',
'categories': ['urban', 'town', 'rural', 'total'],
'columns': [
{'year': 2015, 'urban': 'CF', 'town': 'CG', 'rural': 'CH', 'total': 'CI'},
{'year': 2016, 'urban': 'CJ', 'town': 'CK', 'rural': 'CL', 'total': 'CM'},
{'year': 2017, 'urban': 'CN', 'town': 'CO', 'rural': 'CP', 'total': 'CQ'},
{'year': 2018, 'urban': 'CR', 'town': 'CS', 'rural': 'CT', 'total': 'CU'},
{'year': 2019, 'urban': 'CV', 'town': 'CW', 'rural': 'CX', 'total': 'CY'},
{'year': 2020, 'urban': 'CZ', 'town': 'DA', 'rural': 'DB', 'total': 'DC'},
{'year': 2021, 'urban': 'DD', 'town': 'DE', 'rural': 'DF', 'total': 'DG'},
{'year': 2022, 'urban': 'DH', 'town': 'DI', 'rural': 'DJ', 'total': 'DK'},
{'year': 2023, 'urban': 'DL', 'town': 'DM', 'rural': 'DN', 'total': 'DO'},
{'year': 2024, 'urban': 'DP', 'town': 'DQ', 'rural': 'DR', 'total': 'DS'}
]
},
{
'name': 'senior',
'chinese_name': '高中教育',
'categories': ['urban', 'town', 'rural', 'total'],
'columns': [
{'year': 2015, 'urban': 'DT', 'town': 'DU', 'rural': 'DV', 'total': 'DW'},
{'year': 2016, 'urban': 'DX', 'town': 'DY', 'rural': 'DZ', 'total': 'EA'},
{'year': 2017, 'urban': 'EB', 'town': 'EC', 'rural': 'ED', 'total': 'EE'},
{'year': 2018, 'urban': 'EF', 'town': 'EG', 'rural': 'EH', 'total': 'EI'},
{'year': 2019, 'urban': 'EJ', 'town': 'EK', 'rural': 'EL', 'total': 'EM'},
{'year': 2020, 'urban': 'EN', 'town': 'EO', 'rural': 'EP', 'total': 'EQ'},
{'year': 2021, 'urban': 'ER', 'town': 'ES', 'rural': 'ET', 'total': 'EU'},
{'year': 2022, 'urban': 'EV', 'town': 'EW', 'rural': 'EX', 'total': 'EY'},
{'year': 2023, 'urban': 'EZ', 'town': 'FA', 'rural': 'FB', 'total': 'FC'},
{'year': 2024, 'urban': 'FD', 'town': 'FE', 'rural': 'FF', 'total': 'FG'}
]
},
{
'name': 'vocational',
'chinese_name': '中职教育',
'categories': ['total'],
'columns': [
{'year': 2015, 'total': 'FH'},
{'year': 2016, 'total': 'FI'},
{'year': 2017, 'total': 'FJ'},
{'year': 2018, 'total': 'FK'},
{'year': 2019, 'total': 'FL'},
{'year': 2020, 'total': 'FM'},
{'year': 2021, 'total': 'FN'},
{'year': 2022, 'total': 'FO'},
{'year': 2023, 'total': 'FP'},
{'year': 2024, 'total': 'FQ'}
]
}
]
def process_value(value):
"""处理单元格值,转换为适当类型"""
if value is None:
return 0
# 转换为字符串处理
str_value = str(value).strip()
# 处理空字符串
if not str_value:
return 0
# 处理特殊标记
if str_value in ['####', 'NA', 'N/A', '', ' ']:
return 0
# 处理数字
try:
# 移除千分位逗号
if ',' in str_value:
str_value = str_value.replace(',', '')
return int(float(str_value))
except ValueError:
return 0
def extract_enrollment_data(sheet: openpyxl.worksheet.worksheet.Worksheet) -> Tuple[List[Dict[str, Any]], List[Dict[str, str]], List[str]]:
"""提取招生数据并处理区域名称转换
Args:
sheet: 工作表对象
Returns:
Tuple包含:
- enrollment_data: 提取的招生数据列表
- conversion_records: 区域名称转换记录
- errors: 错误信息列表
"""
enrollment_data: List[Dict[str, Any]] = []
conversion_records: List[Dict[str, str]] = []
errors: List[str] = []
processed_count = 0
for row_num, row in enumerate(sheet.iter_rows(min_row=START_ROW, values_only=True), start=START_ROW):
# 获取区域名称B列索引1
raw_name = row[1] if (len(row) > 1 and row[1] is not None) else '未知地区'
if not raw_name: # 跳过空行
continue
# 转换区域名称
area_name, area_code, row_conversions, row_errors = convert_area_name(raw_name, row_num)
conversion_records.extend(row_conversions)
errors.extend(row_errors)
# 创建区域数据对象
area_info = {
'area_name': area_name,
'area_code': area_code,
'raw_name': raw_name,
'education_data': {}
}
# 提取各教育阶段招生数据
for stage in EDUCATION_STAGES:
stage_name = stage['name']
stage_data = {}
categories = stage['categories']
for year_config in stage['columns']:
year = year_config['year']
year_data = {}
for category in categories:
col_name = year_config[category]
col_idx = openpyxl.utils.column_index_from_string(col_name) - 1
value = row[col_idx] if col_idx < len(row) else None
year_data[category] = process_value(value)
stage_data[str(year)] = year_data
area_info['education_data'][stage_name] = stage_data
enrollment_data.append(area_info)
processed_count += 1
# 进度提示每10行
if processed_count % 10 == 0:
print(f"🔄 已处理 {processed_count} 条数据...")
return enrollment_data, conversion_records, errors
# ======================== 主函数 ========================
def main() -> None:
"""主函数:加载数据并执行提取流程"""
try:
# 初始化目录
init_directories(DATA_DIR)
# 加载工作表
sheet = load_workbook_sheet(EXCEL_PATH, SHEET_NAME)
if not sheet:
return
print(f"✅ 成功加载Excel文件{EXCEL_PATH}")
print(f"✅ 开始处理招生数数据,共{sheet.max_row}行数据")
# 提取招生数据
enrollment_data, conversion_records, errors = extract_enrollment_data(sheet)
# 打印转换统计
print_conversion_stats(conversion_records, errors)
# 保存数据到JSON
save_to_json(enrollment_data, JSON_PATH)
except Exception as e:
print(f"🔴 处理数据时发生错误:{str(e)}")
print(traceback.format_exc())
if __name__ == '__main__':
main()