Files
YunNanProject/Tools/T4_2_MaoRuXueLv.py
2025-09-10 11:15:03 +08:00

173 lines
7.6 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import openpyxl # 添加缺少的导入
import json
import os
from Config.Config import EXCEL_PATH
from Util.AreaUtil import query_area_info
# Output directory: the 'Data' folder inside the parent of this script's
# directory. It is created up front so the JSON dump below cannot fail on a
# missing directory.
_script_dir = os.path.dirname(__file__)
DATA_DIR = os.path.join(os.path.dirname(_script_dir), 'Data')
os.makedirs(DATA_DIR, exist_ok=True)

# Destination file for the extracted gross-enrollment-rate data.
JSON_PATH = os.path.join(DATA_DIR, 'MaoRuXueLv.json')

# Source workbook path, taken from the project configuration.
file_name = EXCEL_PATH

# Accumulators filled while the sheet is processed:
#   enrollment_data        - one record per area row
#   name_conversion_errors - names that could not be resolved
#   conversion_records     - names that were changed by the lookup
enrollment_data, name_conversion_errors, conversion_records = [], [], []
def parse_cell_value(value):
    """Convert one worksheet cell into a JSON-safe number.

    Args:
        value: Raw cell content as returned by ``iter_rows(values_only=True)``
            (``None``, a number, a string, or any other object).

    Returns:
        An ``int`` or ``float``. ``0`` is returned for empty cells, the
        ``'####'`` placeholder, booleans, text that cannot be parsed as a
        number, and non-finite values (NaN / infinity), so the result can
        always be serialised as valid JSON.
    """
    import math

    if value is None or isinstance(value, bool):
        return 0
    if isinstance(value, (int, float)):
        # Keep native numbers as-is; round-tripping through str() loses values
        # whose repr uses exponent notation (e.g. 1e-05 has no '.').
        number = value
    else:
        text = str(value).strip()
        if text == '' or text == '####':
            return 0
        try:
            if '%' in text:
                # Only the percent sign is removed; the value is NOT divided
                # by 100 (e.g. '95.5%' -> 95.5).
                number = float(text.replace('%', ''))
            else:
                try:
                    number = int(text)
                except ValueError:
                    number = float(text)
        except (ValueError, TypeError):
            return 0
    if isinstance(number, float) and not math.isfinite(number):
        return 0
    return number


# Column layout of the '毛入学率' sheet. For every education stage the sheet
# alternates a head-count column and a rate column, one pair per year
# (2015-2024).
data_columns = {
    # Preschool: columns D-W
    'preschool_enrollment': {
        'columns': ['D', 'F', 'H', 'J', 'L', 'N', 'P', 'R', 'T', 'V'],
        'years': [2015, 2016, 2017, 2018, 2019, 2020, 2021, 2022, 2023, 2024]
    },
    'preschool_enrollment_rate': {
        'columns': ['E', 'G', 'I', 'K', 'M', 'O', 'Q', 'S', 'U', 'W'],
        'years': [2015, 2016, 2017, 2018, 2019, 2020, 2021, 2022, 2023, 2024]
    },
    # Primary school: columns X-AQ
    'primary_enrollment': {
        'columns': ['X', 'Z', 'AB', 'AD', 'AF', 'AH', 'AJ', 'AL', 'AN', 'AP'],
        'years': [2015, 2016, 2017, 2018, 2019, 2020, 2021, 2022, 2023, 2024]
    },
    'primary_enrollment_rate': {
        'columns': ['Y', 'AA', 'AC', 'AE', 'AG', 'AI', 'AK', 'AM', 'AO', 'AQ'],
        'years': [2015, 2016, 2017, 2018, 2019, 2020, 2021, 2022, 2023, 2024]
    },
    # Junior high school: columns AR-BK
    'junior_high_enrollment': {
        'columns': ['AR', 'AT', 'AV', 'AX', 'AZ', 'BB', 'BD', 'BF', 'BH', 'BJ'],
        'years': [2015, 2016, 2017, 2018, 2019, 2020, 2021, 2022, 2023, 2024]
    },
    'junior_high_enrollment_rate': {
        'columns': ['AS', 'AU', 'AW', 'AY', 'BA', 'BC', 'BE', 'BG', 'BI', 'BK'],
        'years': [2015, 2016, 2017, 2018, 2019, 2020, 2021, 2022, 2023, 2024]
    },
    # Regular senior high school: columns BL-CE
    'senior_high_enrollment': {
        'columns': ['BL', 'BN', 'BP', 'BR', 'BT', 'BV', 'BX', 'BZ', 'CB', 'CD'],
        'years': [2015, 2016, 2017, 2018, 2019, 2020, 2021, 2022, 2023, 2024]
    },
    'senior_high_enrollment_rate': {
        'columns': ['BM', 'BO', 'BQ', 'BS', 'BU', 'BW', 'BY', 'CA', 'CC', 'CE'],
        'years': [2015, 2016, 2017, 2018, 2019, 2020, 2021, 2022, 2023, 2024]
    },
    # Secondary vocational education: columns CF-CY
    'vocational_enrollment': {
        'columns': ['CF', 'CH', 'CJ', 'CL', 'CN', 'CP', 'CR', 'CT', 'CV', 'CX'],
        'years': [2015, 2016, 2017, 2018, 2019, 2020, 2021, 2022, 2023, 2024]
    },
    'vocational_enrollment_rate': {
        'columns': ['CG', 'CI', 'CK', 'CM', 'CO', 'CQ', 'CS', 'CU', 'CW', 'CY'],
        'years': [2015, 2016, 2017, 2018, 2019, 2020, 2021, 2022, 2023, 2024]
    }
}

# Extract the sheet into enrollment_data, dump it to JSON_PATH and print a
# summary of the area-name conversions. Errors are reported on stdout; a
# missing sheet terminates the process with exit status 1.
workbook = None
try:
    # Read-only workbooks keep the file handle open until close() is called,
    # hence the `finally` clause at the bottom.
    workbook = openpyxl.load_workbook(file_name, read_only=True)
    if '毛入学率' not in workbook.sheetnames:
        print("❌ 错误:未找到'毛入学率'Sheet")
        # SystemExit is not an Exception subclass, so it bypasses the generic
        # handler below while still running `finally`.
        raise SystemExit(1)
    sheet = workbook['毛入学率']

    # Resolve column letters to zero-based row indices once, instead of
    # repeating the conversion for every cell of every row.
    metric_layout = {
        metric: [
            (openpyxl.utils.column_index_from_string(col_name) - 1, str(year))
            for col_name, year in zip(config['columns'], config['years'])
        ]
        for metric, config in data_columns.items()
    }

    # Data starts on row 5; the first four rows are headers.
    for row_num, row in enumerate(sheet.iter_rows(min_row=5, values_only=True), start=5):
        # The area name lives in column B (index 1).
        name_cell = row[1] if len(row) > 1 else None
        if name_cell is None:
            if all(cell is None for cell in row):
                # Completely blank row: skip it instead of emitting an
                # all-zero placeholder record.
                continue
            # Row carries data but no name: keep it under a placeholder name.
            raw_name = '未知地区'
        elif not name_cell:
            # Empty-string (or otherwise falsy) name cell: treat as blank row.
            continue
        else:
            raw_name = name_cell

        # Normalise to a stripped string before the lookup.
        str_raw_name = str(raw_name).strip()
        area_info = query_area_info(str_raw_name)
        if area_info:
            area_name = area_info['full_name']
            area_code = area_info['area_code']
            # Compare the stripped name so that surrounding whitespace alone
            # is not reported as a conversion.
            if str_raw_name != area_name:
                conversion_records.append({
                    'row': row_num,
                    'raw_name': raw_name,
                    'converted_name': area_name
                })
        else:
            area_name = raw_name
            area_code = 'unknown'
            name_conversion_errors.append(f"{row_num}: '{raw_name}'")

        area_data = {
            'area_name': area_name,
            'area_code': area_code,
            'raw_name': raw_name  # original cell value, kept for debugging
        }

        # Collect the per-year values of every metric. Columns beyond the end
        # of a short row are left out of the year mapping.
        for metric, layout in metric_layout.items():
            area_data[metric] = {
                year_key: parse_cell_value(row[col_idx])
                for col_idx, year_key in layout
                if col_idx < len(row)
            }
        enrollment_data.append(area_data)

    # Persist the extracted records.
    with open(JSON_PATH, 'w', encoding='utf-8') as f:
        json.dump(enrollment_data, f, ensure_ascii=False, indent=2)

    # Summary of the extraction.
    print(f"✅ 毛入学率数据提取完成,已保存至:{JSON_PATH}")
    print(f"📊 共处理 {len(enrollment_data)} 条地区数据")

    # Report which names were rewritten by the lookup and which failed.
    print("\n=== 名称转换记录 ===")
    if conversion_records:
        for record in conversion_records:
            print(f"🔄 行 {record['row']}: {record['raw_name']} → {record['converted_name']}")
        print(f"📊 共检测到 {len(conversion_records)} 项名称转换")
    else:
        print("📝 不存在名称转换的情况")
    if name_conversion_errors:
        print(f"⚠️ 发现 {len(name_conversion_errors)} 个区域名称转换失败:")
        for error in name_conversion_errors:
            print(f" - {error}")
    else:
        print("✅ 所有区域名称均成功转换为全称")
except FileNotFoundError:
    print(f"🔴 错误Excel文件 '{file_name}' 不存在")
except Exception as e:
    # Top-level boundary of the script: report the failure instead of
    # surfacing a traceback.
    print(f"🔴 处理数据时发生错误:{str(e)}")
finally:
    if workbook is not None:
        workbook.close()