You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

108 lines
4.4 KiB

import datetime
import logging
from typing import Any, Collection, Dict, List, Optional
class DataMapper:
    """Map raw MySQL rows to dictionaries of ClickHouse-compatible values.

    Columns are classified as "date" or "unsigned integer" by name. Each
    classification is the union of the lists given to the constructor
    (shared by every table) and the lists given per call to ``map_row``
    (specific to one table). Columns in neither set pass through unchanged.
    """

    # Inclusive bounds enforced by ``_convert_datetime``. MIN_DATE doubles as
    # the placeholder for NULL and unparseable dates.
    # NOTE(review): presumably chosen to sit inside ClickHouse's DateTime
    # range -- confirm against the target column types.
    MIN_DATE = datetime.datetime(1970, 1, 1)
    MAX_DATE = datetime.datetime(2105, 12, 31, 23, 59, 59)

    # Formats tried, in order, when a date value arrives as a string.
    _DATE_FORMATS = ('%Y-%m-%d %H:%M:%S', '%Y-%m-%d', '%Y%m%d %H%M%S')

    def __init__(self,
                 date_columns: Optional[List[str]] = None,
                 uint64_columns: Optional[List[str]] = None):
        """Create a mapper.

        :param date_columns: names of date columns common to all tables
        :param uint64_columns: names of integer columns common to all tables
        """
        # Copy so later mutation of the caller's lists cannot change the
        # mapper's configuration behind its back.
        self.base_date_columns = list(date_columns) if date_columns else []
        self.base_uint64_columns = list(uint64_columns) if uint64_columns else []
        self.logger = logging.getLogger(__name__)

    def map_row(self,
                columns: List[str],
                row: tuple,
                date_columns: Optional[List[str]] = None,
                uint_columns: Optional[List[str]] = None) -> Dict[str, Any]:
        """Map a single row to a ``{column name: converted value}`` dict.

        ``columns`` and ``row`` are paired positionally with ``zip``, so
        surplus entries on the longer side are ignored.

        :param columns: column names, in the same order as ``row``
        :param row: raw MySQL row
        :param date_columns: date columns specific to this table
        :param uint_columns: integer columns specific to this table
        :return: dict of converted values; a value whose conversion raises
            is logged and kept as-is
        """
        # Merge shared and table-specific names once per row; frozensets give
        # O(1) membership tests inside the per-column loop.
        all_date_columns = frozenset(
            self._merge_columns(self.base_date_columns, date_columns))
        all_uint_columns = frozenset(
            self._merge_columns(self.base_uint64_columns, uint_columns))

        mapped: Dict[str, Any] = {}
        for col_name, value in zip(columns, row):
            try:
                if value is None:
                    mapped[col_name] = self._handle_null(
                        col_name, all_uint_columns, all_date_columns)
                elif col_name in all_date_columns:
                    mapped[col_name] = self._convert_datetime(value)
                elif col_name in all_uint_columns:
                    mapped[col_name] = self._convert_uint(value)
                else:
                    mapped[col_name] = value
            except Exception as e:
                # Deliberate best effort: one bad field must not abort the
                # whole row, so log it and keep the raw value.
                self.logger.warning(f"字段 {col_name} 转换异常: {str(e)}")
                mapped[col_name] = value
        return mapped

    def _merge_columns(self,
                       base: List[str],
                       specific: Optional[List[str]] = None) -> List[str]:
        """Return the union of ``base`` and ``specific`` as a new list.

        With ``specific`` omitted the result is a plain copy of ``base``.
        Otherwise duplicates are dropped and names keep first-seen order, so
        the result is deterministic.
        """
        if specific is None:
            return list(base)
        return list(dict.fromkeys([*base, *specific]))

    def _convert_datetime(self, value) -> datetime.datetime:
        """Convert ``value`` to a naive, whole-second, in-range datetime.

        Accepts ``datetime.datetime`` (tzinfo is dropped without shifting the
        clock time), ``datetime.date`` (taken as midnight) and strings in one
        of ``_DATE_FORMATS``. Anything else, or an unparseable string, is
        logged and replaced by ``MIN_DATE``. A parsed value outside
        ``[MIN_DATE, MAX_DATE]`` is logged and clamped to the nearer bound.
        """
        converted = self._parse_datetime(value)
        if converted is None:
            self.logger.warning(f"日期转换异常: {value}, 使用默认值")
            return self.MIN_DATE

        if converted < self.MIN_DATE or converted > self.MAX_DATE:
            self.logger.warning(f"日期 {converted} 超出范围,使用默认值")
            return self.MIN_DATE if converted < self.MIN_DATE else self.MAX_DATE

        return converted.replace(microsecond=0)

    def _parse_datetime(self, value) -> Optional[datetime.datetime]:
        """Return ``value`` as a naive datetime, or ``None`` if unsupported."""
        # datetime must be tested before date: it is a subclass of date.
        if isinstance(value, datetime.datetime):
            return value.replace(tzinfo=None)
        if isinstance(value, datetime.date):
            return datetime.datetime.combine(value, datetime.time.min)
        if isinstance(value, str):
            for fmt in self._DATE_FORMATS:
                try:
                    return datetime.datetime.strptime(value, fmt)
                except ValueError:
                    continue
        return None

    def _convert_uint(self, value) -> int:
        """Convert ``value`` to ``int``.

        Accepts ints (``bool`` included, normalised to 0/1), strings made up
        only of digits, and floats (truncated toward zero by ``int``).
        Negative ints are returned unchanged; no sign check is performed.

        :raises TypeError: if ``value`` is none of the accepted forms
        """
        if isinstance(value, int):
            return int(value)
        if isinstance(value, str) and value.isdigit():
            return int(value)
        if isinstance(value, float):
            return int(value)
        raise TypeError(f"无法转换为整型: {type(value)} {value}")

    def _handle_null(self,
                     col_name: str,
                     uint_columns: Collection[str],
                     date_columns: Optional[Collection[str]] = None) -> Any:
        """Return the placeholder that replaces a NULL in ``col_name``.

        Integer columns get ``0``. Date columns get ``MIN_DATE``; a column
        counts as a date column when it is listed in ``date_columns`` or,
        failing that, when its lower-cased name ends with ``time`` or starts
        with ``date``. Every other column gets an empty string.

        :param col_name: name of the column holding the NULL
        :param uint_columns: names of all integer columns for this row
        :param date_columns: names of all date columns for this row; when
            omitted only the name heuristic is used
        """
        if col_name in uint_columns:
            return 0
        if date_columns is not None and col_name in date_columns:
            return self.MIN_DATE
        lowered = col_name.lower()
        if lowered.endswith('time') or lowered.startswith('date'):
            return self.MIN_DATE
        return ''