You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
108 lines
4.4 KiB
108 lines
4.4 KiB
import datetime
|
|
import logging
|
|
from typing import List, Dict, Any
|
|
|
|
|
|
class DataMapper:
    """Map raw MySQL rows to ClickHouse-compatible dictionaries.

    Columns are classified as "date" or "unsigned integer" columns, either
    through the common lists given to the constructor or through the
    table-specific lists given to :meth:`map_row`. Values in those columns
    are normalised; every other value is passed through unchanged.
    """

    # Bounds enforced by _convert_datetime. Values outside this window are
    # clamped to the nearest bound.
    # NOTE(review): presumably chosen to fit the target DateTime column
    # range -- confirm against the destination schema.
    _MIN_DATE = datetime.datetime(1970, 1, 1)
    _MAX_DATE = datetime.datetime(2105, 12, 31, 23, 59, 59)

    # String layouts tried, in order, when a date value arrives as text.
    _DATE_FORMATS = ('%Y-%m-%d %H:%M:%S', '%Y-%m-%d', '%Y%m%d %H%M%S')

    def __init__(self, date_columns: List[str] = None, uint64_columns: List[str] = None):
        """Initialise the mapper.

        :param date_columns: date column names common to all tables
        :param uint64_columns: unsigned-integer column names common to all tables
        """
        # Copy the inputs so later mutation of the caller's lists cannot
        # silently change this mapper's configuration.
        self.base_date_columns = list(date_columns) if date_columns else []
        self.base_uint64_columns = list(uint64_columns) if uint64_columns else []
        self.logger = logging.getLogger(__name__)

    def map_row(self,
                columns: List[str],
                row: tuple,
                date_columns: List[str] = None,
                uint_columns: List[str] = None) -> Dict[str, Any]:
        """Map a single row to a ClickHouse-compatible dictionary.

        ``columns`` and ``row`` are paired positionally with ``zip``, so any
        surplus entries on the longer side are ignored.

        :param columns: column names
        :param row: raw MySQL data row
        :param date_columns: table-specific date columns
        :param uint_columns: table-specific unsigned-integer columns
        :return: dictionary of column name -> converted value; when a
            conversion raises, a warning is logged and the raw value is kept
        """
        # Merge common and table-specific columns once per row; sets give
        # constant-time membership tests inside the loop.
        all_date_columns = set(self._merge_columns(self.base_date_columns, date_columns))
        all_uint_columns = set(self._merge_columns(self.base_uint64_columns, uint_columns))

        mapped: Dict[str, Any] = {}
        for col_name, value in zip(columns, row):
            try:
                if value is None:
                    # NULL handling: the replacement depends on the column kind.
                    mapped[col_name] = self._handle_null(
                        col_name, all_uint_columns, all_date_columns)
                elif col_name in all_date_columns:
                    mapped[col_name] = self._convert_datetime(value)
                elif col_name in all_uint_columns:
                    mapped[col_name] = self._convert_uint(value)
                else:
                    # Default: pass the value through untouched.
                    mapped[col_name] = value
            except Exception as e:
                # Best effort: one bad field must not abort the whole row.
                self.logger.warning(f"字段 {col_name} 转换异常: {str(e)}")
                mapped[col_name] = value  # keep the raw value

        return mapped

    def _merge_columns(self, base: List[str], specific: List[str] = None) -> List[str]:
        """Merge common and table-specific column names.

        Returns a new list without duplicates, keeping first-seen order
        (``base`` entries first, then ``specific`` entries).
        """
        combined = [*base, *(specific or ())]
        # dict.fromkeys de-duplicates while preserving insertion order.
        return list(dict.fromkeys(combined))

    def _convert_datetime(self, value) -> datetime.datetime:
        """Convert a date/time value, with range validation.

        :param value: a ``datetime.datetime``, a ``datetime.date`` or a
            string in one of ``_DATE_FORMATS``
        :return: a naive ``datetime`` without microseconds. Unparseable
            strings yield ``_MIN_DATE``; values outside
            ``[_MIN_DATE, _MAX_DATE]`` are clamped to the nearest bound.
        :raises TypeError: if ``value`` is of an unsupported type
        """
        try:
            converted = self._parse_datetime(value)
        except ValueError:
            self.logger.warning(f"日期转换异常: {value}, 使用默认值")
            return self._MIN_DATE  # fall back to the smallest valid date

        # Range validation: clamp to the nearest bound.
        if converted < self._MIN_DATE:
            self.logger.warning(f"日期 {converted} 超出范围,使用默认值")
            return self._MIN_DATE
        if converted > self._MAX_DATE:
            self.logger.warning(f"日期 {converted} 超出范围,使用默认值")
            return self._MAX_DATE

        return converted.replace(microsecond=0)  # drop microseconds

    def _parse_datetime(self, value) -> datetime.datetime:
        """Turn a supported input into a naive ``datetime`` (no range check).

        :raises ValueError: if a string matches none of ``_DATE_FORMATS``
        :raises TypeError: if ``value`` is not a datetime, date or string
        """
        if isinstance(value, datetime.datetime):
            # Drop the tzinfo; the wall-clock fields are left unchanged.
            return value.replace(tzinfo=None)

        if isinstance(value, datetime.date):
            # Plain dates are promoted to midnight of the same day.
            return datetime.datetime(value.year, value.month, value.day)

        if isinstance(value, str):
            # Try each known layout in turn.
            for fmt in self._DATE_FORMATS:
                try:
                    return datetime.datetime.strptime(value, fmt)
                except ValueError:
                    continue
            raise ValueError(f"无法解析的日期格式: {value}")

        raise TypeError(f"无法转换为日期: {type(value)} {value}")

    def _convert_uint(self, value) -> int:
        """Convert a value to an integer for an unsigned-integer column.

        Integers are returned unchanged (negative values are not rejected
        here), floats are truncated with ``int()``, and strings consisting
        only of decimal digits (surrounding whitespace allowed) are parsed.

        :raises TypeError: if the value cannot be converted
        """
        if isinstance(value, int):
            return value
        if isinstance(value, str):
            text = value.strip()
            # isdecimal() accepts exactly the digit characters int() can
            # parse, unlike isdigit(), which also accepts e.g. superscripts.
            if text.isdecimal():
                return int(text)
        if isinstance(value, float):
            return int(value)
        raise TypeError(f"无法转换为整型: {type(value)} {value}")

    def _handle_null(self, col_name: str, uint_columns: List[str],
                     date_columns: List[str] = None) -> Any:
        """Return the replacement for a NULL value in ``col_name``.

        :param col_name: name of the column holding the NULL
        :param uint_columns: unsigned-integer column names
        :param date_columns: optional date column names; when omitted only
            the name heuristic below identifies date columns
        :return: ``0`` for unsigned-integer columns, ``1970-01-01 00:00:00``
            for date columns, and an empty string otherwise
        """
        if col_name in uint_columns:
            return 0  # integer columns default to 0

        lowered = col_name.lower()
        is_date_column = (
            (date_columns is not None and col_name in date_columns)
            # Name heuristic for columns not explicitly configured as dates.
            or lowered.endswith('time')
            or lowered.startswith('date')
        )
        if is_date_column:
            return datetime.datetime(1970, 1, 1)  # date columns default to the epoch

        return ''  # every other column defaults to an empty string