You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

91 lines
3.3 KiB

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

from datetime import datetime
from utils.logger import configure_logger
# Module-level logger, created once at import time via utils.logger.configure_logger.
logger = configure_logger()
class DataMapper:
    """Map raw MySQL rows to dictionaries with normalized values.

    On construction the mapper reads the table's column metadata from
    ``INFORMATION_SCHEMA.COLUMNS`` and records which columns are
    date/time columns and which are unsigned integer columns.  When a row
    is mapped:

    * unsigned-integer columns are converted to ``int`` (``0`` on failure),
    * date/time columns are converted to ``datetime`` and clamped to
      ``[min_date, max_date]`` (``min_date`` on failure),
    * any other ``str`` value is stripped of surrounding whitespace,
    * everything else is passed through unchanged.

    NOTE(review): the clamp range 1970-01-01 .. 2105-12-31 23:59:59 is
    presumably the range supported by the downstream store -- confirm.
    """

    # Text layouts accepted for date/time values, tried in this order.
    _DATETIME_FORMATS = (
        '%Y-%m-%d %H:%M:%S',
        '%Y-%m-%d %H:%M:%S.%f',
        '%Y-%m-%d',
        '%Y%m%d%H%M%S',
        '%Y/%m/%d %H:%M:%S',
    )

    def __init__(self, mysql_conn, table):
        """Create a mapper for ``table`` and inspect its schema.

        Args:
            mysql_conn: Object whose ``connect()`` returns a connection
                providing ``cursor()`` usable as a context manager.
            table: Name of the table in the current database.
        """
        self.mysql_conn = mysql_conn
        self.table = table
        self.date_columns = []
        self.uint64_columns = []
        self._analyze_schema()
        self.min_date = datetime(1970, 1, 1)
        self.max_date = datetime(2105, 12, 31, 23, 59, 59)

    def _analyze_schema(self):
        """Inspect the table schema and auto-detect column types.

        Fills ``date_columns`` with datetime/timestamp/date columns and
        ``uint64_columns`` with unsigned integer columns.
        """
        # The table name is passed as a query parameter instead of being
        # interpolated into the SQL text, so names containing quotes can
        # neither break nor alter the statement.
        schema_query = """
            SELECT COLUMN_NAME, DATA_TYPE, COLUMN_TYPE
            FROM INFORMATION_SCHEMA.COLUMNS
            WHERE TABLE_SCHEMA = DATABASE()
              AND TABLE_NAME = %s
        """
        # NOTE(review): the connection returned by connect() is never
        # closed here; confirm whether the wrapper pools or owns it.
        with self.mysql_conn.connect().cursor() as cursor:
            cursor.execute(schema_query, (self.table,))
            for col_name, data_type, col_type in cursor.fetchall():
                # Date/time columns.
                if data_type in ('datetime', 'timestamp', 'date'):
                    self.date_columns.append(col_name)
                # Unsigned integer columns (e.g. BIGINT UNSIGNED).
                if 'unsigned' in col_type.lower() and 'int' in data_type:
                    self.uint64_columns.append(col_name)

    def map_row(self, columns, row):
        """Return ``{column: normalized value}`` for one row.

        Args:
            columns: Column names, in the same order as ``row``.
            row: Raw values of a single row.
        """
        return {col: self._map_value(col, val) for col, val in zip(columns, row)}

    def _map_value(self, col, value):
        """Normalize ``value`` according to the detected type of ``col``."""
        if col in self.uint64_columns:
            return self._handle_uint64(value)
        if col in self.date_columns:
            return self._handle_datetime(value)
        if isinstance(value, str):
            return value.strip()
        return value

    def _handle_uint64(self, value):
        """Convert ``value`` to ``int``; return ``0`` when empty or invalid.

        Integers and integer-looking strings are converted exactly.  Going
        through ``float`` first would silently corrupt values above 2**53,
        which are legal for BIGINT UNSIGNED.  Other numeric forms (for
        example ``"12.7"`` or ``"1e3"``) are truncated via ``float``.
        """
        try:
            if value in (None, '', 'NULL'):
                return 0
            if isinstance(value, int):
                return int(value)
            if isinstance(value, str):
                try:
                    return int(value)
                except ValueError:
                    pass  # not a plain integer literal; try float below
            return int(float(value))
        except (TypeError, ValueError, OverflowError):
            return 0

    def _handle_datetime(self, value):
        """Return the parsed, clamped datetime for ``value`` (never ``None``)."""
        return self._parse_datetime(value) or self.min_date

    def _parse_datetime(self, value):
        """Parse ``value`` into a datetime clamped to the supported range.

        Accepts ``datetime`` instances, the text layouts listed in
        ``_DATETIME_FORMATS``, and all-digit epoch timestamps in seconds
        (10 digits) or milliseconds (13 digits).  Empty-like or unparseable
        input yields ``min_date``.

        Epoch timestamps are converted with ``datetime.fromtimestamp`` and
        therefore interpreted in the local timezone.
        """
        if value in (None, 0, '0', '0.0', '0.00', '', 'null', 'NULL'):
            return self.min_date
        try:
            # Driver-supplied datetimes are used directly so microseconds
            # survive; str() would produce a layout no format matches.
            if isinstance(value, datetime):
                return self._clamp_datetime(value)

            text = str(value).strip()

            # Epoch timestamps must be recognized BEFORE strptime runs:
            # '%Y%m%d%H%M%S' accepts one-digit fields, so a timestamp such
            # as "1712011230" would otherwise be read as year 1712.
            if text.isdigit():
                ts = int(text)
                if 1e12 < ts < 1e13:  # millisecond timestamp
                    return self._clamp_datetime(datetime.fromtimestamp(ts / 1000))
                if 1e9 < ts < 1e10:  # second timestamp
                    return self._clamp_datetime(datetime.fromtimestamp(ts))

            for fmt in self._DATETIME_FORMATS:
                try:
                    parsed = datetime.strptime(text, fmt)
                except ValueError:
                    continue
                return self._clamp_datetime(parsed)
        except (TypeError, ValueError, OverflowError, OSError):
            # e.g. timezone-aware values that cannot be compared with the
            # naive bounds, or timestamps the platform cannot convert.
            return self.min_date
        return self.min_date

    def _clamp_datetime(self, dt):
        """Limit ``dt`` to the inclusive range ``[min_date, max_date]``."""
        if dt < self.min_date:
            return self.min_date
        if dt > self.max_date:
            return self.max_date
        return dt