"""MySQL → ClickHouse table synchronization service."""
import logging
import time
from datetime import datetime

import psutil
import pymysql
from pymysql import OperationalError, InterfaceError
from tqdm import tqdm

logger = logging.getLogger(__name__)


class SyncService:
    """Synchronize a MySQL table into ClickHouse.

    Chooses between a periodic full sync and an id-based incremental sync,
    streaming rows through a server-side MySQL cursor and inserting them into
    ClickHouse in batches.
    """

    def __init__(self, mysql_conn, ch_conn, mapper):
        # mysql_conn / ch_conn are project connection factories exposing
        # .connect(); mapper converts a MySQL row tuple into a ClickHouse row
        # via map_row(columns, row).
        self.mysql_conn = mysql_conn
        self.ch_conn = ch_conn
        self.mapper = mapper
        # ClickHouse column names; (re)loaded at the start of each sync run.
        self.columns = None
        self.full_sync_interval = 24 * 3600  # seconds between full syncs
        self.last_full_sync = 0  # epoch of last full sync; 0 forces first run
        self.recent_check_count = 5000  # tail id-window re-checked each increment
        self.optimize_frequency = 100  # OPTIMIZE TABLE every N inserted batches

    def sync_data(self, table, batch_size):
        """Smart sync entry point: full sync if the interval elapsed, else incremental.

        NOTE: `table` is interpolated into SQL throughout this class; it must
        come from trusted configuration, never from user input.
        """
        current_time = time.time()
        self.columns = self._load_table_columns(table)
        if current_time - self.last_full_sync > self.full_sync_interval:
            self._full_sync(table, batch_size)
            self.last_full_sync = current_time
        else:
            self._incremental_sync(table, batch_size)

    def _full_sync(self, table, batch_size):
        """Full-table sync: stream every row ordered by id."""
        logger.info(f"🚀 开始全量同步{table}")
        with self.mysql_conn.connect() as mysql_conn:
            try:
                with mysql_conn.cursor() as count_cursor:
                    count_cursor.execute(f"SELECT COUNT(*) FROM {table}")
                    total = count_cursor.fetchone()[0]
                # SSCursor streams results server-side so a large table never
                # has to fit in client memory.
                with mysql_conn.cursor(pymysql.cursors.SSCursor) as cursor:
                    cursor.execute(f"SELECT * FROM {table} ORDER BY id")
                    self._sync_with_cursor(cursor, total, table, batch_size)
            except Exception as e:
                logger.error(f"全量同步失败: {str(e)}")
                raise
            finally:
                # Merge parts even after a failure so partial data stays compact.
                self._optimize_table(table)
        logger.info(f"✅ 全量同步{table}完成")

    def _incremental_sync(self, table, batch_size):
        """Incremental sync: copy rows whose id exceeds the ClickHouse maximum."""
        last_id = self._get_last_id_from_ch(table)
        logger.info(f"🔁 开始增量同步{table}起始ID: {last_id}")
        # Standard id-based incremental pass.
        with self.mysql_conn.connect() as mysql_conn:
            try:
                with mysql_conn.cursor() as count_cursor:
                    # id is passed as a bound parameter rather than formatted
                    # into the statement.
                    count_cursor.execute(
                        f"SELECT COUNT(*) FROM {table} WHERE id > %s",
                        (last_id,),
                    )
                    total = count_cursor.fetchone()[0]
                with mysql_conn.cursor(pymysql.cursors.SSCursor) as cursor:
                    cursor.execute(
                        f"SELECT * FROM {table} WHERE id > %s ORDER BY id",
                        (last_id,),
                    )
                    self._sync_with_cursor(cursor, total, table, batch_size)
            except Exception as e:
                logger.error(f"增量同步失败: {str(e)}")
                raise
        # Safety pass over the most recent id window.
        self._sync_recent_data(table, batch_size)
        logger.info(f"✅ 增量同步{table}完成最后ID: {self._get_last_id_from_ch(table)}")

    def _sync_recent_data(self, table, batch_size):
        """Safety pass: re-scan the newest id window to pick up late-arriving rows."""
        logger.info(f"🔄 开始安全同步{table}近期数据")
        mysql_max_id = self._get_mysql_max_id(table)
        ch_max_id = self._get_last_id_from_ch(table)
        # Start either recent_check_count below the MySQL head, or just past
        # what ClickHouse already holds — whichever is later.
        safe_start = max(mysql_max_id - self.recent_check_count, ch_max_id + 1)
        if safe_start > mysql_max_id:
            logger.info("⏩ 无需近期数据同步")
            return
        with self.mysql_conn.connect() as mysql_conn:
            try:
                with mysql_conn.cursor(pymysql.cursors.SSCursor) as cursor:
                    cursor.execute(
                        f"SELECT * FROM {table} "
                        f"WHERE id BETWEEN %s AND %s "
                        f"ORDER BY id",
                        (safe_start, mysql_max_id),
                    )
                    # Upper bound on the row count (ids may have gaps), used
                    # only to size the progress bar.
                    total = mysql_max_id - safe_start + 1
                    # Reuse the common batching loop instead of duplicating it.
                    self._sync_with_cursor(
                        cursor, total, table, batch_size, desc="近期数据"
                    )
            except Exception as e:
                logger.error(f"近期数据同步失败: {str(e)}")
                raise
        logger.info(f"🆗 近期数据同步范围ID: {safe_start}-{mysql_max_id}")

    def _sync_with_cursor(self, cursor, total, table, batch_size, desc="同步进度"):
        """Shared sync loop: stream rows from `cursor`, insert in batches.

        `desc` labels the progress bar (backward-compatible addition).
        """
        progress = tqdm(total=total, desc=desc, unit="rec")
        batch = []
        insert_count = 0
        last_success_time = time.time()
        try:
            while True:
                try:
                    # Ping every 5 minutes to keep long-running streams alive.
                    if time.time() - last_success_time > 300:
                        cursor.connection.ping(reconnect=True)
                        last_success_time = time.time()
                    row = cursor.fetchone()
                    if not row:
                        break
                    batch.append(self.mapper.map_row(self.columns, row))
                    if len(batch) >= batch_size:
                        self._insert_batch(batch, table)
                        insert_count += 1
                        progress.update(len(batch))
                        batch = []
                        last_success_time = time.time()
                        # Periodically compact the target table.
                        if insert_count % self.optimize_frequency == 0:
                            self._optimize_table(table)
                except (OperationalError, InterfaceError):
                    # NOTE(review): after ping(reconnect=True) the server-side
                    # cursor's result stream is presumably invalid; resuming
                    # fetchone() may fail or skip rows. Consider re-issuing the
                    # query from the last synced id instead — TODO confirm.
                    logger.warning("⚠️ 连接中断,尝试重新连接...")
                    cursor.connection.ping(reconnect=True)
                    time.sleep(5)
                    continue
            if batch:
                self._insert_batch(batch, table)
                progress.update(len(batch))
        finally:
            # Always release the bar, even when the loop raises.
            progress.close()

    def _insert_batch(self, batch, table, max_retries=3):
        """Safely insert one batch into ClickHouse.

        Pauses under memory pressure, retries transient failures with
        exponential backoff, and re-raises after `max_retries` attempts.

        Raises:
            Exception: the last insert error, once retries are exhausted.
        """
        mem = psutil.virtual_memory()
        if mem.percent > 90:
            logger.warning("🛑 内存使用超过90%暂停处理60秒")
            time.sleep(60)
        for attempt in range(1, max_retries + 1):
            try:
                self.ch_conn.connect().execute(
                    f'INSERT INTO {table} VALUES',
                    batch,
                    types_check=True,
                    settings={
                        'date_time_input_format': 'best_effort',
                        'allow_experimental_analyzer': 0,
                        'input_format_null_as_default': 1,
                        'max_partitions_per_insert_block': 1000,
                        'optimize_on_insert': 1
                    }
                )
                return
            except Exception as e:
                logger.error(f"❌ 批量插入失败: {str(e)}")
                if attempt == max_retries:
                    # Bug fix: the original swallowed the error and silently
                    # dropped the batch; propagate so callers see the failure.
                    raise
                time.sleep(2 ** attempt)  # exponential backoff between retries

    def _optimize_table(self, table):
        """Run OPTIMIZE TABLE ... FINAL; best effort, failures are only logged."""
        try:
            self.ch_conn.connect().execute(f"OPTIMIZE TABLE {table} FINAL")
            logger.debug(f"{table}优化完成")
        except Exception as e:
            logger.warning(f"表优化失败: {str(e)}")

    def _get_last_id_from_ch(self, table):
        """Return max(id) from the ClickHouse table, or 0 when empty or on error."""
        try:
            result = self.ch_conn.connect().execute(f"SELECT max(id) FROM {table}")
            return result[0][0] or 0
        except Exception as e:
            logger.error(f"获取最大ID失败: {str(e)}")
            return 0

    def _get_mysql_max_id(self, table):
        """Return MAX(id) from the MySQL table (0 for an empty table)."""
        with self.mysql_conn.connect() as conn:
            with conn.cursor() as cursor:
                cursor.execute(f"SELECT MAX(id) FROM {table}")
                return cursor.fetchone()[0] or 0

    def _load_table_columns(self, table):
        """Load the ClickHouse column names via DESCRIBE TABLE."""
        result = self.ch_conn.connect().execute(f"DESCRIBE TABLE {table}")
        return [row[0] for row in result]