1. Use Case
Recently a developer at work raised a requirement: sync some table data from a MySQL database on the internal network to a MySQL database in the cloud, applying a few filter conditions during the sync. So I wrote a Python script to do the synchronization.
2. Script Approach
- Read the data from the source tables
- Hold the fetched rows in memory and split them into batches
- Insert the batches one by one into the new tables in the target database
- Write a log entry for every step to a log file
- Send a notification to a WeCom (企业微信) group bot when the sync finishes
The batch size is configurable, and so is the per-table query condition. Tens of millions of rows are read in a single pass, which is expected to use less than 16 GB of memory; reading everything up front is done to prevent duplicate rows from being written. A minimal sketch of this read, split and insert idea is shown below; the full script follows in the next section.
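The core loop is simple: read the filtered rows from the source once, split them into fixed-size batches, and write each batch with executemany. Here is a minimal sketch of that idea; the hosts, credentials, table and column names and the filter condition are placeholders, and the full script in the next section is the actual implementation.

import pymysql

BATCH_SIZE = 100000

def chunk(rows, size):
    # Split the fetched rows into fixed-size batches
    return [rows[i:i + size] for i in range(0, len(rows), size)]

# Placeholder connection settings
src = pymysql.connect(host='src-host', user='user', password='pass', db='src_db')
dst = pymysql.connect(host='dst-host', user='user', password='pass', db='dst_db')

with src.cursor() as cur:
    # Filtered read from the source table (placeholder condition)
    cur.execute("SELECT id, name FROM tableA WHERE name = 'a'")
    rows = cur.fetchall()

with dst.cursor() as cur:
    for batch in chunk(rows, BATCH_SIZE):
        # Insert one batch at a time and commit per batch to keep transactions small
        cur.executemany("INSERT INTO tableA (id, name) VALUES (%s, %s)", batch)
        dst.commit()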
3. Script Contents
from datetime import datetime
import requests
import json
import pymysql
import time
import threading
import logging
import os
# Number of rows processed in each batch
batch_size = 100000
# Tables to sync incrementally; each entry is [source_table, target_table]
Incremental_table_list = [
["tableA", "tableA"],
["tableB", "tableB"],
]
# Tables to sync in full; each entry is [source_table, target_table]
Full_table_list = [
["tableC", "tableC"],
["tableD", "tableD"],
]
yewu1_db_config = {
'host': 'xx.xx.xx.xx',
'port': 3306,
'user': 'tom',
'password': '123456',
'db': 'test1',
}
yewu2_db_config = {
'host': 'xx.xx.xx.xx',
'port': 3306,
'user': 'ali',
'password': '123456',
'db': 'test2',
}
# Directory where this script is located
script_dir = os.path.dirname(os.path.abspath(__file__))
# Build the full path of the log file
log_file_path = os.path.join(script_dir, 'sync_data.log')
# Configure the logging module
logging.basicConfig(
    level=logging.INFO,  # Log level; change to DEBUG, ERROR, etc. as needed
    format='%(asctime)s - %(levelname)s - %(message)s',  # Log format
    filename=log_file_path,  # Log file name
    filemode='a',  # Append mode
)
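# With this format a line in sync_data.log looks like:
# 2024-05-01 12:00:00,123 - INFO - All sync operations have finished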
def split_list(lst):
return [lst[i:i + batch_size] for i in range(0, len(lst), batch_size)]
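# Example: with batch_size = 100000, a result of 250,000 rows is split into
# three batches of 100,000, 100,000 and 50,000 rows.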
def send_wx_work_message(message):
url = f"https://qyapi.weixin.qq.com/cgi-bin/webhook/send?key=xxxx"
headers = {"Content-Type": "application/json"}
data = {
"msgtype": "text",
"text": {
"content": message
}
}
response = requests.post(url, headers=headers, data=json.dumps(data))
if response.status_code == 200:
logging.info(f"{message} sent successfully")
else:
logging.error(f"{message} failed")
def count_batches(src_conn, table_name):
with src_conn.cursor() as cursor:
        # Query the total number of records first
        cursor.execute(f"SELECT COUNT(*) FROM {table_name}")
        total_records = cursor.fetchone()[0]
        # Work out how many batches are needed to fetch every record
        total_batches = -(-total_records // batch_size)  # ceiling division
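        # Example: 250,000 records with batch_size = 100000 -> -(-250000 // 100000) = 3 batches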
return total_batches, total_records
def count_batches_incremental(src_conn, table_name):
    with src_conn.cursor() as cursor:
        if table_name == 'table2':
            query_01 = f"SELECT count(*) FROM {table_name} where 1=1 "
        else:
            # Fall back to a plain count for tables without a special condition
            query_01 = f"SELECT count(*) FROM {table_name}"
        # Query the total number of records first
        cursor.execute(query_01)
        total_records = cursor.fetchone()[0]
        # Work out how many batches are needed to fetch every record
        total_batches = -(-total_records // batch_size)  # ceiling division
        return total_batches, total_records
def sync_query(table_name_01):
    if table_name_01 == 'table1':
        sync_select_query = f"SELECT * FROM {table_name_01} where name = 'a' "
    else:
        # Tables without a special filter fall back to a full select
        sync_select_query = f"SELECT * FROM {table_name_01}"
    return sync_select_query
def sync_query_all(table_name_01):
sync_select_query_all = f"SELECT * FROM {table_name_01}"
return sync_select_query_all
def get_view_data_in_batches(src_conn, table_name):
    try:
        with src_conn.cursor() as cursor:
            query = sync_query(table_name)
            cursor.execute(query)
            rows = cursor.fetchall()
            column_names = [desc[0] for desc in cursor.description]
            if not rows:
                logging.info("Table %s has no new data", table_name)
                return [], column_names
            rows_list = split_list(rows)
            return rows_list, column_names
    except Exception as e:
        logging.error("Error while fetching data from source view %s: %s", table_name, str(e))
        raise
def get_view_data_in_batches_all(src_conn, table_name):
    try:
        with src_conn.cursor() as cursor:
            query = sync_query_all(table_name)
            cursor.execute(query)
            rows = cursor.fetchall()
            column_names = [desc[0] for desc in cursor.description]
            if not rows:
                logging.info("Table %s has no data", table_name)
                return [], column_names
            rows_list = split_list(rows)
            return rows_list, column_names
    except Exception as e:
        logging.error("Error while fetching data from source view %s: %s", table_name, str(e))
        raise
def clear_table(dst_conn, dst_table_name, dst_db_name_01):
    # Only truncate when the target database matches the expected name
    if dst_db_name_01 == 'db1':
        try:
            with dst_conn.cursor() as cursor:
                truncate_query = f"TRUNCATE TABLE {dst_table_name}"
                cursor.execute(truncate_query)
                dst_conn.commit()
                logging.info("Target table truncated: %s", dst_table_name)
        except Exception as e:
            logging.error("Error while truncating the target table: %s", str(e))
            dst_conn.rollback()
            raise
def batch_insert_into_table(dst_conn, rows, column_names, dst_table_name):
    columns_str = ', '.join(column_names)
    placeholders = ', '.join(['%s'] * len(column_names))
    insert_query = f"INSERT INTO {dst_table_name} ({columns_str}) VALUES ({placeholders})"
    start_time1 = time.time()
    try:
        with dst_conn.cursor() as cursor:
            cursor.executemany(insert_query, rows)  # insert the whole batch in one call
            total_rows_processed = len(rows)  # assume every row in the batch was written
            dst_conn.commit()
            current_time = time.time()
            elapsed_time = current_time - start_time1
            logging.info(f"Processed {total_rows_processed} rows in {elapsed_time:.2f} s, table: {dst_table_name}")
    except Exception as e:
        logging.error(f"Error while inserting data into the target table: {str(e)}")
        dst_conn.rollback()
        raise
def sync_date(src_db_config, dst_db_config, src_view_name, dst_table_name):
    dst_db = dst_db_config['db']
    try:
        start_time = time.time()
        # Connect to the source database
        with pymysql.connect(**src_db_config) as src_conn:
            # Connect to the target database
            with pymysql.connect(**dst_db_config) as dst_conn:
                total_batches_counted = count_batches_incremental(src_conn, src_view_name)[0]
                logging.info(f"Incremental sync expects {total_batches_counted} batches, table: {src_view_name}")
                # Truncate the target table
                clear_table(dst_conn, dst_table_name, dst_db)
                logging.info(f"Incremental sync: target table cleared, re-importing, table: {dst_table_name}")
                try:
                    rows_list, column_names = get_view_data_in_batches(src_conn, src_view_name)
                    # Fetch the view data in batches and insert it into the target table
                    for idx, rows in enumerate(rows_list, start=1):
                        try:
                            logging.info("Incremental sync: processing batch %d", idx)
                            batch_insert_into_table(dst_conn, rows, column_names, dst_table_name)
                        except Exception as e:
                            logging.error(
                                f"Error in incremental batch {idx}: {str(e)}, skipping it and moving on, table: {dst_table_name}")
                            send_wx_work_message('Incremental data insert failed')
                except Exception as e:
                    logging.error(f"Error while fetching incremental data: {str(e)}, table: {src_view_name}")
                end_time = time.time()
                total_elapsed_time = end_time - start_time
                logging.info(f"Incremental sync finished! Total time: {total_elapsed_time:.2f} s, table: {dst_table_name}")
    except Exception as e:
        logging.error(f"Error during the whole incremental sync: {str(e)}, table: {dst_table_name}")
def sync_date_all(src_db_config, dst_db_config, src_view_name, dst_table_name):
    dst_db = dst_db_config['db']
    try:
        start_time = time.time()
        # Connect to the source database
        with pymysql.connect(**src_db_config) as src_conn:
            # Connect to the target database
            with pymysql.connect(**dst_db_config) as dst_conn:
                total_batches_counted = count_batches(src_conn, src_view_name)[0]
                logging.info(f"Full sync expects {total_batches_counted} batches, table: {src_view_name}")
                # Truncate the target table
                clear_table(dst_conn, dst_table_name, dst_db)
                logging.info(f"Full sync: target table cleared, re-importing, table: {dst_table_name}")
                rows_list, column_names = get_view_data_in_batches_all(src_conn, src_view_name)
                # Fetch the view data in batches and insert it into the target table
                for idx, rows in enumerate(rows_list, start=1):
                    try:
                        logging.info("Full sync: processing batch %d", idx)
                        batch_insert_into_table(dst_conn, rows, column_names, dst_table_name)
                    except Exception as e:
                        logging.error(
                            f"Error in full-sync batch {idx}: {str(e)}, skipping it and moving on, table: {dst_table_name}")
                        send_wx_work_message('Full data insert failed')
                end_time = time.time()
                total_elapsed_time = end_time - start_time
                logging.info(f"Full sync finished! Total time: {total_elapsed_time:.2f} s, table: {dst_table_name}")
    except Exception as e:
        logging.error(f"Error during the whole full sync: {str(e)}, table: {dst_table_name}")
def sync_and_print(sync_func, *args):
sync_func(*args)
def sync_date_dev():
    start_time = time.time()
    # List of worker threads
    threads = []
    # Create the threads
    # Incremental sync
    for a, b in Incremental_table_list:
        threads.append(threading.Thread(target=sync_and_print, args=(
            sync_date, yewu1_db_config, yewu2_db_config, a, b)))
    # Full sync
    for c, d in Full_table_list:
        threads.append(threading.Thread(target=sync_and_print, args=(
            sync_date_all, yewu1_db_config, yewu2_db_config, c, d)))
    # Start the threads
    for thread in threads:
        thread.start()
    # Wait for all threads to finish
    for thread in threads:
        thread.join()
    logging.info("All sync operations have finished")
    end_time = time.time()
    total_elapsed_time = end_time - start_time
    logging.info("dev environment sync finished! Total time: {:.2f} s".format(total_elapsed_time))
if __name__ == "__main__":
    # Get the current time
    current_time = datetime.now()
    # Format the time as a string
    formatted_time = current_time.strftime("%Y-%m-%d %H:%M:%S")
    logging.info('Syncing the dev environment')
    sync_date_dev()
    logging.info("Overall sync finished")
    send_wx_work_message('Sync finished, current time: ' + formatted_time)
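To run the script, replace the placeholder database hosts, credentials and the WeCom webhook key with real values, and install the two third-party dependencies first (pip install pymysql requests). It can then be started directly with python3 (the file name sync_data.py is only an assumption here) or scheduled with cron if the sync needs to run periodically.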