Database-to-Database Data Sync Script (Python)

I. Use Case

Recently, a developer at work asked to sync some table data from an internal MySQL database to a MySQL database in the cloud, with extra filter conditions applied during the sync, so I wrote a Python script to do it.

II. Script Approach

  1. Read the data from the source table
  2. Hold the rows in memory and split them into batches
  3. Insert the batches into the new table in the new database, one batch at a time
  4. Write a record of every step to a log file
  5. Send a notification to a WeCom (企业微信) bot when the sync finishes

The batch size is configurable.

The query filter for each table is configurable.

Tens of millions of rows are read in a single pass, which is expected to use less than 16 GB of memory; loading everything up front is what guards against writing duplicate rows.
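
If that much memory is not available, the rows can be streamed instead of loaded with fetchall(). A minimal sketch of that variant, assuming the same yewu1_db_config and query strings as the script below (an alternative, not what this script does):

import pymysql
from pymysql.cursors import SSCursor

def stream_in_batches(db_config, query, batch_size=100000):
    # SSCursor keeps the result set on the MySQL server and streams rows,
    # so memory use stays bounded no matter how large the table is
    with pymysql.connect(cursorclass=SSCursor, **db_config) as conn:
        with conn.cursor() as cursor:
            cursor.execute(query)
            while True:
                rows = cursor.fetchmany(batch_size)
                if not rows:
                    break
                yield rows

The trade-off is that the source connection must stay open until the whole result set has been read.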

III. Script Contents

from datetime import datetime
import requests
import json
import pymysql
import time
import threading
import logging
import os


# Number of rows in each batch
batch_size = 100000

# Tables to sync incrementally, as [source_table, destination_table] pairs
Incremental_table_list = [
    ["tableA", "tableA"],
    ["tableB", "tableB"],
]

# Tables to sync in full, as [source_table, destination_table] pairs
Full_table_list = [
    ["tableC", "tableC"],
    ["tableD", "tableD"],
]

yewu1_db_config = {
    'host': 'xx.xx.xx.xx',
    'port': 3306,
    'user': 'tom',
    'password': '123456',
    'db': 'test1',
}

yewu2_db_config = {
    'host': 'xx.xx.xx.xx',
    'port': 3306,
    'user': 'ali',
    'password': '123456',
    'db': 'test2',
}

# Directory this script lives in
script_dir = os.path.dirname(os.path.abspath(__file__))
# Build the full log-file path
log_file_path = os.path.join(script_dir, 'sync_data.log')

# Configure the logging module
logging.basicConfig(
    level=logging.INFO,  # Log level; change to DEBUG, ERROR, etc. as needed
    format='%(asctime)s - %(levelname)s - %(message)s',  # Log line format
    filename=log_file_path,  # Log file name
    filemode='a',  # 'a' appends to the file
)


def split_list(lst):
    # Slice a list into consecutive chunks of batch_size rows each
    return [lst[i:i + batch_size] for i in range(0, len(lst), batch_size)]
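
# Example: with batch_size = 3, split_list(list(range(7))) would return
# [[0, 1, 2], [3, 4, 5], [6]] -- the last chunk simply holds the remainder.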


def send_wx_work_message(message):
    url = f"https://qyapi.weixin.qq.com/cgi-bin/webhook/send?key=xxxx"
    headers = {"Content-Type": "application/json"}
    data = {
        "msgtype": "text",
        "text": {
            "content": message
        }
    }
    response = requests.post(url, headers=headers, data=json.dumps(data))
    if response.status_code == 200:
        logging.info(f"{message} sent successfully")
    else:
        logging.error(f"{message} failed")

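# Note: the WeCom webhook reports the real result via "errcode" in its JSON
# body (0 means success) and can answer HTTP 200 even for rejected requests,
# so a stricter success check would be (sketch):
#     body = response.json()
#     ok = response.status_code == 200 and body.get("errcode") == 0
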

def count_batches(src_conn, table_name):
    with src_conn.cursor() as cursor:
        # First, query the total record count
        cursor.execute(f"SELECT COUNT(*) FROM {table_name}")
        total_records = cursor.fetchone()[0]

    # How many batches are needed to fetch all the records
    total_batches = -(-total_records // batch_size)  # Ceiling division (round up)

    return total_batches, total_records
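
# Worked example: 2,345,678 records with batch_size = 100000 give
# -(-2345678 // 100000) == 24 batches (23 full batches plus one of 45,678 rows).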

def count_batches_incremental(src_conn, table_name):
    with src_conn.cursor() as cursor:
        # Per-table incremental count queries; "where 1=1" stands in for the
        # real filter. Fall back to a plain count so that tables without a
        # dedicated branch no longer raise a NameError on query_01.
        if table_name == 'table2':
            query_01 = f"SELECT count(*) FROM {table_name} where 1=1 "
        else:
            query_01 = f"SELECT count(*) FROM {table_name}"
        # First, query the total record count
        cursor.execute(query_01)
        total_records = cursor.fetchone()[0]

    # How many batches are needed to fetch all the records
    total_batches = -(-total_records // batch_size)  # Ceiling division (round up)

    return total_batches, total_records


def sync_query(table_name_01):
    # Per-table incremental filters; the condition below is an example.
    # Fall back to selecting everything so unknown tables don't raise NameError.
    if table_name_01 == 'table1':
        sync_select_query = f"SELECT * FROM {table_name_01} where name = 'a' "
    else:
        sync_select_query = f"SELECT * FROM {table_name_01}"
    return sync_select_query


def sync_query_all(table_name_01):
    # Unfiltered query used by full syncs
    sync_select_query_all = f"SELECT * FROM {table_name_01}"
    return sync_select_query_all


def get_view_data_in_batches(src_conn, table_name):
    try:
        with src_conn.cursor() as cursor:
            query = sync_query(table_name)
            cursor.execute(query)
            rows = cursor.fetchall()
            if not rows:
                logging.info("Table %s has no new data", table_name)
                return [], []
            rows_list = split_list(rows)
            column_names = [desc[0] for desc in cursor.description]
            return rows_list, column_names
    except Exception as e:
        logging.error("Error fetching data from source view %s: %s", table_name, str(e))
        raise


def get_view_data_in_batches_all(src_conn, table_name):
    try:
        with src_conn.cursor() as cursor:
            query = sync_query_all(table_name)
            cursor.execute(query)
            rows = cursor.fetchall()
            if not rows:
                logging.info("Table %s is empty", table_name)
                return [], []
            rows_list = split_list(rows)
            column_names = [desc[0] for desc in cursor.description]
            return rows_list, column_names
    except Exception as e:
        logging.error("Error fetching data from source view %s: %s", table_name, str(e))
        raise


def clear_table(dst_conn, dst_table_name, dst_db_name_01):
    # Safety guard: only truncate tables in the expected destination database.
    # 'test2' matches yewu2_db_config above; adjust it if the config changes.
    if dst_db_name_01 == 'test2':
        try:
            with dst_conn.cursor() as cursor:
                truncate_query = f"TRUNCATE TABLE {dst_table_name}"
                cursor.execute(truncate_query)
                dst_conn.commit()
                logging.info("Cleared destination table: %s", dst_table_name)
        except Exception as e:
            logging.error("Error clearing the destination table: %s", str(e))
            dst_conn.rollback()
            raise


def batch_insert_into_table(dst_conn, rows, column_names, dst_table_name):
    columns_str = ', '.join(column_names)
    placeholders = ', '.join(['%s'] * len(column_names))
    insert_query = f"INSERT INTO {dst_table_name} ({columns_str}) VALUES ({placeholders})"

    start_time1 = time.time()

    try:
        with dst_conn.cursor() as cursor:
            cursor.executemany(insert_query, rows)  # Insert the whole batch at once
            total_rows_processed = len(rows)  # Assumes every row in the batch succeeded

        dst_conn.commit()

        current_time = time.time()
        elapsed_time = current_time - start_time1
        logging.info(f"已处理 {total_rows_processed} 行, 已耗时: {elapsed_time:.2f} 秒, 当前表: {dst_table_name}")

    except Exception as e:
        logging.error(f"将数据插入目标表时出现错误: {str(e)}")
        dst_conn.rollback()
        raise
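
# Design note: pymysql's executemany() recognizes INSERT ... VALUES statements
# and packs many rows into each actual INSERT it sends (up to an internal
# statement-size limit), so one call per 100,000-row batch is far cheaper
# than issuing 100,000 single-row INSERTs.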


def sync_date(src_db_config, dst_db_config, src_view_name, dst_table_name):
    dst_db = dst_db_config['db']
    try:
        start_time = time.time()
        # Connect to the source database
        with pymysql.connect(**src_db_config) as src_conn:
            # Connect to the destination database
            with pymysql.connect(**dst_db_config) as dst_conn:

                total_batches_counted = count_batches_incremental(src_conn, src_view_name)[0]
                logging.info(f"Incremental sync expects {total_batches_counted} batches, table: {src_view_name}")
                # Clear the destination table before re-importing
                clear_table(dst_conn, dst_table_name, dst_db)
                logging.info(f"Incremental sync: destination table cleared, re-importing, table: {dst_table_name}")
                try:
                    rows_list, column_names = get_view_data_in_batches(src_conn, src_view_name)
                    # Insert the batches into the destination table one by one
                    for idx, rows in enumerate(rows_list, start=1):
                        try:
                            logging.info("Incremental sync: processing batch %d", idx)
                            batch_insert_into_table(dst_conn, rows, column_names, dst_table_name)
                        except Exception as e:
                            logging.error(
                                f"Error in incremental batch {idx}: {str(e)}, skipping it and continuing, table: {dst_table_name}")
                            send_wx_work_message('Incremental data insert failed')
                except Exception as e:
                    # Log fetch errors instead of silently swallowing them
                    logging.error(f"Error fetching incremental data: {str(e)}, table: {src_view_name}")
        end_time = time.time()
        total_elapsed_time = end_time - start_time
        logging.info(f"Incremental sync finished! Total time: {total_elapsed_time:.2f} s, table: {dst_table_name}")

    except Exception as e:
        logging.error(f"Error during the incremental sync: {str(e)}, table: {dst_table_name}")


def sync_date_all(src_db_config, dst_db_config, src_view_name, dst_table_name):
    dst_db = dst_db_config['db']
    try:
        start_time = time.time()
        # Connect to the source database
        with pymysql.connect(**src_db_config) as src_conn:
            # Connect to the destination database
            with pymysql.connect(**dst_db_config) as dst_conn:
                total_batches_counted = count_batches(src_conn, src_view_name)[0]
                logging.info(f"Full sync expects {total_batches_counted} batches, table: {src_view_name}")
                # Clear the destination table before re-importing
                clear_table(dst_conn, dst_table_name, dst_db)
                logging.info(f"Full sync: destination table cleared, re-importing, table: {dst_table_name}")
                rows_list, column_names = get_view_data_in_batches_all(src_conn, src_view_name)
                # Insert the batches into the destination table one by one

                for idx, rows in enumerate(rows_list, start=1):
                    try:
                        logging.info("Full sync: processing batch %d", idx)
                        batch_insert_into_table(dst_conn, rows, column_names, dst_table_name)
                    except Exception as e:
                        logging.error(
                            f"Error in full-sync batch {idx}: {str(e)}, skipping it and continuing, table: {dst_table_name}")
                        send_wx_work_message('Full data insert failed')

        end_time = time.time()
        total_elapsed_time = end_time - start_time
        logging.info(f"Full sync finished! Total time: {total_elapsed_time:.2f} s, table: {dst_table_name}")
    except Exception as e:
        logging.error(f"Error during the full sync: {str(e)}, table: {dst_table_name}")


def sync_and_print(sync_func, *args):
    # Thin wrapper so each thread runs one sync function with its arguments
    sync_func(*args)


def sync_date_dev():
    start_time = time.time()

    # Thread list
    threads = []

    # Create the worker threads

    # Incremental-sync tables: a is the source table, b the destination table
    for a, b in Incremental_table_list:
        threads.append(threading.Thread(target=sync_and_print, args=(
            sync_date, yewu1_db_config, yewu2_db_config, a, b)))

    # Full-sync tables
    for c, d in Full_table_list:
        threads.append(threading.Thread(target=sync_and_print, args=(
            sync_date_all, yewu1_db_config, yewu2_db_config, c, d)))

    # Start the threads
    for thread in threads:
        thread.start()

    # Wait for every thread to finish
    for thread in threads:
        thread.join()
    logging.info("All sync operations finished")
    end_time = time.time()
    total_elapsed_time = end_time - start_time
    logging.info("dev environment sync finished! Total time: {:.2f} s".format(total_elapsed_time))


if __name__ == "__main__":
    # Current time, formatted for the notification message
    current_time = datetime.now()
    formatted_time = current_time.strftime("%Y-%m-%d %H:%M:%S")
    logging.info('Syncing the dev environment')
    sync_date_dev()
    logging.info("Overall sync finished")
    send_wx_work_message('Sync finished. Current time: ' + formatted_time)
