1. Use Case
Recently a developer at work raised a requirement: sync some table data from a MySQL database on the internal network to a MySQL database in the cloud, applying a few filter conditions during the sync. So I wrote a Python script to do the synchronization.
2. Script Approach
- Read the data from the source tables
- Hold the fetched rows in memory and split them into batches
- Insert the batches one by one into the new tables in the target database
- Write a log entry for every step to a log file
- Send a notification to a WeCom (企业微信) group bot when the sync finishes
The batch size is configurable, and so is the per-table query condition. Tens of millions of rows are read in a single pass, which is expected to use less than 16 GB of memory; reading everything up front is done to prevent duplicate rows from being written. A minimal sketch of this read, split and insert idea is shown below; the full script follows in the next section.
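The core loop is simple: read the filtered rows from the source once, split them into fixed-size batches, and write each batch with executemany. Here is a minimal sketch of that idea; the hosts, credentials, table and column names and the filter condition are placeholders, and the full script in the next section is the actual implementation.

import pymysql

BATCH_SIZE = 100000

def chunk(rows, size):
    # Split the fetched rows into fixed-size batches
    return [rows[i:i + size] for i in range(0, len(rows), size)]

# Placeholder connection settings
src = pymysql.connect(host='src-host', user='user', password='pass', db='src_db')
dst = pymysql.connect(host='dst-host', user='user', password='pass', db='dst_db')

with src.cursor() as cur:
    # Filtered read from the source table (placeholder condition)
    cur.execute("SELECT id, name FROM tableA WHERE name = 'a'")
    rows = cur.fetchall()

with dst.cursor() as cur:
    for batch in chunk(rows, BATCH_SIZE):
        # Insert one batch at a time and commit per batch to keep transactions small
        cur.executemany("INSERT INTO tableA (id, name) VALUES (%s, %s)", batch)
        dst.commit()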
3. Script Contents
from datetime import datetime
import requests
import json
import pymysql
import time
import threading
import logging
import os
# Number of rows processed in each batch
batch_size = 100000
# Tables to sync incrementally; each entry is [source_table, target_table]
Incremental_table_list = [
["tableA", "tableA"],
["tableB", "tableB"],
]
# Tables to sync in full; each entry is [source_table, target_table]
Full_table_list = [
["tableC", "tableC"],
["tableD", "tableD"],
]
yewu1_db_config = {
'host': 'xx.xx.xx.xx',
'port': 3306,
'user': 'tom',
'password': '123456',
'db': 'test1',
}
yewu2_db_config = {
'host': 'xx.xx.xx.xx',
'port': 3306,
'user': 'ali',
'password': '123456',
'db': 'test2',
}
# Directory where this script is located
script_dir = os.path.dirname(os.path.abspath(__file__))
# Build the full path of the log file
log_file_path = os.path.join(script_dir, 'sync_data.log')
# Configure the logging module
logging.basicConfig(
    level=logging.INFO,  # Log level; change to DEBUG, ERROR, etc. as needed
    format='%(asctime)s - %(levelname)s - %(message)s',  # Log format
    filename=log_file_path,  # Log file name
    filemode='a',  # Append mode
)
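# With this format a line in sync_data.log looks like:
# 2024-05-01 12:00:00,123 - INFO - All sync operations have finished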
def split_list(lst):
return [lst[i:i + batch_size] for i in range(0, len(lst), batch_size)]
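# Example: with batch_size = 100000, a result of 250,000 rows is split into
# three batches of 100,000, 100,000 and 50,000 rows.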
def send_wx_work_message(message):
url = f"https://qyapi.weixin.qq.com/cgi-bin/webhook/send?key=xxxx"
headers = {"Content-Type": "application/json"}
data = {
"msgtype": "text",
"text": {
"content": message
}
}
response = requests.post(url, headers=headers, data=json.dumps(data))
if response.status_code == 200:
logging.info(f"{message} sent successfully")
else:
logging.error(f"{message} failed")
def count_batches(src_conn, table_name):
with src_conn.cursor() as cursor:
        # Query the total number of records first
        cursor.execute(f"SELECT COUNT(*) FROM {table_name}")
        total_records = cursor.fetchone()[0]
        # Work out how many batches are needed to fetch every record
        total_batches = -(-total_records // batch_size)  # ceiling division
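        # Example: 250,000 records with batch_size = 100000 -> -(-250000 // 100000) = 3 batches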
return total_batches, total_records
def count_batches_incremental(src_conn, table_name):
    with src_conn.cursor() as cursor:
        if table_name == 'table2':
            query_01 = f"SELECT count(*) FROM {table_name} where 1=1 "
        else:
            # Fall back to a plain count for tables without a special condition
            query_01 = f"SELECT count(*) FROM {table_name}"
        # Query the total number of records first
        cursor.execute(query_01)
        total_records = cursor.fetchone()[0]
        # Work out how many batches are needed to fetch every record
        total_batches = -(-total_records // batch_size)  # ceiling division
        return total_batches, total_records
def sync_query(table_name_01):
    if table_name_01 == 'table1':
        sync_select_query = f"SELECT * FROM {table_name_01} where name = 'a' "
    else:
        # Tables without a special filter fall back to a full select
        sync_select_query = f"SELECT * FROM {table_name_01}"
    return sync_select_query
def sync_query_all(table_name_01):
sync_select_query_all = f"SELECT * FROM {table_name_01}"
return sync_select_query_all
def get_view_data_in_batches(src_conn, table_name):
    try:
        with src_conn.cursor() as cursor:
            query = sync_query(table_name)
            cursor.execute(query)
            rows = cursor.fetchall()
            column_names = [desc[0] for desc in cursor.description]
            if not rows:
                logging.info("Table %s has no new data", table_name)
                return [], column_names
            rows_list = split_list(rows)
            return rows_list, column_names
    except Exception as e:
        logging.error("Error while fetching data from source view %s: %s", table_name, str(e))
        raise
def get_view_data_in_batches_all(src_conn, table_name):
    try:
        with src_conn.cursor() as cursor:
            query = sync_query_all(table_name)
            cursor.execute(query)
            rows = cursor.fetchall()
            column_names = [desc[0] for desc in cursor.description]
            if not rows:
                logging.info("Table %s has no data", table_name)
                return [], column_names
            rows_list = split_list(rows)
            return rows_list, column_names
    except Exception as e:
        logging.error("Error while fetching data from source view %s: %s", table_name, str(e))
        raise
def clear_table(dst_conn, dst_table_name, dst_db_name_01):
    # Only truncate when the target database matches the expected name
    if dst_db_name_01 == 'db1':
        try:
            with dst_conn.cursor() as cursor:
                truncate_query = f"TRUNCATE TABLE {dst_table_name}"
                cursor.execute(truncate_query)
                dst_conn.commit()
                logging.info("Target table truncated: %s", dst_table_name)
        except Exception as e:
            logging.error("Error while truncating the target table: %s", str(e))
            dst_conn.rollback()
            raise
def batch_insert_into_table(dst_conn, rows, column_names, dst_table_name):
    columns_str = ', '.join(column_names)
    placeholders = ', '.join(['%s'] * len(column_names))
    insert_query = f"INSERT INTO {dst_table_name} ({columns_str}) VALUES ({placeholders})"
    start_time1 = time.time()
    try:
        with dst_conn.cursor() as cursor:
            cursor.executemany(insert_query, rows)  # insert the whole batch in one call
            total_rows_processed = len(rows)  # assume every row in the batch was written
            dst_conn.commit()
            current_time = time.time()
            elapsed_time = current_time - start_time1
            logging.info(f"Processed {total_rows_processed} rows in {elapsed_time:.2f} s, table: {dst_table_name}")
    except Exception as e:
        logging.error(f"Error while inserting data into the target table: {str(e)}")
        dst_conn.rollback()
        raise
def sync_date(src_db_config, dst_db_config, src_view_name, dst_table_name):
    dst_db = dst_db_config['db']
    try:
        start_time = time.time()
        # Connect to the source database
        with pymysql.connect(**src_db_config) as src_conn:
            # Connect to the target database
            with pymysql.connect(**dst_db_config) as dst_conn:
                total_batches_counted = count_batches_incremental(src_conn, src_view_name)[0]
                logging.info(f"Incremental sync expects {total_batches_counted} batches, table: {src_view_name}")
                # Truncate the target table
                clear_table(dst_conn, dst_table_name, dst_db)
                logging.info(f"Incremental sync: target table cleared, re-importing, table: {dst_table_name}")
                try:
                    rows_list, column_names = get_view_data_in_batches(src_conn, src_view_name)
                    # Fetch the view data in batches and insert it into the target table
                    for idx, rows in enumerate(rows_list, start=1):
                        try:
                            logging.info("Incremental sync: processing batch %d", idx)
                            batch_insert_into_table(dst_conn, rows, column_names, dst_table_name)
                        except Exception as e:
                            logging.error(
                                f"Error in incremental batch {idx}: {str(e)}, skipping it and moving on, table: {dst_table_name}")
                            send_wx_work_message('Incremental data insert failed')
                except Exception as e:
                    logging.error(f"Error while fetching incremental data: {str(e)}, table: {src_view_name}")
                end_time = time.time()
                total_elapsed_time = end_time - start_time
                logging.info(f"Incremental sync finished! Total time: {total_elapsed_time:.2f} s, table: {dst_table_name}")
    except Exception as e:
        logging.error(f"Error during the whole incremental sync: {str(e)}, table: {dst_table_name}")
def sync_date_all(src_db_config, dst_db_config, src_view_name, dst_table_name):
    dst_db = dst_db_config['db']
    try:
        start_time = time.time()
        # Connect to the source database
        with pymysql.connect(**src_db_config) as src_conn:
            # Connect to the target database
            with pymysql.connect(**dst_db_config) as dst_conn:
                total_batches_counted = count_batches(src_conn, src_view_name)[0]
                logging.info(f"Full sync expects {total_batches_counted} batches, table: {src_view_name}")
                # Truncate the target table
                clear_table(dst_conn, dst_table_name, dst_db)
                logging.info(f"Full sync: target table cleared, re-importing, table: {dst_table_name}")
                rows_list, column_names = get_view_data_in_batches_all(src_conn, src_view_name)
                # Fetch the view data in batches and insert it into the target table
                for idx, rows in enumerate(rows_list, start=1):
                    try:
                        logging.info("Full sync: processing batch %d", idx)
                        batch_insert_into_table(dst_conn, rows, column_names, dst_table_name)
                    except Exception as e:
                        logging.error(
                            f"Error in full-sync batch {idx}: {str(e)}, skipping it and moving on, table: {dst_table_name}")
                        send_wx_work_message('Full data insert failed')
                end_time = time.time()
                total_elapsed_time = end_time - start_time
                logging.info(f"Full sync finished! Total time: {total_elapsed_time:.2f} s, table: {dst_table_name}")
    except Exception as e:
        logging.error(f"Error during the whole full sync: {str(e)}, table: {dst_table_name}")
def sync_and_print(sync_func, *args):
sync_func(*args)
def sync_date_dev():
    start_time = time.time()
    # List of worker threads
    threads = []
    # Create the threads
    # Incremental sync
    for a, b in Incremental_table_list:
        threads.append(threading.Thread(target=sync_and_print, args=(
            sync_date, yewu1_db_config, yewu2_db_config, a, b)))
    # Full sync
    for c, d in Full_table_list:
        threads.append(threading.Thread(target=sync_and_print, args=(
            sync_date_all, yewu1_db_config, yewu2_db_config, c, d)))
    # Start the threads
    for thread in threads:
        thread.start()
    # Wait for all threads to finish
    for thread in threads:
        thread.join()
    logging.info("All sync operations have finished")
    end_time = time.time()
    total_elapsed_time = end_time - start_time
    logging.info("dev environment sync finished! Total time: {:.2f} s".format(total_elapsed_time))
if __name__ == "__main__":
    # Get the current time
    current_time = datetime.now()
    # Format the time as a string
    formatted_time = current_time.strftime("%Y-%m-%d %H:%M:%S")
    logging.info('Syncing the dev environment')
    sync_date_dev()
    logging.info("Overall sync finished")
    send_wx_work_message('Sync finished, current time: ' + formatted_time)
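To run the script, replace the placeholder database hosts, credentials and the WeCom webhook key with real values, and install the two third-party dependencies first (pip install pymysql requests). It can then be started directly with python3 (the file name sync_data.py is only an assumption here) or scheduled with cron if the sync needs to run periodically.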