批量迁移MySQL历史数据到历史库【Python脚本】


#!/usr/bin/env python
# coding:utf-8
__author__ = 'John'

import MySQLdb
import sys
import datetime
import time


class ArgsError(Exception):
    def __init__(self):
        print ( "\nUsage : %s 2012 02\n") % sys.argv[0]


class ClassMigrate(object):
    """
    1、前提:
        a.target_db需要有个模板表,target_tab将根据模板表创建
        b.如果按月迁移,则需要输入年份和月份,对应self.year = sys.argv[1],self.month = sys.argv[2]参数
    """
    def __init__(self):
        #self.year = sys.argv[1]
        self.year = '2016'
        self.leap = int(self.year) % 4
        self.year_length = len(self.year)
        #self.month = sys.argv[2]
        self.month = '03'
        self.mon_length = len(self.month)

        self.source_host = '192.168.1.41'
        self.source_user = 'root'
        self.source_password = 'xxx'
        self.source_port = 53306
        self.source_db = 'archive_db'
        self.source_conn_str = MySQLdb.connect(host=self.source_host, port=self.source_port, user=self.source_user, passwd=self.source_password, db=self.source_db, charset='utf8')
        self.source_conn_str.autocommit(True)
        self.source_tab = self.source_db + '.t_hist_201603_innodb'
        
        self.target_db = 'test'
        self.template_tab = self.target_db + '.t_hist_template_archive'
        self.target_tab = self.target_db + '.t_hist_' + self.year + self.month

        """ 目标库配置信息只是在需要迁移到远程数据库的时候才需要配置,如果迁移到本地库,则不会读取目标库的配置 """
        self.target_host = '192.168.1.220'
        self.target_user = 'root'
        self.target_password = 'xxx'
        self.target_port = 53306
        self.target_conn_str = MySQLdb.connect(host=self.target_host, port=self.target_port, user=self.target_user, passwd=self.target_password, db=self.target_db, charset='utf8')
        self.target_conn_str.autocommit(True)
        
        self.pk = 'auto_id'
        self.date_col = 'ut'
        self.step_size = 20000

        """_migState默认为False,不要更改。migMethodSelected用来选择迁移方式。
        deleteMethodSelected用于选择源库删除方式,
            !! 请谨慎选择delete方式,因为本程序会分批次删除数据并提交  !!
        而'drop daily partition'则是针对每天一个分区表的,只输出drop partition语句,并不执行"""
        self._migState = False
        self.migMethod = ('monthly_2_local', 'monthly_2_remote', 'whole_2_local', 'whole_2_remote')
        self.migMethodSelected = self.migMethod[3]
        self.deleteMethod = ('delete', 'drop daily partition')
        self.deleteMethodSelected = self.deleteMethod[1]

        self.source_cnt = ''
        self.source_min_id = ''
        self.source_max_id = ''

    def source_query(self, sql, sql_type):
        try:
            cr = self.source_conn_str.cursor()
            cr.execute(sql)
            if sql_type == 'select':
                return cr.fetchall()
            elif sql_type == 'dml':
                rows = self.source_conn_str.affected_rows()
                self.source_conn_str.commit()
                return rows
            else:
                return True
        except Exception, e:
            print (str(e))
            return False
        finally:
            cr.close()
            
    def target_query(self, sql, sql_type, values=''):
        try:
            cr = self.target_conn_str.cursor()
            if sql_type == 'select':
                cr.execute(sql)
                return cr.fetchall()
            elif sql_type == 'insertmany':
                cr.executemany(sql, values)
                rows = self.target_conn_str.affected_rows()
                self.target_conn_str.commit()
                return rows
            else:
                cr.execute(sql)
                return True
        except Exception, e:
            print (str(e))
            return False
        finally:
            cr.close()            

    def get_day_end(self):
        try:
            if self.year_length != 4 or self.mon_length != 2:
                raise ArgsError
    
            if self.month in ('01', '03', '05', '07', '08', '10', '12'):
                    self.day_end = 31
            else:
                    self.day_end = 30
            if self.leap == 0:
                if self.month == '02':
                    self.day_end = 29
            else:
                if self.month == '02':
                    self.day_end = 28
        except Exception, e:
            print (str(e))

    def get_min_max(self, which_db, time_range_sql=''):
        """  无论哪种迁移方式,都会执行本函数创建目标、并获取源表需要迁移的总条数、最小id、最大id
        :param which_db: 选择在哪个库执行create table SQL
        :param time_range_sql: 附带的时间限制语句
        """
        try:
            dbs = {'source_query': self.source_query, 'target_query': self.target_query}
            print ("\nStarting Migrate at --  %s") % (datetime.datetime.now().__str__())
            sql = 'CREATE TABLE %s like %s' % (self.target_tab, self.template_tab)
            if dbs[which_db](sql, 'ddl'):
                pass
            else:
                raise Exception

            sql = "select count(*),IFNULL(min(%s),-1),IFNULL(max(%s),-1) from %s" % (self.pk, self.pk,self.source_tab)
            sql += time_range_sql
            q = self.source_query(sql, 'select')
            self.source_cnt = q[0][0]
            self.source_min_id = q[0][1]
            self.source_max_id = q[0][2]
            return True
        except Exception, e:
            return False
            print (str(e))

    def migrate_2_local(self, time_range_sql=''):
        try:
            k = self.source_min_id
            if self.source_cnt != 0  and self.source_min_id != None and self.source_max_id != None:
                while k <= self.source_max_id:
                    sql = "insert into %s select id, outcomeFK, odds_providerFK, odds, odds_old, active, is_back, is_single, is_live, volume, currency, couponKey, del, n, ut, ADD_TIME, NULL from %s where %s>=%d and %s<%d " % (self.target_tab, self.source_tab, self.pk, k, self.pk, k+self.step_size)
                    sql += time_range_sql.replace('where', 'and')
                    print ("\n %s") % sql
                    starttime = datetime.datetime.now()
                    rows = self.source_query(sql, 'dml')
                    print ("Inserted %s rows") % str(rows)
                    endtime = datetime.datetime.now()
                    timeinterval = endtime - starttime
                    print("Elapsed :" + str(timeinterval.seconds) + '.' + str(timeinterval.microseconds) + " seconds")
                    k += self.step_size
        except Exception, e:
            print (str(e))
        finally:
            print ("\nInsert complete Migration at -- %s") % (datetime.datetime.now().__str__())
            
    def migrate_2_remote(self, time_range_sql=''):
        try:
            k = self.source_min_id
            if self.source_cnt != 0 and self.source_min_id != None and self.source_max_id != None:
                while k <= self.source_max_id:
                    sql = "select id, outcomeFK, odds_providerFK, odds, odds_old, active, is_back, is_single, is_live, volume, currency, couponKey, del, n, ut, ADD_TIME, NULL from %s where %s>=%d and %s<%d " % (self.source_tab, self.pk, k, self.pk, k+self.step_size)
                    orderby = " order by %s" % self.pk
                    sql += time_range_sql.replace('where', 'and')
                    sql += orderby
                    print ("\n %s") % sql
                    starttime = datetime.datetime.now()
                    results = self.source_query(sql, 'select')
                    sql = "insert into " + self.target_tab + " values (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)"
                    rows = self.target_query(sql, 'insertmany', results)
                    if rows == False:
                        print ("Insert failed!!")
                    else:
                        print ("Inserted %s rows.") % rows
                    endtime = datetime.datetime.now()
                    timeinterval = endtime - starttime
                    print("Elapsed :" + str(timeinterval.seconds) + '.' + str(timeinterval.microseconds) + " seconds")
                    k += self.step_size
        except Exception, e:
            print (str(e))
        finally:
            print ("\nInsert complete Migration at -- %s") % (datetime.datetime.now().__str__())
            
    def delete_data(self, time_range_sql=''):
        try:
            k = self.source_min_id
            if self.source_cnt != 0 and self.source_min_id != None and self.source_max_id != None:
                while k <= self.source_max_id:
                    sql = "delete from %s where %s>=%d and %s<%d " % (self.source_tab, self.pk, k, self.pk, k+self.step_size)
                    sql += time_range_sql.replace('where', 'and')
                    print ("\n %s") % sql
                    starttime = datetime.datetime.now()
                    rows = self.source_query(sql, 'dml')
                    if rows == False:
                        print ("Delete failed!!")
                    else:
                        print ("Deleted %s rows.") % rows
                    endtime = datetime.datetime.now()
                    timeinterval = endtime - starttime
                    print("Elapsed :" + str(timeinterval.seconds) + '.' + str(timeinterval.microseconds) + " seconds")
					time.sleep(1)
                    k += self.step_size
        except Exception, e:
            print (str(e))
        finally:
            print ("\nDelete complete Migration at -- %s") % (datetime.datetime.now().__str__())            

    def verify_total_cnt(self, which_db, time_range_sql=''):
        try:
            dbs = {'source_query': self.source_query, 'target_query': self.target_query}
            sql = "select count(*) from %s " % (self.source_tab)
            sql += time_range_sql
            source_total_cnt = self.source_query(sql, 'select')
            print ("\n")
            print ("====="*5)
            print ("source_total_cnt: %s") % source_total_cnt[0][0]
            sql = "select count(*) from %s " % (self.target_tab)
            sql += time_range_sql
            target_total_cnt = dbs[which_db](sql, 'select')
            print ("target_total_cnt: %s") % target_total_cnt[0][0]
            print ("====="*5)

            if source_total_cnt == target_total_cnt and source_total_cnt[0][0] != 0 and target_total_cnt[0][0] != 0:
                print ("\nFinal result: Successfully!!")
                self._migState = True
        except Exception, e:
            print (str(e))

    def drop_daily_partition(self, which_db):
        try:
            dbs = {'source_query': self.source_query, 'target_query': self.target_query}
            if self._migState:
                for i in range(1, self.day_end+1):
                    day = str(i)
                    if i < 10:
                        day = '0' + str(i)
                    sql = "select count(*) from %s where %s >='%s-%s-%s 00:00:00'  and %s <= '%s-%s-%s 23:59:59'" % (self.source_tab, self.date_col, self.year, self.month, day, self.date_col, self.year, self.month, day)
                    source_day_cnt = self.source_query(sql, 'select')
                    sql = "select count(*) from %s where %s >='%s-%s-%s 00:00:00'  and %s <= '%s-%s-%s 23:59:59'" % (self.target_tab, self.date_col, self.year, self.month, day, self.date_col, self.year, self.month, day)
                    target_day_cnt = dbs[which_db](sql, 'select')
                    if source_day_cnt == target_day_cnt:
                        print ("alter table %s drop partition p%s%s%s;") % (self.source_tab, self.year, self.month, day)
                    else:
                        print("\nsource_day_cnt: %s") % source_day_cnt[0][0]
                        print("target_day_cnt: %s") % target_day_cnt[0][0]
                        break
        except Exception, e:
            print (str(e))

f = ClassMigrate()
if f.migMethodSelected in ('monthly_2_local', 'monthly_2_remote'):
    f.get_day_end()
    time_range_sql = " where %s >='%s-%s-01 00:00:00'  and %s <= '%s-%s-%s 23:59:59'" % (f.date_col, f.year, f.month, f.date_col, f.year, f.month, f.day_end)
    if f.migMethodSelected == 'monthly_2_local':
        if f.get_min_max('source_query', time_range_sql):
            f.migrate_2_local(time_range_sql)
            f.verify_total_cnt('source_query', time_range_sql)
            if f.deleteMethodSelected == 'delete':
                f.delete_data(time_range_sql)
            else:
                f.drop_daily_partition('source_query')
        else:
            print ("Cannot get count, min_id, max_id!  Exit.")
        exit()
    else:
        if f.get_min_max('target_query', time_range_sql):
            f.migrate_2_remote(time_range_sql)
            f.verify_total_cnt('target_query', time_range_sql)
            if f.deleteMethodSelected == 'delete':
                f.delete_data(time_range_sql)
            else:
                f.drop_daily_partition('target_query')
        else:
            print ("Cannot get count, min_id, max_id!  Exit.")
            exit()
elif f.migMethodSelected in ('whole_2_local', 'whole_2_remote'):
    if f.migMethodSelected == 'whole_2_local':
        if f.get_min_max('source_query'):
            f.migrate_2_local()
            f.verify_total_cnt('source_query')
        else:
            print ("Cannot get count, min_id, max_id!  Exit.")
        exit()
    else:
        if f.get_min_max('target_query'):
            f.migrate_2_remote()
            f.verify_total_cnt('target_query')
        else:
            print ("Cannot get count, min_id, max_id!  Exit.")
            exit()
else:
    print ("self.migMethodSelected must in ('monthly_2_local', 'monthly_2_remote', 'whole_2_local', 'whole_2_remote') !")


好的,我可以为您提供一些关于使用DataX进行MySQL到Hive数据迁移的指导。 首先,您需要安装DataX并配置好环境。然后,您需要编写一个DataX任务配置文件,以指定源和目标数据源的详细信息。 以下是一个简单的MySQL到Hive的DataX任务配置文件示例: ``` { "job": { "content": [ { "reader": { "name": "mysqlreader", "parameter": { "username": "your_mysql_username", "password": "your_mysql_password", "column": [ "column1", "column2", "column3" ], "connection": [ { "jdbcUrl": "jdbc:mysql://your_mysql_server:3306/your_mysql_database", "table": [ "your_mysql_table" ] } ] } }, "writer": { "name": "hdfswriter", "parameter": { "defaultFS": "hdfs://your_hdfs_server:9000", "fileType": "text", "path": "/your_hdfs_path", "fileName": "your_hdfs_file_name", "fieldDelimiter": "\t", "writeMode": "append", "column": [ "column1", "column2", "column3" ] } } } ], "setting": { "speed": { "channel": "3" } } } } ``` 在上面的配置文件中,您需要将以下信息进行替换: - `your_mysql_username`:MySQL数据的用户名。 - `your_mysql_password`:MySQL数据的密码。 - `your_mysql_server`:MySQL数据的服务器地址。 - `your_mysql_database`:MySQL数据的名称。 - `your_mysql_table`:要迁移MySQL的名称。 - `your_hdfs_server`:HDFS服务器的地址。 - `your_hdfs_path`:HDFS中要写入数据的路径。 - `your_hdfs_file_name`:在HDFS中要写入的文件名。 - `column1`,`column2`和`column3`:要迁移的列名称。 完成配置文件后,您可以使用以下命令来执行DataX任务: ``` python datax.py your_job_config.json ``` 这将启动DataX并开始将MySQL中的数据传输到Hive中。 希望这可以帮助您进行MySQL到Hive的数据迁移。如果您有任何其他问题,请随时问我。
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值