Table creation script for syncing MySQL to Doris via Debezium

import pymysql


class ColumnEntity:
    def __init__(self, column_name, ordinal_position, data_type, character_maximum_length, column_type, column_key,
                 column_comment):
        self.column_name = column_name
        self.ordinal_position = ordinal_position
        self.data_type = data_type
        self.character_maximum_length = character_maximum_length
        self.column_type = column_type
        self.column_key = column_key
        self.column_comment = column_comment


class TableInfoEntity:
    def __init__(self, mysql_db, mysql_table, doris_db, doris_table, comment):
        self.mysql_db = mysql_db
        self.mysql_table = mysql_table
        self.doris_db = doris_db
        self.doris_table = doris_table
        self.comment = comment


def table_column_info():
    table_map = {}
    table_schema = "('fe_goods')"  # databases to query; separate multiple names with commas
    table_name = "('sf_group_order_extend')"  # tables to query
    connection = pymysql.connect(host='192.168.56.1', port=3306, user='xxxxxxx',
                                 passwd='xxxxxxxx')
    cursor = connection.cursor()
    sql = ("select table_schema,table_name,column_name,ordinal_position,data_type,character_maximum_length,column_type,"
           "column_key,column_comment from information_schema.columns where table_schema in {0} and table_name in {1}").format(table_schema,table_name)
    cursor.execute(sql)
    table_info = cursor.fetchall()

    for row in table_info:
        key = row[0] + "." + row[1]
        column_entity = ColumnEntity(row[2], row[3], row[4], row[5], row[6], row[7], row[8])
        if key in table_map:
            table_map[key].append(column_entity)
        else:
            table_map[key] = [column_entity]
    # close the connection
    cursor.close()
    connection.close()
    return table_map
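
# table_map maps "db.table" keys to lists of ColumnEntity objects, one per column,
# for example (illustrative values only):
#   {'fe_goods.sf_group_order_extend': [ColumnEntity('id', 1, 'bigint', None,
#                                                    'bigint(20)', 'PRI', 'primary key'), ...]}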


def mysql_type_convert(data_type, character_maximum_length, column_type):
    # Widen char/varchar lengths: under 100 multiply by 6, otherwise by 3
    # (leaves headroom for multi-byte UTF-8, since Doris lengths are in bytes)
    if data_type == 'char' or data_type == 'varchar':
        character_maximum_length = character_maximum_length * 6 if character_maximum_length < 100 else character_maximum_length * 3
        if character_maximum_length > 65533:
            character_maximum_length = 65530  # Doris caps varchar length at 65533
        data_type = 'char({})'.format(character_maximum_length) if data_type == 'char' else 'varchar({})'.format(character_maximum_length)
    # datetime and decimal carry precision, so keep the full column_type
    if data_type == 'datetime' or data_type == 'decimal':
        data_type = column_type
    # Replace types Doris does not support. Order matters: 'blob' is replaced before
    # the medium/long variants, yielding 'mediumstring'/'longstring', hence the third row.
    s = 'string'
    data_type = (data_type.replace('tinytext', s).replace('mediumtext', s).replace('longtext', s)
                 .replace('tinyblob', s).replace('blob', s).replace('mediumblob', s).replace('longblob', s)
                 .replace('tinystring', s).replace('mediumstring', s).replace('longstring', s)
                 .replace('timestamp', 'datetime').replace('enum', s).replace('set', s)
                 .replace('varbinary', s).replace('binary', s).replace('mediumint', 'int')
                 .replace('year', 'varchar(64)').replace('bit', 'char(10)'))
    if data_type == 'time':
        data_type = 'varchar(64)'
    if data_type == 'text':
        data_type = s
    return data_type
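
# A few illustrative conversions under the rules above:
#   mysql_type_convert('varchar', 50, 'varchar(50)')      -> 'varchar(300)'
#   mysql_type_convert('longtext', None, 'longtext')      -> 'string'
#   mysql_type_convert('decimal', None, 'decimal(10,2)')  -> 'decimal(10,2)'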


def batch_mysql_to_doris(info_map, table_map, part_column):
    for key, info_entity in info_map.items():
        doris_db = info_entity.doris_db
        doris_table = info_entity.doris_table
        comment = info_entity.comment
        if key in table_map:
            column_list = table_map[key]
            head = 'create table if not exists {}.{} ('.format(doris_db, doris_table)
            body = []
            end = []
            pri_list = []
            first_column_name = '`' + column_list[0].column_name + '`'  # first column of this table
            for column_entity in column_list:
                column_name = '`' + column_entity.column_name + '`'
                data_type = column_entity.data_type
                character_maximum_length = column_entity.character_maximum_length
                column_type = column_entity.column_type
                column_key = column_entity.column_key
                column_comment = "'" + column_entity.column_comment.replace("'", "''") + "'"  # escape single quotes for SQL
                # Type conversion for Doris compatibility
                data_type = mysql_type_convert(data_type, character_maximum_length, column_type)
                # Assemble the column definition
                value = column_name + '  ' + data_type + '  ' + 'comment ' + column_comment + ','
                # Primary-key columns are moved to the front: Doris key columns must be listed first
                if column_key == 'PRI':
                    body.insert(0, value)
                    pri_list.insert(0, column_name)
                else:
                    body.append(value)
            # Print the quoted column names for reference
            for column_entity2 in column_list:
                print('"' + column_entity2.column_name + '",')
            # Use the unique model when there is a primary key; otherwise use the
            # duplicate model with the first column as the key
            end.append("ENGINE=OLAP")
            # Additional table properties can be added here as needed
            if len(pri_list) > 0:
                unique_key = ','.join(pri_list)
                if part_column:
                    end.append("UNIQUE KEY({0},{1})".format(unique_key,part_column))
                else:
                    end.append("UNIQUE KEY({})".format(unique_key))
                end.append('COMMENT "{}"'.format(comment))
                if part_column:
                    end.append('    PARTITION BY RANGE({0})()'.format(part_column))
                end.append('DISTRIBUTED BY HASH({}) BUCKETS AUTO'.format(unique_key))
            else:
                if part_column:
                    end.append("DUPLICATE KEY({0},{1})".format(first_column_name, part_column))
                else:
                    end.append("DUPLICATE KEY({})".format(first_column_name))
                # COMMENT must precede PARTITION BY in Doris DDL
                end.append('COMMENT "{}"'.format(comment))
                if part_column:
                    end.append('    PARTITION BY RANGE({0})()'.format(part_column))
                end.append('DISTRIBUTED BY HASH({}) BUCKETS AUTO'.format(first_column_name))
            end.append("PROPERTIES (")
            if part_column:
                end.append("\"dynamic_partition.enable\" = \"true\",")
                end.append("\"dynamic_partition.time_unit\" = \"MONTH\",")
                end.append("\"dynamic_partition.create_history_partition\" = \"true\",")
                end.append("\"dynamic_partition.history_partition_num\" = \"80\",")
                end.append("\"dynamic_partition.end\" = \"3\",")
                end.append("\"dynamic_partition.prefix\" = \"p\",")
                end.append("\"dynamic_partition.buckets\" = \"32\",")

            end.append("\"replication_num\" = \"3\"")
            end.append(");")

            # Assemble the full CREATE TABLE statement, dropping the trailing
            # comma after the last column definition
            create_sql = head + '\n' + '\n'.join(body).rstrip(',') + '\n)\n' + '\n'.join(end) + '\n'
            print(create_sql)
        else:
            print("配置文件有问题,获取不到对应的表 key:{}".format(key))

def create_routine_load(table_schema,table_name,column_list):
    p_list = []
    column_list_new = [x.column_name for x in column_list]
    column_list_new.append('__deleted')
    for x in column_list_new:
        column_name = "\\\"$." + x + "\\\""
        p_list.append(column_name)
    p = ','.join(p_list)

    create_routine = []
    create_routine.append("CREATE ROUTINE LOAD {table_schema}.{table_name} ON  {table_name}".format(table_schema=table_schema,table_name=table_name))
    create_routine.append("WITH MERGE")
    create_routine.append("COLUMNS({}),".format(','.join(column_list_new)))
    create_routine.append("DELETE ON __deleted ='true'")
    create_routine.append("PROPERTIES")
    create_routine.append("(")
    create_routine.append("    \"desired_concurrent_number\"=\"3\",")
    create_routine.append("    \"max_batch_interval\"=\"5\",")
    create_routine.append("    \"max_batch_rows\"=\"300000\",")
    create_routine.append("    \"max_batch_size\"=\"209715200\",")
    create_routine.append("    \"strict_mode\"=\"true\",")
    create_routine.append("    \"format\"=\"json\",")
    create_routine.append("    \"max_batch_size\"=\"209715200\",")
    create_routine.append("    \"jsonpaths\"=\"[{}]\",".format(p))
    create_routine.append("    \"strip_outer_array\"=\"false\"")
    create_routine.append(")")
    create_routine.append("FROM KAFKA")
    create_routine.append("(")
    create_routine.append("    \"kafka_broker_list\"=\"10.181.69.69:9092,10.181.69.136:9092,10.181.70.133:9092\",")
    create_routine.append("    \"kafka_topic\"=\"doris.{table_schema}.{table_name}\",".format(table_schema=table_schema,table_name=table_name))
    create_routine.append("    \"property.group.id\"=\"doris_{table_name}\",".format(table_name=table_name))
    create_routine.append("    \"kafka_partitions\"=\"0\",")
    create_routine.append("    \"kafka_offsets\"=\"0\"")
    create_routine.append(");")
    create_routine_sql = '\n'.join(create_routine)
    print(create_routine_sql)
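
# Once the generated statement has been executed in Doris, the job's progress and
# error rows can be inspected with:
#   SHOW ROUTINE LOAD FOR fe_goods.sf_group_order_extend;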



if __name__ == '__main__':
    table_map = table_column_info()
    create_routine_load('fe_goods', 'sf_group_order_extend', table_map['fe_goods.sf_group_order_extend'])
    info_map = dict()
    info_map['fe_goods.sf_group_order_extend'] = TableInfoEntity('fe_goods', 'sf_group_order_extend', 'fe_goods', 'sf_group_order_extend', '')
    batch_mysql_to_doris(info_map, table_map, part_column=None)
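
# Note: the __deleted field and the doris.{db}.{table} topic names assume a Debezium
# MySQL connector running the ExtractNewRecordState transform with delete rewriting,
# roughly like the following connector options (the "doris" prefix is chosen here
# only to match the Kafka topics above):
#   "topic.prefix": "doris",
#   "transforms": "unwrap",
#   "transforms.unwrap.type": "io.debezium.transforms.ExtractNewRecordState",
#   "transforms.unwrap.delete.handling.mode": "rewrite"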