import pymysql
class ColumnEntity:
def __init__(self, column_name, ordinal_position, data_type, character_maximum_length, column_type, column_key,
column_comment):
self.column_name = column_name
self.ordinal_position = ordinal_position
self.data_type = data_type
self.character_maximum_length = character_maximum_length
self.column_type = column_type
self.column_key = column_key
self.column_comment = column_comment
class TableInfoEntity:
def __init__(self, mysql_db, mysql_table, doris_db, doris_table, comment):
self.mysql_db = mysql_db
self.mysql_table = mysql_table
self.doris_db = doris_db
self.doris_table = doris_table
self.comment = comment
def table_column_info():
table_map = {}
    table_schema = "('fe_goods')"  # schemas to query; separate multiple values with commas
    table_name = "('sf_group_order_extend')"  # tables to query; separate multiple values with commas
connection = pymysql.connect(host='192.168.56.1', port=3306, user='xxxxxxx',
passwd='xxxxxxxx')
cursor = connection.cursor()
sql = ("select table_schema,table_name,column_name,ordinal_position,data_type,character_maximum_length,column_type,"
"column_key,column_comment from information_schema.columns where table_schema in {0} and table_name in {1}").format(table_schema,table_name)
cursor.execute(sql)
table_info = cursor.fetchall()
    for row in table_info:
        key = row[0] + "." + row[1]
        column_entity = ColumnEntity(row[2], row[3], row[4], row[5], row[6], row[7], row[8])
        # group columns by "<schema>.<table>"
        if key in table_map:
            table_map[key].append(column_entity)
        else:
            table_map[key] = [column_entity]
    # close the connection
cursor.close()
connection.close()
return table_map
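
# A hedged sketch of the structure returned above (values are illustrative, not
# produced by the original script): the dict is keyed by "<schema>.<table>" and
# maps to that table's columns in ordinal_position order, e.g.
#   {'fe_goods.sf_group_order_extend': [ColumnEntity('id', 1, 'bigint', None,
#                                                    'bigint(20)', 'PRI', 'pk'), ...]}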
def mysql_type_convert(data_type, character_maximum_length, column_type):
    # lengths under 100 are scaled 6x; 100 and above are scaled 3x
    if data_type in ('char', 'varchar'):
        character_maximum_length = character_maximum_length * 6 if character_maximum_length < 100 else character_maximum_length * 3
        if character_maximum_length > 65533: character_maximum_length = 65530  # stay under Doris's 65533 varchar limit
        data_type = '{}({})'.format(data_type, character_maximum_length)
    # datetime and decimal carry precision, so keep the full column_type (e.g. decimal(10,2))
    if data_type in ('datetime', 'decimal'): data_type = column_type
    # map types that Doris does not support onto compatible ones
    s = 'string'
    # note: replace('blob', s) first turns mediumblob/longblob into mediumstring/longstring,
    # which the tinystring/mediumstring/longstring replacements then collapse back to string
    data_type = (data_type.replace('tinytext', s).replace('mediumtext', s).replace('longtext', s)
                 .replace('tinyblob', s).replace('blob', s).replace('mediumblob', s).replace('longblob', s)
                 .replace('tinystring', s).replace('mediumstring', s).replace('longstring', s)
                 .replace('timestamp', 'datetime').replace('enum', s).replace('set', s)
                 .replace('varbinary', s).replace('binary', s).replace('mediumint', 'int')
                 .replace('year', 'varchar(64)').replace('bit', 'char(10)'))
    if data_type == 'time': data_type = 'varchar(64)'
    if data_type == 'text': data_type = s
return data_type
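
# Hedged examples of the conversion above (assumed inputs, doctest style):
# >>> mysql_type_convert('varchar', 50, 'varchar(50)')      # 50 < 100, so 6x
# 'varchar(300)'
# >>> mysql_type_convert('decimal', None, 'decimal(10,2)')  # precision preserved
# 'decimal(10,2)'
# >>> mysql_type_convert('longtext', None, 'longtext')      # collapsed for Doris
# 'string'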
def batch_mysql_to_doris(info_map, table_map, part_column):
for key, info_entity in info_map.items():
doris_db = info_entity.doris_db
doris_table = info_entity.doris_table
comment = info_entity.comment
        if key in table_map:
column_list = table_map[key]
head = 'create table if not exists {}.{} ('.format(doris_db, doris_table)
body = []
end = []
pri_list = []
            first_column_name = '`' + column_list[0].column_name + '`'  # first column of the current table
            pri_body = []  # primary-key column definitions, kept in their original order
            for column_entity in column_list:
                column_name = '`' + column_entity.column_name + '`'
                data_type = column_entity.data_type
                character_maximum_length = column_entity.character_maximum_length
                column_type = column_entity.column_type
                column_key = column_entity.column_key
                column_comment = "'" + column_entity.column_comment + "'"
                # convert the type for Doris compatibility
                data_type = mysql_type_convert(data_type, character_maximum_length, column_type)
                # assemble the column definition
                value = column_name + ' ' + data_type + ' ' + 'comment ' + column_comment + ','
                # key columns must lead the column list in Doris, so collect them separately
                if column_key == 'PRI':
                    pri_body.append(value)
                    pri_list.append(column_name)
                else:
                    body.append(value)
            body = pri_body + body
            # print the column names as a quoted list (useful for downstream configs)
            for column_entity2 in column_list:
                print('"' + column_entity2.column_name + '",')
            # if there is a primary key, use the UNIQUE model; otherwise use the
            # DUPLICATE model with the first column as the key
            end.append("ENGINE=OLAP")
            # additional table attributes can be customized below
            if len(pri_list) > 0:
                unique_key = ','.join(pri_list)
                if part_column:
                    end.append("UNIQUE KEY({0},{1})".format(unique_key, part_column))
                else:
                    end.append("UNIQUE KEY({})".format(unique_key))
                end.append('COMMENT "{}"'.format(comment))
                if part_column:
                    end.append('PARTITION BY RANGE({})()'.format(part_column))
                end.append('DISTRIBUTED BY HASH({}) BUCKETS AUTO'.format(unique_key))
            else:
                if part_column:
                    end.append("DUPLICATE KEY({0},{1})".format(first_column_name, part_column))
                else:
                    end.append("DUPLICATE KEY({})".format(first_column_name))
                # COMMENT must precede PARTITION BY in Doris DDL
                end.append('COMMENT "{}"'.format(comment))
                if part_column:
                    end.append('PARTITION BY RANGE({})()'.format(part_column))
                end.append('DISTRIBUTED BY HASH({}) BUCKETS AUTO'.format(first_column_name))
end.append("PROPERTIES (")
if part_column:
end.append("\"dynamic_partition.enable\" = \"true\",")
end.append("\"dynamic_partition.time_unit\" = \"MONTH\",")
end.append("\"dynamic_partition.create_history_partition\" = \"true\",")
end.append("\"dynamic_partition.history_partition_num\" = \"80\",")
end.append("\"dynamic_partition.end\" = \"3\",")
end.append("\"dynamic_partition.prefix\" = \"p\",")
end.append("\"dynamic_partition.buckets\" = \"32\",")
end.append("\"replication_num\" = \"3\"")
end.append(");")
            # assemble the full CREATE TABLE statement, dropping the trailing
            # comma after the last column definition
            create_sql = head + '\n' + '\n'.join(body).rstrip(',') + '\n)\n' + '\n'.join(end) + '\n'
            print(create_sql)
else:
print("配置文件有问题,获取不到对应的表 key:{}".format(key))
def create_routine_load(table_schema, table_name, column_list):
    p_list = []
    column_list_new = [x.column_name for x in column_list]
    column_list_new.append('__deleted')  # Debezium soft-delete flag
    for x in column_list_new:
        # escape as \"$.col\" so each path survives inside the quoted jsonpaths property
        p_list.append("\\\"$." + x + "\\\"")
    p = ','.join(p_list)
create_routine = []
create_routine.append("CREATE ROUTINE LOAD {table_schema}.{table_name} ON {table_name}".format(table_schema=table_schema,table_name=table_name))
create_routine.append("WITH MERGE")
create_routine.append("COLUMNS({}),".format(','.join(column_list_new)))
create_routine.append("DELETE ON __deleted ='true'")
create_routine.append("PROPERTIES")
create_routine.append("(")
create_routine.append(" \"desired_concurrent_number\"=\"3\",")
create_routine.append(" \"max_batch_interval\"=\"5\",")
create_routine.append(" \"max_batch_rows\"=\"300000\",")
create_routine.append(" \"max_batch_size\"=\"209715200\",")
create_routine.append(" \"strict_mode\"=\"true\",")
create_routine.append(" \"format\"=\"json\",")
create_routine.append(" \"max_batch_size\"=\"209715200\",")
create_routine.append(" \"jsonpaths\"=\"[{}]\",".format(p))
create_routine.append(" \"strip_outer_array\"=\"false\"")
create_routine.append(")")
create_routine.append("FROM KAFKA")
create_routine.append("(")
create_routine.append(" \"kafka_broker_list\"=\"10.181.69.69:9092,10.181.69.136:9092,10.181.70.133:9092\",")
create_routine.append(" \"kafka_topic\"=\"doris.{table_schema}.{table_name}\",".format(table_schema=table_schema,table_name=table_name))
create_routine.append(" \"property.group.id\"=\"doris_{table_name}\",".format(table_name=table_name))
create_routine.append(" \"kafka_partitions\"=\"0\",")
create_routine.append(" \"kafka_offsets\"=\"0\"")
create_routine.append(");")
create_routine_sql = '\n'.join(create_routine)
print(create_routine_sql)
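
# A hedged sketch of the ROUTINE LOAD statement printed above (abbreviated;
# column names are illustrative, broker/topic values come from this function):
#   CREATE ROUTINE LOAD fe_goods.sf_group_order_extend ON sf_group_order_extend
#   WITH MERGE
#   COLUMNS(id, name, __deleted),
#   DELETE ON __deleted = 'true'
#   PROPERTIES ( ..., "jsonpaths"="[\"$.id\",\"$.name\",\"$.__deleted\"]", ... )
#   FROM KAFKA ( "kafka_topic"="doris.fe_goods.sf_group_order_extend", ... );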
if __name__ == '__main__':
table_map = table_column_info()
    create_routine_load('fe_goods', 'sf_group_order_extend', table_map['fe_goods.sf_group_order_extend'])
info_map = dict()
info_map['fe_goods.sf_group_order_extend'] = TableInfoEntity('fe_goods', 'sf_group_order_extend', 'fe_goods', 'sf_group_order_extend', '')
    batch_mysql_to_doris(info_map, table_map, part_column=None)
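
# To run this against a real environment (assumptions: file name and values are
# yours to adjust), fill in host/user/passwd in table_column_info() and the
# Kafka broker list in create_routine_load(), then:
#   python mysql_to_doris.py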