把MySQL建表语句往里面放就行
import re
'''
公共内容
'''
original_content = '''PARTITIONED BY (
`pt_platform` string)
ROW FORMAT SERDE
'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe'
WITH SERDEPROPERTIES (
'field.delim' = '\\t',
'line.delim' = '\\n',
'serialization.format' = '\\t',
'serialization.null.format' = '')
STORED AS TEXTFILE;'''
'''
取列表的第一个元素
'''
def get_first_element(lst, default_value=""):
if lst:
return lst[0]
else:
return default_value
'''
太邪门,.find("")函数找到返回0,找不到返回-1
ddl语句分析,拆分
'''
def ddl_analyse(ddl):
lines = ddl.splitlines()
# 表名
tbname = ''
# 表正则表达
# tbname_pattern = r'CREATE TABLE `(\w+)`'
tbname_pattern = r"CREATE TABLE `(\w+)`.`(\w+)`"
# 字段列表
columns = []
# 字段正则表达
column_pattern = r'(?<*,|\n)\s*`(\w+)`'
# 注释列表
comments = []
# 注释正则表达
comment_pattern = r"COMMENT \'(.*)\'"
# 类型列表
types = []
# 表注释正则表达
tb_comment_pattern = r"COMMENT='(.*)'"
tb_comment = ''
for line in lines:
line = line.strip()
words = line.split(" ")
if "CREATE" in line.upper() and "TABLE" in line.upper():
tbname = words[len(words)-2]
elif ("NOT NULL" in line.upper() or "DEFAULT" in line.upper()) and "ENGINE=" not in line.upper():
columns.append(words[0])
comments.append(get_first_element(re.findall(comment_pattern, line)))
types.append(change_type(words[1]))
elif "ENGINE=" in line.upper() or "CHARSET=" in line.upper():
tb_comment = get_first_element(re.findall(tb_comment_pattern, line))
return tbname, tb_comment, columns, comments, types
'''
mysql字段类型转化成hive字段类型
# type: mysql => hive sql
'''
def change_type(line):
if re.search(r"int\((\d+)\)", line) != None:
return 'INT'
elif re.search(r"decimal\((\d+),\s*(\d+)\)", line) != None:
return next(re.finditer(r"decimal\((\d+),\s*(\d+)\)", line)).group()
elif re.search(r'varchar', line) != None:
return 'STRING'
elif re.search(r'timestamp', line) != None:
return 'STRING'
elif re.search(r'text', line) != None:
return 'STRING'
elif re.search(r'datetime', line) != None:
return 'STRING'
elif re.search(r'date', line) != None:
return 'STRING'
elif re.search(r'char', line) != None:
return 'STRING'
else:
return str(line)
"""
生成Hive建表语句
:return:
"""
def mysql_to_hsql(ddl):
tbname, tb_comment, columns, comments, types = ddl_analyse(ddl)
hive_ddl = "CREATE TABLE `stage_power`." + str(tbname + "(\n")
add_sql = ''
collect_mysql_query = 'select \n'
ods_hive_query = 'select \n'
for i in range(len(columns)):
if (i == len(columns) - 1):
hive_ddl = hive_ddl + " " + str(columns[i]) + " " + str(
change_type(types[i])) + " COMMENT \'" + str(comments[i]) + "\'\n"
if types[i] == "STRING":
collect_mysql_query = collect_mysql_query + " ifnull(replace(replace({}, CHAR(13), ''), CHAR(10), ''), '')".format(
columns[i]) + '\n'
collect_mysql_query = collect_mysql_query + 'from {}'.format(tbname)
else:
collect_mysql = str(collect_mysql_query + " " + columns[i]) + '\n'
collect_mysql_query = collect_mysql + 'from {}'.format(tbname)
add_sql += "ALTER TABLE {} ADD COLUMNS ({} {} COMMENT '{}') CASCADE;".format(tbname, columns[i], types[i],
comments[i]) + '\n'
ods_hive_query = ods_hive_query + " " +columns[i].replace('`','') + "\n"+"from stage_power."+tbname.replace('`','')
break
hive_ddl = hive_ddl + " " + str(columns[i]) + " " + str(
change_type(types[i])) + " COMMENT \'" + str(comments[i]) + "\',\n"
#print(columns[i]+",")
ods_hive_query = ods_hive_query + " " + columns[i].replace('`','') + ","+ "\n"
if types[i] == "STRING":
collect_mysql_query = collect_mysql_query + " ifnull(replace(replace({}, CHAR(13), ''), CHAR(10), ''), ''),".format(
columns[i]) + '\n'
else:
collect_mysql_query = str(collect_mysql_query + " " + columns[i]) + ',\n'
add_sql += "ALTER TABLE {} ADD COLUMNS ({} {} COMMENT '{}') CASCADE;".format(tbname, columns[i], types[i],
comments[i]) + '\n'
hive_ddl = hive_ddl + ") COMMENT \'" + tb_comment + "\'\n"
return hive_ddl + original_content, add_sql,collect_mysql_query.replace('`',''),ods_hive_query
if __name__ == '__main__':
ddl = '''
CREATE TABLE `_detail_tb` (
`code` int(11) NOT NULL AUTO_INCREMENT COMMENT '主键',
`resettlement_code` int(11) NOT NULL COMMENT '安置合同code',
PRIMARY KEY (`code`) USING BTREE
) ENGINE=InnoDB AUTO_INCREMENT=2945 DEFAULT CHARSET=utf8 ROW_FORMAT=DYNAMIC COMMENT='明细表'
'''
# tbname, tb_comment, columns, comments, types = ddl_analyse(ddl)
hive_ddl, add_sql,collect_mysql,ods_hive_query = mysql_to_hsql(ddl)
print(hive_ddl)
print("=" * 100)
print("=" * 100)
print(add_sql)
print("=" * 100)
print("=" * 100)
print(collect_mysql)
print("=" * 100)
print("=" * 100)
print(ods_hive_query)