1,背景简介
最近新入职了一家公司,负责数据仓库的ODS开发,上游数据源主要为mysql、oracle、sqlserver等数据库。获取ddl后,需要在hive上建表,再将数据从上游系统导入到ods。手工编写建表语句费时费力,于是想用excel模板批量生成建表语句,说干就干。
2,excel模版配置
excel模版如下(sheet页命名:Data_Dic,第1行为表头,数据从第2行开始):A列为表名,B列为表中文名,C列为字段名,D列为字段类型,E列为字段中文名。
3,python代码块
import openpyxl
import os,re
# Ordered (prefix pattern, Hive type) rules; the first pattern that matches at
# the start of the normalised source type wins.
_HIVE_TYPE_RULES = (
    # Character / LOB / binary families, including the n-, var-, tiny-,
    # medium- and long- prefixed variants (nvarchar, varbinary, longtext, ...).
    (r'(?:n|var|tiny|medium|long)*(?:char|text|blob|clob|binary)', 'string'),
    (r'timestamp|date|smalldatetime', 'timestamp'),
    # Drop any display width such as bigint(20), which Hive does not accept.
    (r'bigint', 'bigint'),
    (r'(?:tiny|small|medium)?int', 'int'),
    (r'double|float', 'double'),
)


def to_hive_type(type):
    """Map a source-database column type to the Hive type used in the DDL.

    The input is trimmed and lower-cased before matching, so ``VARCHAR(20)``
    and ``varchar(20)`` are treated alike.

    Args:
        type: Column type text as written in the source DDL, e.g.
            ``'varchar(20)'``, ``'NUMBER(16,4)'`` or ``'datetime'``.

    Returns:
        The Hive type name. ``number(p,s)``/``numeric(p,s)`` become
        ``decimal(p,s)``; a bare ``number``/``numeric`` becomes
        ``decimal(32,16)``; types matching no rule are returned normalised
        (trimmed, lower-cased) but otherwise unchanged.

    Raises:
        TypeError: If ``type`` is not a string (for example an empty cell).
    """
    if not isinstance(type, str):
        raise TypeError(f'column type must be a string, got {type!r}')

    hive_type = type.strip().lower()
    # number(p,s) / numeric(p,s) keep their precision and scale as decimal(p,s).
    hive_type = re.sub(r'(?:number|numeric)[(]', 'decimal(', hive_type)
    if hive_type in ('number', 'numeric'):
        # No precision given in the source: fall back to a wide decimal.
        return 'decimal(32,16)'

    for pattern, target in _HIVE_TYPE_RULES:
        if re.match(pattern, hive_type):
            return target
    return hive_type
def _quote_comment(text):
    """Render *text* as a single-quoted string literal for a COMMENT clause."""
    if text is None:
        # An empty cell becomes an empty comment instead of the text 'None'.
        return "''"
    escaped = str(text).replace('\\', '\\\\').replace("'", "\\'")
    return f"'{escaped}'"


def _build_ddl_fragments(datebase, rows):
    """Return one DDL text fragment per (table, table_cname, col, type, col_cname) row."""
    fragments = []
    last_index = len(rows) - 1
    for index, (raw_table, table_cname, col_name, raw_type, col_cname) in enumerate(rows):
        prev_table = rows[index - 1][0] if index > 0 else None
        next_table = rows[index + 1][0] if index < last_index else None
        table_name = f'{datebase}.{str(raw_table).lower()}'
        col_def = f'{col_name} {to_hive_type(raw_type)} comment {_quote_comment(col_cname)}'

        parts = []
        if raw_table != prev_table:
            # First column of a new table: open the statement.
            parts.append(
                f'\nDROP TABLE IF EXISTS {table_name}; \nCREATE TABLE {table_name}\n(\n'
            )
        if raw_table != next_table:
            # Last column of the table: close the column list and add the table clauses.
            # NOTE(review): Hive's CREATE TABLE grammar spells the partition
            # clause "PARTITIONED BY (col type)"; the text below is kept exactly
            # as the script has always emitted it - confirm before running it.
            parts.append(f'{col_def} \n )\n')
            parts.append(
                f"comment {_quote_comment(table_cname)} \r\n partition(dt)\r\n stored as parquet; \r\n"
            )
        else:
            parts.append(f'{col_def} \n,')
        fragments.append(''.join(parts))
    return fragments


def excle_2_hiveddl(datebase, excel_file, sqlfile):
    """Generate Hive CREATE TABLE statements from the ``Data_Dic`` worksheet.

    Each data row (row 2 onwards) describes one column: A = table name,
    B = table comment, C = column name, D = source column type, E = column
    comment. Consecutive rows with the same table name form one table. Rows
    whose table-name cell is empty are skipped.

    Args:
        datebase: Hive database name used to qualify every table name.
        excel_file: Path of the workbook to read.
        sqlfile: Path of the SQL script to write; it is overwritten on every
            call and encoded as UTF-8.

    Each generated fragment is also printed to stdout.
    """
    wb = openpyxl.load_workbook(excel_file)
    ws = wb['Data_Dic']  # worksheet name is fixed by the template

    # Row 1 is the header. Blank rows can fall inside the sheet's used range,
    # so drop any row without a table name before grouping.
    rows = [
        row
        for row in ws.iter_rows(min_row=2, max_col=5, values_only=True)
        if row[0] not in (None, '')
    ]

    fragments = _build_ddl_fragments(datebase, rows)
    for fragment in fragments:
        print(fragment)

    # Write once instead of deleting the file and re-opening it per row.
    with open(sqlfile, 'w', encoding='utf-8') as sq:
        sq.writelines(fragments)
if __name__ == '__main__':
    # Example run: read table.xlsx from the working directory and write the
    # generated DDL next to it, qualifying every table with the 'odss' database.
    base_dir = '/Users/saber/Documents/python/excel/File'
    excle_2_hiveddl(
        'odss',
        f"{base_dir}/table.xlsx",
        f'{base_dir}/SQL_DDL.sql',
    )
4,生成建表语句预览
DROP TABLE IF EXISTS odss.cust;
CREATE TABLE odss.cust
(
id bigint comment '主键'
,name string comment '姓名'
,age int comment '年龄'
,adress string comment '住址'
,sex string comment '性别'
,m_aml decimal(16,4) comment '月收入'
)
comment '个人信息'
partition(dt)
stored as parquet;
DROP TABLE IF EXISTS odss.com;
CREATE TABLE odss.com
(
id bigint comment '主键'
,name string comment '姓名'
,scale decimal(19,4) comment '规模'
,adress string comment '地址'
)
comment '公司信息'
partition(dt)
stored as parquet;