def hive2mysql(hivedatabase, hiveTableName, mysqldatabase, mysqlTableName, updateDaysSql):
    """Incrementally sync updated partitions of a Hive table into MySQL via DataX.

    Steps:
      1. Run updateDaysSql to find which partition days (ppi) were updated.
      2. Create a Hive temp table ``<hiveTableName>_dtemp`` holding only the
         columns that also exist in the target MySQL table, restricted to the
         updated partitions.
      3. Invoke DataX (datax2mysqlGeneral.json), whose preSql deletes those
         days from MySQL before re-inserting them from the temp table.
      4. Drop the temp table.

    :param hivedatabase: source Hive database name
    :param hiveTableName: source Hive table name
    :param mysqldatabase: target MySQL database name
    :param mysqlTableName: target MySQL table name
    :param updateDaysSql: SQL whose result rows are the updated partition days
    :return: the string 'success' on completion; exits the process with -1 on
        any failure (column mismatch, temp-table creation, DataX run, drop).

    NOTE(review): relies on module-level globals defined elsewhere in this
    file (logPath, PythonMode, host/username/password/databases/databases2 for
    Hive, host1/username1/password1/databases1 for MySQL) — confirm they are
    in scope before moving this function.
    """
    logDir = "%s/%s" % (logPath, hiveTableName)
    if not os.path.isdir(logDir):
        os.mkdir(logDir)
    logFile = "%s/%s" % (logDir, hiveTableName)
    log = PythonMode.getTimeRotatingLogger(hiveTableName, logFile, logging.DEBUG)
    hiveTmptableName = hiveTableName + '_dtemp'

    # Query the partitions (ppi days) that were updated.
    updateDays = PythonMode.hiveQuery(host, username, password, databases, updateDaysSql)

    # Build two renderings of the updated days:
    #   updateDaysStr             -> " (d1,d2,...) " for the Hive IN clause
    #   updateDaysStrForMysqlDelete -> "\'d1\',\'d2\'" for DataX's preSql delete
    # NOTE(review): the [2:][:-2] / [3:][:-3] slicing assumes each row's repr
    # looks like (u'20210101',) — i.e. Python 2 unicode 1-tuples from
    # PythonMode.hiveQuery. Confirm before changing.
    updateDaysStr = ' ('
    updateDaysStrForMysqlDelete = ''
    count = 0
    for day in updateDays:
        count += 1
        updateDaysStr = updateDaysStr + str(day)[2:][:-2] + ','
        updateDaysStrForMysqlDelete = updateDaysStrForMysqlDelete + '\\\'' + str(day)[3:][:-3] + '\\\'' + ','
    updateDaysStr = updateDaysStr[:-1] + ') '
    print("updateDaysStr=" + updateDaysStr)
    log.info("updateDaysStr=" + updateDaysStr)
    print('共更新' + str(count) + '天的数据')
    log.info('共更新' + str(count) + '天的数据')

    # Fetch every column name of the target MySQL table.
    mysqlColumnsSql = "select COLUMN_NAME from information_schema.COLUMNS where table_name = \'%s\'" % (mysqlTableName)
    mysqlColumns = PythonMode.mysqlConnect2(host1, username1, password1, databases1, mysqlColumnsSql)

    # Join them into the double-quoted, comma-separated column list used in
    # the DataX json (e.g. "col_a","col_b").
    mysqlColumnsStr = ''
    for column in mysqlColumns:
        mysqlColumnsStr += '"' + str(column)[3:][:-3].lower() + '",'
    mysqlColumnsStr = mysqlColumnsStr[:-1]
    print("mysqlColumnsStr=" + mysqlColumnsStr)
    log.info("mysqlColumnsStr=" + mysqlColumnsStr)

    # Fetch every column of the source Hive table.
    hiveColumnsSql = "SHOW COLUMNS IN %s.%s" % (hivedatabase, hiveTableName)
    hiveColumns = PythonMode.hiveQuery(host, username, password, databases2, hiveColumnsSql)

    # Keep only the Hive columns that also exist in MySQL, preserving the
    # MySQL column order so the DataX writer column list lines up.
    selectColumns = ''
    mysqlColumnsList = mysqlColumnsStr.split(",")
    for column in mysqlColumnsList:
        for hiveColumn in hiveColumns:
            if str(hiveColumn)[3:][:-3] == column[1:][:-1]:
                selectColumns += column[1:][:-1] + ', '
    selectColumns = selectColumns[:-2]
    if len(mysqlColumnsStr.split(',')) != len(selectColumns.split(',')):
        # Column sets differ between Hive and MySQL — abort rather than load
        # misaligned data. (FIX: was logged at info level; this is an error.)
        print('hive临时表中字段与目标mysql表中字段不一致,请检查后手动输入!!hive临时表字段为' + selectColumns + ' 目标mysql表字段为' + mysqlColumnsStr)
        log.error('hive临时表中字段与目标mysql表中字段不一致,请检查后手动输入!!hive临时表字段为' + selectColumns + ' 目标mysql表字段为' + mysqlColumnsStr)
        exit(-1)

    # Create the temp table with only the needed columns, loading just the
    # updated partitions.
    createTableSql = ''' hive -e "drop table if exists %s.%s;
    create table %s.%s as select %s from %s.%s where ppi in %s;"
    ''' % (hivedatabase, hiveTmptableName, hivedatabase, hiveTmptableName, selectColumns, hivedatabase, hiveTableName, updateDaysStr)
    print('createTableSql=' + createTableSql)
    log.info('createTableSql=' + createTableSql)
    createResult = PythonMode.execShell(createTableSql)
    if createResult[0] is None:
        log.error("createResult_Failure!!! 错误信息:%s" % (createResult[1]))
        exit(-1)
    else:
        log.info("createResult_Success!!!")
        print("createResult_Success!!!")

    # The delete of stale MySQL rows is handled by the preSql inside the DataX
    # json ("delete from ${mysqlTableName} where ppi in (${updateDays})"), not
    # by a direct mysql call here.
    print('用datax工具从hive的%s导入数据到mysql中的%s' % (hiveTmptableName, mysqlTableName))
    log.info('用datax工具从hive的%s导入数据到mysql中的%s' % (hiveTmptableName, mysqlTableName))

    # Strip the outermost quotes and escape the inner ones so the column list
    # survives the shell -> datax -D parameter round trip.
    mysqlColumnsStr = mysqlColumnsStr[1:][:-1]
    mysqlColumnsStr = mysqlColumnsStr.replace('"', '\\\\\\"')
    updateDaysStrForMysqlDelete = updateDaysStrForMysqlDelete[:-1]
    print('updateDaysStrForMysqlDelete==' + updateDaysStrForMysqlDelete)

    # FIX: -DmysqlColumns was accidentally passed twice; the json template
    # only references ${mysqlColumns} once.
    datax = '/alidata/datax/bin/datax.py --jvm="-Xms2G -Xmx2G" /opt/jar/azkaban/config/datax2mysqlGeneral.json -p\"-Ddatabase=%s -DhiveTmpTableName=%s -Dmysqldatabase=%s -DmysqlTableName=%s -DmysqlColumns=%s -DupdateDays=%s \"' % (hivedatabase, hiveTmptableName, mysqldatabase, mysqlTableName, mysqlColumnsStr, updateDaysStrForMysqlDelete)
    print('dataxshell=' + datax)
    dataxResult = PythonMode.execShell(datax)
    if dataxResult[0] is None:
        log.error("dataxResult_Failure!!! 错误信息:%s" % (dataxResult[1]))
        exit(-1)
    else:
        log.info("dataxResult_Success!!!")
        print("dataxResult_Success!!!")

    # Drop the temp table.
    dropTableSql = ''' hive -e "drop table if exists %s.%s;"''' % (hivedatabase, hiveTmptableName)
    dropTableResult = PythonMode.execShell(dropTableSql)
    if dropTableResult[0] is None:
        # FIX: was logging createResult[1] here — wrong result tuple.
        log.error("dropTableResult_Failure!!! 错误信息:%s" % (dropTableResult[1]))
        exit(-1)
    else:
        log.info("dropTableResult_Success!!!")
        print("dropTableResult_Success_Success!!!")
    return 'success'
对应的 DataX JSON 配置模板 (datax2mysqlGeneral.json):
{
"job": {
    "setting": {
        "speed": {
            "channel": 8
        },
        "errorLimit": {
            "record": 0,
            "percentage": 0.02
        }
    },
    "content": [
        {
            "reader": {
                "name": "hdfsreader",
                "parameter": {
                    "path": "hdfs://nameservice1//user/hive/warehouse/${database}.db/${hiveTmpTableName}/",
                    "hadoopConfig": {
                        "dfs.nameservices": "nameservice1",
                        "dfs.ha.namenodes.nameservice1": "namenode1,namenode2",
                        "dfs.namenode.rpc-address.nameservice1.namenode1": "##:8020",
                        "dfs.namenode.rpc-address.nameservice1.namenode2": "##:8020",
                        "dfs.client.failover.proxy.provider.nameservice1": "org.apache.hadoop.hdfs.server.namenode.ha.ConfiguredFailoverProxyProvider"
                    },
                    "defaultFS": "hdfs://nameservice1",
                    "column": ["*"],
                    "fileType": "text",
                    "encoding": "UTF-8",
                    "fieldDelimiter": "\u0001",
                    "nullFormat": "\\N"
                }
            },
},
"writer": {
"name": "mysqlwriter",
"parameter": {
"writeMode": "insert",
"username": "##",
"password": "##",
"batchSize": 2048,
"column": ["${mysqlColumns}"],
"session": [
],
"preSql": [
"delete from ${mysqlTableName} where ppi in (${updateDays})"
],
"connection": [
{
"jdbcUrl": "jdbc:mysql://##/${mysqldatabase}?useUnicode=true&characterEncoding=utf8",
"table": [
"${mysqlTableName}"
]
}
]
}
}
}
]
}
}