记录一次hive大表脱敏和改造成parquet存储动态分区的操作

#!/bin/bash

tablename=frontlog
startDate=2018-01-01
#endDate=`date -d "0 day ago" +%Y-%m-%d`
DATE=`date +%Y-%m-%d`
endDate=2018-08-30

#给新表增加分区

while [[ $startDate<=$endDate ]]
do
    echo "startDate:$startDate">>result_dy.log
    hadoop fs -ls /user/hive/warehouse/data_market_tuomin.db/${tablename}/pt_date=${startDate}
    if [ $? -ne 0 ]
    then
          TIME=`date +%H:%M:%S`
          echo "$DATE $TIME the partition pt_date=${startDate} doesn't exists!!">>result_dy.log
    else
         impala-shell --quiet -B -q "create function if not exists params_remove(string) returns string location 'hdfs://dev1.hadoop.feidai.com:8020/user/hive/udf/bigdata.parsejson.jar' symbol='ParseUDF2';"
         echo "impala-shell --quiet -B -q \"insert into table data_market_tuomin.frontlog_hxx PARTITION(part_dt) select dt,vl,nt,tm,amc,re,nw,de,id,dd,fm,dip,_id,mb,bm,ve,createdate,offsetx,sy,fn,createdatestr,dc,sid,ev,dpl,dmb,imei,case when fn in('userRegister','userLogin','findPwdByMobStepTwo','checkLogin','changeMobile','updatePassWord','updateTradeCode','findTradeCodeStepOne','findTradeCodeStepTwo','checkLoginPassWord','checkTradePassWord','submitTradeCodeCash','setTradeCode','setTradePwdSendSms','setTradeCodeStepOne','cashSendSms') and ev='3' then params_remove(sd) else sd end as sd,asx,did,loc,pid,cu,ip,wbn,rs,rt,pr,sp,spid,se,seid,substring(cast(hours_add(from_unixtime(cast(substring(createdate,1,10) as bigint),'yyyy-MM-dd HH:mm:ss'),8) as string),1,7) as part_dt from data_market_tuomin.frontlog where  pt_date='$startDate';\"">>result_dy.log
        TIME=`date +%H:%M:%S`
        echo "$DATE $TIME benging... !!">>result_dy.log
        impala-shell --quiet -B -q "insert into table data_market_tuomin.frontlog_hxx PARTITION(part_dt) \
select dt,vl,nt,tm,amc,re,nw,de,id,dd,fm,dip,_id,mb,bm,ve,createdate,offsetx,sy,fn,createdatestr,dc,sid,ev,dpl,dmb,imei, \
case when fn in('userRegister','userLogin','findPwdByMobStepTwo','checkLogin','changeMobile','updatePassWord','updateTradeCode','findTradeCodeStepOne','findTradeCodeStepTwo','checkLoginPassWord','checkTradePassWord','submitTradeCodeCash','setTradeCode','setTradePwdSendSms','setTradeCodeStepOne','cashSendSms') and ev='3' then params_remove(sd) else sd end as sd,asx,did,loc,pid,cu,ip,wbn,rs,rt,pr,sp,spid,se,seid, \
substring(cast(hours_add(from_unixtime(cast(substring(createdate,1,10) as bigint),'yyyy-MM-dd HH:mm:ss'),8) as string),1,7) as part_dt \
from data_market_tuomin.frontlog \
where  pt_date='$startDate';"
         if [ $? -eq 0 ]
         then
             TIME=`date +%H:%M:%S`
             echo "$DATE $TIME $startDate insert into success!!">>result_dy.log
         else
             TIME=`date +%H:%M:%S`
             echo "$DATE $TIME $startDate insert into failure!!">>result_dy.log
         fi
     fi
     echo "">>result_dy.log
     impala-shell -q "refresh data_market_tuomin.frontlog_hxx;"
     startDate=$(date -d "+1 day $startDate" +%Y-%m-%d)
done

现有一个12TB数据量的表,存储格式为testfile,因为之前的设计问题,分区的区间数据里面不止有这个分区段的数据。

使用impala查询的时候已经查不动了

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值