之前文章写到用sparksql实现hive表小文件合并,这里使用一下用shell调用hivesql来合并hive表文件,思路是一样的
#!/bin/bash
source /etc/profile
###################################################################################
#purpose:hive 表批量合并小文件
#author:
#date:2018/08/11
#date: 2019/03/08 由原来循环串行跑更新为通过并行掉函数,并行跑
###################################################################################
runMerge()
{
tbs=$1
pcdate=`date -d"1 day ago" +"%Y%m%d"`
echo ">>>>>>>>>>$tbs 合并分区日期"\'$pcdate\'
for tb in $tbs
do
tmp_tb=$tb'_tmp'
#构建字段列表
a1=`hive -e "desc $tb"`
echo $a1
a=`hive -e "desc $tb"|awk 'NR==1,NR==$(NF-0){print $1}'`
echo "$tb 字段列表:$a"
b=`echo $a|sed 's/day # # day/ /g'`
c1=`echo $b|sed 's/ /,/g'`
c=`echo $c1|sed 's/timestamp/\`timestamp\`/g'`
for i in \'$pcdate\'
do
echo "开始合并$tb....."
echo "select $c from $tb where day = ${i}"
hive -e "
set hive.merge.mapfiles = true;
set hive.merge.mapredfiles = true;
set hive.merge.tezfiles=true;
set hive.merge.size.per.task = 256000000;
set hive.merge.smallfiles.avgsize=16000000 ;
-- 将小文件数据先放入一个临时分区
insert overwrite table $tb partition (day = '19700101')
select $c from $tb where day = ${i};
-- 将原来的小文件分区删掉
alter table $tb drop if exists partition(day = ${i});
-- 将临时分区,rename到正式大文件分区
alter table $tb partition (day = '19700101') rename to partition(day = ${i});
-- 将临时分区删掉
alter table $tb drop if exists partition(day = '19700101');
"
if [ $? != 0 ];then
currenttime=`date +'%Y-%m-%d-%H:%M:%S'`
msg="[$currenttime]$tb小文件合并出问题"
sh /opt/edw/bin/oozie-job-dingtalk-warn.sh $msg
break
fi
echo $tb"小文件合并完成..."
#将表名替换成表路径
tbfilepath=${tb//'.'/'.db/'}
fileday=${i//"'"/""}
#删除合并过程中产生中间文件夹
hadoop fs -test -e /user/hive/warehouse/$tbfilepath/day=$fileday/.hive-staging_hive_*
if [ $? == 0 ]
then hadoop fs -rm -r /user/hive/warehouse/$tbfilepath/day=$fileday/.hive-staging_hive_*
fi
done
done
}
#并行跑
starttime=`date +'%Y-%m-%d %H:%M:%S'`
runMerge 'databasename1.tablename1' &
runMerge 'databasename2.tablename2' &
runMerge 'databasename3.tablename3'
wait
endtime=`date +'%Y-%m-%d %H:%M:%S'`
start_seconds=$(date --date="$starttime" +%s)
end_seconds=$(date --date="$endtime" +%s)
echo "$starttime $endtime $sqlscript本次运行时间: "$((end_seconds-start_seconds))"s"