使用shell调用hive sql实现hive表小文件合并

最新推荐文章于 2023-04-24 15:51:36 发布

weixin_42094092

最新推荐文章于 2023-04-24 15:51:36 发布

阅读量781

点赞数

分类专栏： linux shell 文章标签： hive shell

本文链接：https://blog.csdn.net/weixin_42094092/article/details/107519988

版权

linux shell 专栏收录该内容

2 篇文章 0 订阅

订阅专栏

之前文章写到用sparksql实现hive表小文件合并,这里使用一下用shell调用hivesql来合并hive表文件,思路是一样的

#!/bin/bash
source /etc/profile
###################################################################################
#purpose:hive 表批量合并小文件
#author：
#date：2018/08/11
#date: 2019/03/08 由原来循环串行跑更新为通过并行掉函数，并行跑
###################################################################################
runMerge()
{
tbs=$1
pcdate=`date -d"1 day ago" +"%Y%m%d"`
echo ">>>>>>>>>>$tbs 合并分区日期"\'$pcdate\'

for tb in $tbs
do
    tmp_tb=$tb'_tmp'
    #构建字段列表
    a1=`hive -e "desc $tb"`    
    echo $a1
    a=`hive -e "desc $tb"|awk 'NR==1,NR==$(NF-0){print $1}'`
    echo "$tb 字段列表：$a"
    b=`echo $a|sed 's/day # # day/ /g'`
    c1=`echo $b|sed 's/ /,/g'`
    c=`echo $c1|sed 's/timestamp/\`timestamp\`/g'`	
    for i in \'$pcdate\'
    do
	echo "开始合并$tb....."
        echo "select $c from $tb where day = ${i}"
        hive -e "
        set hive.merge.mapfiles = true;
        set hive.merge.mapredfiles = true;
        set hive.merge.tezfiles=true;
        set hive.merge.size.per.task = 256000000;
        set hive.merge.smallfiles.avgsize=16000000 ;
        -- 将小文件数据先放入一个临时分区
        insert overwrite table $tb partition (day = '19700101')
        select $c from $tb where day = ${i};
        -- 将原来的小文件分区删掉
        alter table $tb drop if exists partition(day = ${i});
        -- 将临时分区，rename到正式大文件分区
        alter table $tb partition (day = '19700101') rename to partition(day = ${i});
        -- 将临时分区删掉
        alter table $tb drop if exists partition(day = '19700101');
        "
        if [ $? != 0 ];then
           currenttime=`date +'%Y-%m-%d-%H:%M:%S'`
	   msg="[$currenttime]$tb小文件合并出问题"
           sh /opt/edw/bin/oozie-job-dingtalk-warn.sh $msg
	   break
        fi
	echo $tb"小文件合并完成..."
        #将表名替换成表路径
        tbfilepath=${tb//'.'/'.db/'}
        fileday=${i//"'"/""}
        #删除合并过程中产生中间文件夹
        hadoop fs -test -e /user/hive/warehouse/$tbfilepath/day=$fileday/.hive-staging_hive_*
        if [ $? == 0 ] 
	  then  hadoop fs -rm -r /user/hive/warehouse/$tbfilepath/day=$fileday/.hive-staging_hive_*
   	fi
   done
done
}
#并行跑
starttime=`date +'%Y-%m-%d %H:%M:%S'`
runMerge 'databasename1.tablename1' &
runMerge 'databasename2.tablename2' &
runMerge 'databasename3.tablename3' 
wait
endtime=`date +'%Y-%m-%d %H:%M:%S'`

start_seconds=$(date --date="$starttime" +%s)
end_seconds=$(date --date="$endtime" +%s)
echo "$starttime $endtime $sqlscript本次运行时间： "$((end_seconds-start_seconds))"s"