自己写了一个flume小文件合并脚本,解决hdfs小文件过多问题

flume数据采集之后,本打算按照五分钟滚动一个文件的,但是由于实时性要求提高,改为一分钟一个文件。但是开启了三个flume实例,导致hdfs小文件过多。

1.影响NameNode,

2.影响task数量


决定写一个flume的合并脚本,写着写着,功能越来越多,越来越完善。

合并方案我写过两个:

1.使用spark进行读取合并,优点是scala代码书写,高级语言编程,易于理解维护,但是不方便,每次要打包。

2.使用shell脚本进行getmerge,然后split成128MB大小的文件。

前面一种方式很简单,这里就不列出如何实现了。


#后面代码

shell合并脚本思路:

1. getmerge into tmpfile 

2. split into block file(128MB)

3.put to hdfs

4. check file

5. rm hdfs small file

6. rm local tmpfile 



这段shell,基本上通过变量传值,有一定的shell编程经验才能看懂,通用性蛮强的,具体的有空贴出来,不过使用的时候还是要先做好测试。

#! /bin/sh
# vim /data/executeTask/file_compact/compact_flumedata.sh
# ref: sendmessage.sh
#       1. PROGRAMID    : add id and program desc into programelist.log
#       2. errortimes   : error times
#       3. errorcode    : error code
#       4. DesNo1       : destination phone number

#################### check result function #################### 
# Report a processing step's result: log it, and on failure send an SMS alert
# and abort the whole script.
# Arguments:
#   $1 - project     : project name (used in log lines)
#   $2 - errorCode   : exit status of the step just executed
#   $3 - processname : short label of the step (e.g. getmerge, chown)
#   $4 - yesterday   : data date being processed (YYYYMMDD)
#   $5 - logfile     : path of the log file to append to
#   $6 - table_name  : HDFS table/directory being compacted
# Side effects:
#   Appends one line to $logfile. If errorCode != 0, also calls
#   sendmessage.sh and exits the script with that code.
checksuccess(){
	# locals so this helper no longer clobbers the caller's variables
	local project=$1
	local projectId=10
	local errorCode=$2
	local processname=$3
	local yesterday=$4
	local logfile=$5
	local table_name=$6
	local phoneNumber=15202125865
	echo " project=$project errorCode=$errorCode processname=$processname yesterday=$yesterday logfile=$logfile table_name=$table_name"

	if [ "$errorCode" -ne 0 ]; then
		echo "`date +%F\ %T` ${project} errorCode=${errorCode} error process $table_name ${processname} of ${yesterday} data " >> "${logfile}"
		/data/executeTask/file_compact/sendmessage.sh "$projectId" "$yesterday" "$errorCode" "$phoneNumber"
		exit "${errorCode}"
	else
		echo "`date +%F\ %T` ${project} errorCode=${errorCode} success process $table_name ${processname} of ${yesterday} data " >> "${logfile}"
	fi
}
#################### check result function end #################### 

#################### processCompact function start #################### 
# Compact one day's worth of Flume small files under /data/<table>/<date>/ on HDFS:
# merge to a local temp file, split into ~128MB chunks, upload the chunks,
# sanity-check the total size, then delete the original small files.
# Arguments:
#   $1 - table_name   : HDFS directory name under /data/
#   $2 - date         : day partition to compact (YYYYMMDD)
#   $3 - compact_file : local merged-file name (e.g. compact.<epoch seconds>)
#   $4 - logfile      : log file path
#   $5 - project      : project name for logging/alerting
# Note: aborts the whole script (via checksuccess) if any critical step fails.
processCompact(){
	table_name=$1
	date=$2
	compact_file=$3
	logfile=$4
	project=$5

	# 0. remove leftover partial uploads (compact.*._COPYING_) left behind by a
	# previously failed copy; the glob is quoted so HDFS expands it, not the shell
	hdfs dfs -rm "/data/$table_name/$date/compact*_COPYING_"

	# NOTE(review): -du -s -h returns a human-readable number, so its unit varies
	# (K/M/G/T); only G and (assumed) M are handled below — confirm for your data.
	begainsize=`hdfs dfs -du -s -h "/data/$table_name/$date/" | awk '{ print $1}' `

	# 1. merge every Flume part file into one local file
	hdfs dfs -getmerge "/data/$table_name/$date/*FlumeData*" "/data/executeTask/file_compact/$compact_file"
	errorCode=$?
	processname=getmerge
	checksuccess "$project" "$errorCode" "$processname" "$date" "$logfile" "$table_name"

	# 2. compute the number of ~128MB split chunks from the directory size
	sizeunit=`hdfs dfs -du -s -h "/data/$table_name/$date/" | awk '{ print $2}' `
	if [ "$sizeunit" = "G" ];then
		# GB -> 128MB chunks: size * 1024 / 128 == size * 8
		res=$(printf "%.0f" `echo "scale=5;$begainsize*8 "|bc`)
	else
		# otherwise assume the unit is MB
		res=$(printf "%.0f" `echo "scale=5;$begainsize/128 "|bc`)
	fi
	# BUGFIX: for directories smaller than ~64MB the rounded chunk count is 0 and
	# "split -n l/0" fails; always produce at least one chunk.
	if [ -z "$res" ] || [ "$res" -lt 1 ]; then
		res=1
	fi

	cd /data/executeTask/file_compact/
	# split into $res line-aligned chunks with 3-digit numeric suffixes
	compact_file_name=${compact_file}"_"
	echo "compact_file_name :$compact_file_name"
	split -n "l/$res" "/data/executeTask/file_compact/$compact_file" -d -a 3 "/data/executeTask/file_compact/${compact_file_name}"

	# 3. upload the chunks back into the day directory
	hdfs dfs -copyFromLocal /data/executeTask/file_compact/"$compact_file_name"* "/data/$table_name/$date/"
	errorCode=$?
	processname=copyFromLocal
	checksuccess "$project" "$errorCode" "$processname" "$date" "$logfile" "$table_name"

	# 4. hand ownership back to the flume user (optional step)
	hdfs dfs -chown flume "/data/$table_name/$date/$compact_file_name*"
	errorCode=$?
	processname=chown
	checksuccess "$project" "$errorCode" "$processname" "$date" "$logfile" "$table_name"

	# 5. size sanity check: total size of uploaded compact files (summed in MB-ish
	# units by awk) vs the directory size measured before the merge
	endsize=`hdfs dfs -du -s -h "/data/$table_name/$date/compact*" |  awk '{sum+=$1/1024};END{print sum} ' `
	echo "$begainsize"  >> "$logfile"
	echo "$endsize" >> "$logfile"

	# BUGFIX: guard against divide-by-zero / empty result when no compact files
	# were found — alert and abort instead of crashing bc.
	if [ -z "$endsize" ] || [ "`echo "$endsize == 0" | bc`" -eq 1 ]; then
		processname=sizerate
		checksuccess "$project" 1 "$processname" "$date" "$logfile" "$table_name"
	fi

	percent=`echo "scale=2;$begainsize/$endsize" | bc`
	upper=1.2
	lower=0.8
	c1=$(echo "$percent < $upper" | bc)
	c2=$(echo "$percent > $lower" | bc)

	if [ "$c1" -eq 1 ] && [ "$c2" -eq 1 ];then
		echo " accessable rate"
	else
		echo " warn rate"
		# size ratio outside [0.8, 1.2] — treat as a failure so the originals survive
		processname=sizerate
		checksuccess "$project" 1 "$processname" "$date" "$logfile" "$table_name"
	fi

	# 6. only after the check passed: remove the original Flume small files
	hdfs dfs -rm "/data/$table_name/$date/*FlumeData*"
	errorCode=$?
	processname=rmFlumeData
	checksuccess "$project" "$errorCode" "$processname" "$date" "$logfile" "$table_name"

	# 7. remove the local merged file and its split chunks
	rm -rf /data/executeTask/file_compact/"${compact_file:?}"*
	errorCode=$?
	processname=rmLocalFile
	checksuccess "$project" "$errorCode" "$processname" "$date" "$logfile" "$table_name"
}

#################### processCompact function END #################### 

#################### main start #################### 
#################### config begin #################### 
project=obdAndipda

# first day (inclusive) to process, YYYYMMDD
date=20170216

logfile=/data/job/log/compact_flumedata.log
# last day (inclusive) to process; normally yesterday
end=`date -d' -1 day' +"%Y%m%d"`
# NOTE(review): debug leftover — this overrides the computed end date so only a
# single fixed day is processed; remove it to process up to yesterday.
end=20170216

echo "start: ${date} end: $end"
#################### config end ####################
while(( $date <= $end ))
do
	# unique local temp-file name per processed day (epoch seconds)
	dealtime=`date +%s`
	compact_file="compact.${dealtime}"
	echo "deal date:${date}"
	# TODO(review): placeholder — replace with the real Flume table/directory name
	table_name="table_name"
	echo "/data/$table_name/$date/"
	# processCompact takes exactly 5 arguments (the original passed a redundant 6th)
	processCompact "$table_name" "$date" "$compact_file" "$logfile" "$project"

	# advance one day: GNU date parses "-1 day ago" as +1 day relative to ${date}
	date=`date -d"-1 day ago ${date}" +%Y%m%d`
done
echo "complete"
exit 0
#################### main end ####################
#################### main end #################### 



  • 1
    点赞
  • 13
    收藏
    觉得还不错? 一键收藏
  • 打赏
    打赏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包

打赏作者

mtj66

看心情

¥1 ¥2 ¥4 ¥6 ¥10 ¥20
扫码支付:¥1
获取中
扫码支付

您的余额不足,请更换扫码支付或充值

打赏作者

实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值