Flink、Spark Streaming 等实时任务在运行过程中难免会因为各种各样的原因挂掉。下面的代码在 YARN 平台上使用 shell 脚本监控任务:如果任务挂掉,就发送告警并尝试重启任务。
#!/bin/bash
#==================================================
#purpose: flink&spark任务运行监控,每一分钟检查一次本地进程和yarn任务进程
#author:
#date:2019/10/12
#==================================================
#param1 $1:grep jar 进程:flink-ops-data-warn-1.0-SNAPSHOT-jar-with-dependencies.jar
#param2 $2:grep yarn 任务进程flink-api-err-cnt
#param3 $3:重启的shell脚本 flink-api-error.sh
# Print the number of processes whose `ps -ef` line contains the literal
# string $1.
funCountProc(){
  # -F: match the name literally (a "." must not act as a regex wildcard).
  # -v -c grep: count the lines that remain after dropping the grep commands
  # themselves, which also carry the search string on their command line.
  ps -ef | grep -F -- "$1" | grep -v -c grep
}

# Print the ID (first column) of every `yarn application -list` row that
# contains the literal string $1, one ID per line.
funYarnAppIds(){
  # The ^application_ filter keeps header or log lines that happen to contain
  # the name from being treated as an application ID.
  yarn application -list | grep -F -- "$1" | awk '$1 ~ /^application_/ {print $1}'
}

#######################################
# Check one streaming job and try to restart it when its process is gone.
# Arguments:
#   $1 - string identifying the job's process in `ps -ef` (e.g. the jar name)
#   $2 - string identifying the job in `yarn application -list`
#   $3 - name of the restart script, resolved under /opt/ops/bin
# Outputs:
#   Timestamped status lines on stdout; alerts are also passed to
#   /opt/ops/bin/wechat-warn.sh.
# Returns:
#   2 when an argument is missing or empty; otherwise the status of the last
#   command that ran.
#######################################
funCheck(){
  local processdt count msg yarnappid appid

  # An empty pattern would match every process and every YARN row, so refuse
  # to act on it instead of killing an unrelated application.
  if [ -z "$1" ] || [ -z "$2" ] || [ -z "$3" ]; then
    echo "usage: funCheck <process-pattern> <yarn-app-name> <restart-script>" >&2
    return 2
  fi

  processdt=$(date "+%Y-%m-%d %H:%M:%S")
  # Look for a process running the job's jar.
  count=$(funCountProc "$1")

  if [ "$count" -eq 1 ]; then
    echo "$processdt $2 is running"
  elif [ "$count" -eq 0 ]; then
    msg="$2挂了,正在尝试重启..."
    echo "$processdt $msg"
    # Send the alert to WeChat Work.
    sh /opt/ops/bin/wechat-warn.sh "$processdt $msg"
    # The process is gone but the job may still be listed in YARN: kill every
    # matching application, one ID per call. Nothing is killed when the list
    # has no match.
    yarnappid=$(funYarnAppIds "$2")
    # Word splitting is intended here: one application ID per line.
    # shellcheck disable=SC2086
    for appid in $yarnappid; do
      yarn application -kill "$appid"
    done
    # Launch the restart script in the background.
    nohup sh "/opt/ops/bin/$3" &
    # A restart takes time; wait two minutes before checking the result.
    sleep 2m
    processdt=$(date "+%Y-%m-%d %H:%M:%S")
    # Verify that the job's process is back.
    count=$(funCountProc "$1")
    if [ "$count" -eq 1 ]; then
      echo "$processdt $2 已经重启 "
      sh /opt/ops/bin/wechat-warn.sh "$processdt $2 已经重启"
    else
      msg="$2 重启失败,..."
      echo "$processdt $msg"
      sh /opt/ops/bin/wechat-warn.sh "$processdt $msg"
    fi
  else
    # More than one process matched: kill one YARN application (the first one
    # listed). In theory there should never be more than two.
    msg="$2有多个yarn进程在运行,删掉其中一个"
    echo "$processdt $msg"
    sh /opt/ops/bin/wechat-warn.sh "$processdt $msg"
    yarnappid=$(funYarnAppIds "$2" | head -n 1)
    if [ -n "$yarnappid" ]; then
      yarn application -kill "$yarnappid"
    fi
  fi
}
# Supervision loop: run one check pass over the monitored jobs, pause for a
# minute, and repeat forever.
while :; do
  funCheck \
    flink-ops-data-warn-1.0-SNAPSHOT-jar-with-dependencies.jar \
    flink-api-err-cnt \
    flink-api-error-start.sh
  # Add further monitored jobs here, one funCheck call per job.
  # Wait one minute before the next pass.
  sleep 1m
done