这个脚本只是一个临时方案,是在全面的监控、报警系统完成之前,为了节省一定的人力而写的。
之前写过一个脚本,用来监控 Storm 的 Nimbus 和 Supervisor 进程,在检测到进程不存在时会进行重启。在实际使用中发现,该脚本虽然可以不断重启,但某些情况下并不能完成自动重启,人为的介入仍然是必要的。所以对脚本进行了修改,增加检测重启次数,如果检测到重启失败,则会通过邮件通知相关人员。遇到的困难主要在于获取 ssh 操作的返回值。思路比较简单,脚本内容如下:
#!/bin/bash
#
# Stop-gap watchdog for a Storm cluster.
#
# Runs forever. On every pass it:
#   1. restarts the local Nimbus and Storm UI daemons if `jps` no longer
#      lists them;
#   2. asks every slave host (over ssh) to restart its supervisor if it is
#      not running;
#   3. waits, then re-checks everything that had to be restarted and runs the
#      mail script for each daemon that is still down.
# A slave whose supervisor could not be restarted is dropped from the watch
# list after the alert, so the same failure is not reported on every pass.
#
# Requires on PATH: jps, storm, ssh (non-interactive login to every slave).

# Hosts whose supervisor is watched; entries are removed once alerted on.
slaves=(cdn36 cdn37 cdn39 cdn21 cdn22 cdn23)

# Substring that the remote probes print only when a supervisor is running.
readonly ALIVE_MARKER="alived"
readonly MAIL_SCRIPT=/data/www/mail/bin/start.sh
# Time given to restarted supervisors before they are probed again.
readonly RESTART_GRACE_SECONDS=40
readonly LOOP_PAUSE_SECONDS=20

# Remote command: report the supervisor pid, or restart the supervisor.
# `\$1` is escaped so that awk (not the remote shell) receives `$1`.
# The restarted daemon gets its own stdin/stdout/stderr so that it does not
# keep the ssh session's descriptors open after the remote shell exits.
readonly REMOTE_PROBE_AND_RESTART='source /etc/profile; source ~/.bash_profile
sid=$(jps | grep supervisor | awk "{print \$1}")
if [ "$sid" = "" ]; then
  echo "supervisor is dead trying to start supervisor!"
  # NOTE(review): nothing visible here uses ~/rzx; kept as found.
  mkdir -p ~/rzx
  # Delete the worker files before restarting.
  rm -fr /data/tmp/storm/worker
  nohup storm supervisor >supervisor.log 2>&1 </dev/null &
else
  echo " supervisor is alived,"${sid}
fi'

# Remote command: report only, never restart.
readonly REMOTE_PROBE_ONLY='source /etc/profile; source ~/.bash_profile
sid=$(jps | grep supervisor | awk "{print \$1}")
if [ "$sid" = "" ]; then
  echo "supervisor is still dead!"
else
  echo " supervisor is alived,"${sid}
fi'

#######################################
# Prints the pids of local JVMs whose `jps -l` line matches a pattern.
# Arguments: $1 - grep pattern matched against each `jps -l` line
# Outputs:   matching pids on stdout, one per line (nothing if none)
#######################################
jvm_pids() {
  jps -l | grep -- "$1" | awk '{print $1}'
}

#######################################
# Tells whether a remote probe report lacks the alive marker.
# An empty report (for example when ssh itself failed) counts as dead.
# Arguments: $1 - text printed by a remote probe
# Returns:   0 if the supervisor must be considered dead, 1 otherwise
#######################################
report_says_dead() {
  [[ "$1" != *"${ALIVE_MARKER}"* ]]
}

#######################################
# Runs the mail script to alert the administrators.
# Globals:   title, contxt (written)
# Arguments: $1 - mail title, $2 - mail body
#######################################
send_alert_mail() {
  # NOTE(review): title/contxt are neither exported nor passed as arguments,
  # so the mail script cannot see them as written here. Confirm how the mail
  # script expects to receive its subject and body.
  title=$1
  contxt=$2
  sh "${MAIL_SCRIPT}"
}

#######################################
# Removes one host from the watch list (exact match only).
# Globals:   slaves (rewritten)
# Arguments: $1 - host name to drop
#######################################
forget_slave() {
  local target=$1 host
  local -a kept=()
  for host in "${slaves[@]}"; do
    [[ "${host}" == "${target}" ]] || kept+=("${host}")
  done
  slaves=("${kept[@]}")
}

#######################################
# Watchdog loop; never returns.
# Globals: slaves (read, shrunk via forget_slave)
#######################################
main() {
  local nimbus_pid ui_pid node report
  local -a restarted

  while true; do
    echo "========== $(date) ==============="

    nimbus_pid=$(jvm_pids 'nimbus')
    if [[ -z "${nimbus_pid}" ]]; then
      echo 'storm nimbus is dead!'
      echo 'trying to start nimbus...'
      nohup storm nimbus >nimbus.log &
      echo 'finish starting!'
    else
      echo "storm nimbus id: ${nimbus_pid}"
    fi

    # The UI decision must depend on the UI pid, not on the Nimbus pid.
    ui_pid=$(jvm_pids 'backtype.storm.ui.core')
    if [[ -z "${ui_pid}" ]]; then
      echo 'storm ui process is dead!'
      echo 'trying to start storm ui'
      nohup storm ui >ui.log &
      echo 'finish starting storm ui!'
    else
      echo "storm ui id: ${ui_pid}"
    fi

    # First pass: restart dead supervisors and remember where that happened.
    restarted=()
    for node in "${slaves[@]}"; do
      report=$(ssh "${node}" "${REMOTE_PROBE_AND_RESTART}")
      if report_says_dead "${report}"; then
        restarted+=("${node}")
        echo "${node}'s supervisor is dead!"
      else
        echo "${node}'s${report}"
      fi
    done

    # Give the restarted supervisors time to come up.
    sleep "${RESTART_GRACE_SECONDS}"

    # Second pass: only the hosts that needed a restart are probed again.
    if (( ${#restarted[@]} > 0 )); then
      echo "check dead supervisor!"
      for node in "${restarted[@]}"; do
        report=$(ssh "${node}" "${REMOTE_PROBE_ONLY}")
        if report_says_dead "${report}"; then
          echo "${node}'s supervisor is still dead, send the email to admin!"
          send_alert_mail "Supervisor--is--dead" \
            "${node}'s--supervisor--is--dead,please--check--the--server!"
          # Stop watching this host so the alert is not repeated each pass.
          forget_slave "${node}"
        else
          echo "${node}'s${report}"
        fi
      done
    else
      echo "no dead supervisor!"
    fi

    # Nimbus is re-checked only if it had to be restarted in this pass.
    if [[ -z "${nimbus_pid}" ]]; then
      if [[ -z "$(jvm_pids 'nimbus')" ]]; then
        echo "nimbus is still dead, send the email to admin!"
        send_alert_mail "Nimbus--is--dead" \
          "Nimbus--is--dead,please--check--the--server!"
      else
        echo "Nimbus is restarted!"
      fi
    else
      echo
    fi

    echo "sleeping ${LOOP_PAUSE_SECONDS}s..."
    sleep "${LOOP_PAUSE_SECONDS}"
  done
}

main "$@"