Nimbus和Supervisor都是快速失败(fail-fast)、无状态的进程。Nimbus的单点问题一直没有很好的解决办法,所以我们可以对相关进程进行监控,在其挂掉时尝试重启。
在之前的项目里,比较常用的方式是通过monit对相关进程进行监控。这种方式需要对每台机器进行配置,可以借助Fabric(http://www.fabfile.org)进行统一配置,这里不详细介绍monit监控的方式。
通过Shell脚本可以在Nimbus节点上监控整个集群,前提是Nimbus节点能够通过SSH免密码登录各Supervisor节点。具体内容如下:
main.sh
#!/bin/bash
# main.sh - watchdog loop for the Storm master daemons.
#
# Every 20 seconds it:
#   1. looks for a JVM whose `jps -l` line contains "nimbus" and launches
#      `storm nimbus` if none is found;
#   2. looks for a JVM whose `jps -l` line contains "backtype.storm.ui.core"
#      and launches `storm ui` if none is found;
#   3. runs storm_manager.sh (expected next to this script) with the
#      argument "start".
#
# nimbus.log and ui.log are created relative to the current working
# directory. The loop never exits on its own.

dir=$(dirname "$0")

while true; do
  echo "========== $(date) ==============="

  # Nimbus: an empty pid list means no matching JVM is running.
  nid=$(jps -l | grep 'nimbus' | awk '{print $1}')
  if [ -z "$nid" ]; then
    echo 'storm nimbus is dead!'
    echo 'trying to start nimbus...'
    nohup storm nimbus >nimbus.log &
    echo 'finish starting!'
  else
    echo "storm nimbus id: $nid"
  fi

  # Storm UI: decide on the UI's own pid. Nimbus and the UI die
  # independently, so the Nimbus pid must not be used here.
  ui_pid=$(jps -l | grep 'backtype.storm.ui.core' | awk '{print $1}')
  if [ -z "$ui_pid" ]; then
    echo 'storm ui process is dead!'
    echo 'trying to start storm ui'
    nohup storm ui >ui.log &
    echo 'finish starting storm ui!'
  else
    echo "storm ui id: $ui_pid"
  fi

  # Delegate the per-host supervisor handling to the helper script.
  sh "$dir/storm_manager.sh" start

  echo "sleeping 20s..."
  sleep 20
done
storm_manager.sh
#!/bin/bash
# storm_manager.sh - helper functions that act on the Storm worker hosts.

# Whitespace-separated host names. The functions iterate it with an unquoted
# `for node in $slaves`, so it must stay a plain string (not an array).
slaves="cdn36 cdn37 cdn39 cdn21 cdn22 cdn23"
# NOTE(review): not referenced by any function visible in this part of the
# file; presumably Storm's directory on the hosts - confirm before relying on it.
storm_dir='/data/tmp/storm'
#######################################
# Check the supervisor daemon on every host in $slaves over SSH and start it
# when no supervisor JVM is found.
# Globals:   slaves (read) - whitespace-separated list of host names
# Outputs:   whatever the remote shell prints, then one blank line per host
#
# NOTE(review): the heredoc opener and the first half of the remote script
# were missing from this function, so it did not parse (dangling else/fi/END).
# The jps lookup, the "dead"/"trying" messages, the `storm supervisor` start
# command and the supervisor.log name are reconstructed by analogy with the
# Nimbus check in main.sh - confirm them against the deployed script.
# `local` is deliberately not used because main.sh runs this file with `sh`.
#######################################
check_supervisors() {
  for node in $slaves; do
    # Unquoted delimiter: $node is expanded locally before sending, while
    # every \$ is passed through so the remote shell expands it.
    ssh "$node" <<END
sid=\$(jps -l | grep 'supervisor' | awk '{print \$1}')
if [ -z "\$sid" ]; then
echo "$node's supervisor is dead!"
echo "trying to start $node's supervisor..."
nohup storm supervisor >supervisor.log 2>&1 &
echo "finishing starting $node's supervisor"
else
echo "supervisor process id: \$sid"
fi
END
    echo
  done
}
#######################################
# Synchronize the configuration file: copy the local storm.yaml to the same
# conf directory on every host in $slaves, logging in as root.
# Globals:   slaves (read) - whitespace-separated list of host names
# Outputs:   one "finishing sync" line per successful host on stdout;
#            one "failed to sync" line per failed host on stderr
# Returns:   0 if every copy succeeded, 1 if at least one scp failed
# `local` is deliberately not used because main.sh runs this file with `sh`;
# the helper variables carry a sync_ prefix instead.
#######################################
sync_config() {
  sync_conf_dir=/opt/package/apache-storm-0.9.2-incubating/conf
  sync_failed=0
  for node in $slaves; do
    # Only claim success when scp actually succeeded; keep going on failure
    # so one unreachable host does not block the others.
    if scp "$sync_conf_dir/storm.yaml" "root@$node:$sync_conf_dir/"; then
      echo "finishing sync $node config!"
    else
      echo "failed to sync $node config!" >&2
      sync_failed=1
    fi
  done
  return "$sync_failed"
}
mytest(){
for node in $slaves
do
ssh $node <