使用shell脚本实现对confluence服务健康状况的监控,并在服务挂掉时,及时重新启动服务
#!/bin/bash
#create by kjl
#2019-11-22
# ---- Settings ----
# NOTE(review): values containing '*' look like masked placeholders - fill in
# the real ones before running.
# Host and port probed by the health checks.
ip='172.20.**.**'
port=9090
# File this watchdog appends its own status lines to.
logfile='/home/work/confluence/data/wiki_check.log'
# Directory that holds Confluence's catalina.out.
confluence_log='/home/work/confluence/atlassian-confluence-5.6.5/logs'
# Seconds to sleep at the start of every monitoring round.
sleep_time=60
# ---- DingTalk ----
# Mobile number that is @-mentioned in the alert.
phone='138*********'
# Robot webhook the alert is POSTed to.
url='https://oapi.dingtalk.com/robot/send?access_token=*******************'
#######################################
# Sends a text alert through the DingTalk robot webhook.
# Globals:
#   phone (read) - mobile number to @-mention
#   url   (read) - webhook URL
# Arguments:
#   $1 - message text; may contain spaces and double quotes
# Returns:
#   curl's exit status
#######################################
message() {
  local content=$1
  local bs='\' dq='"'
  local payload

  # Escape backslashes first, then double quotes, so the text stays a valid
  # JSON string literal.
  content=${content//"$bs"/"$bs$bs"}
  content=${content//"$dq"/"$bs$dq"}

  # Build the body as ONE string. Splicing unquoted expansions between quoted
  # fragments lets the shell word-split and glob the text, which hands curl
  # several broken arguments as soon as the message contains a space.
  payload=$(printf '{"msgtype": "text","text": {"content": "%s"}, "at": {"atMobiles": ["%s"], "isAtAll": false}}' \
    "$content" "$phone")

  # The URL contains '?', a glob character, so it must be quoted as well.
  curl -H "Content-type: application/json" -X POST -d "$payload" "$url"
}
#######################################
# HTTP status-code check.
# Sends three HEAD requests to $ip:$port, sleeping 10 seconds after each one,
# and counts how many of them came back with a healthy status code.
# Globals:
#   ip, port    (read)
#   right_times (written) - number of healthy probes, 0 to 3
#######################################
returnnode_check() {
  # Seconds to pause after each probe.
  local per_time=10
  local attempt code
  # Number of probes that returned a healthy status code.
  right_times=0
  for (( attempt = 1; attempt <= 3; attempt++ )); do
    code=$(curl -I -m 10 -o /dev/null -s -w '%{http_code}' "${ip}:${port}")
    # curl reports 000 when it receives no HTTP response at all (connection
    # refused, timeout). A bare "-lt 400" comparison treats 000 - and empty
    # output - as healthy, so insist on a well-formed status in 100-399.
    # Original author's note: when run as the "work" user while the service
    # had an internal error, the captured value was 500000; that is rejected
    # here as well.
    if [[ $code =~ ^[1-3][0-9][0-9]$ ]]; then
      right_times=$(( right_times + 1 ))
    fi
    sleep "$per_time"
  done
}
#######################################
# Process and port check.
# Globals:
#   ip, port (read)
#   heal_num (written) - 1 when the port is unreachable or no Confluence
#                        process is found, 0 when both look fine
#######################################
pp_check() {
  local proc_count
  heal_num=0

  # Port probe: nc -z only tests whether the port accepts a connection.
  if ! nc -z "$ip" "$port"; then
    heal_num=1
    return 0
  fi

  # Count Confluence processes with the same filter the restart path uses to
  # find PIDs: drop grep itself and this watchdog (wiki_check). Testing for
  # "exactly one line" instead depends on grep's own entry happening to be in
  # the ps snapshot, and is thrown off when the watchdog's command line also
  # contains "confluence".
  proc_count=$(ps -ef | grep 'confluence' | grep -v grep | grep -v 'wiki_check' | wc -l)
  if (( proc_count == 0 )); then
    heal_num=1
  fi
}
# Appends one line to $logfile, prefixed with the time at which it is written,
# so entries logged after the long restart wait do not carry a stale time.
log_line() {
  printf '%s%s\n' "$(date '+%Y-%m-%d %H:%M:%S')" "$1" >> "$logfile"
}

# Main loop: runs forever, one monitoring round per iteration.
while true; do
  sleep "$sleep_time"

  # First check of this round.
  returnnode_check
  pp_check
  if [[ $right_times -ge 3 && $heal_num -ne 1 ]]; then
    log_line "**********服务健康状况正常*************"
    continue
  fi

  # Unhealthy: trigger a restart.
  # DingTalk alert for "service abnormal, restarting" (currently disabled):
  #message "ding~服务异常,正在尝试重启..."
  log_line "**********服务异常,尝试重启...*************"

  pid=$(ps aux | grep 'confluence' | grep -v grep | grep -v 'wiki_check' | awk '{print $2}')
  if [[ -n $pid ]]; then
    # Left unquoted on purpose: $pid may hold several PIDs, one per line, and
    # each must reach kill as its own argument.
    # shellcheck disable=SC2086
    kill -9 $pid
  fi

  # Archive catalina.out under today's date, then reset it so the startup
  # marker searched for below can only come from this restart.
  now_date=$(date '+%Y-%m-%d')
  cp -a "$confluence_log/catalina.out" "$confluence_log/catalina.$now_date.log" \
    && echo '' > "$confluence_log/catalina.out"
  /home/work/confluence/atlassian-confluence-5.6.5/bin/start-confluence.sh
  # Original author's note: the log may already show "INFO: Server startup in"
  # while the HTTP status code is still wrong, hence the re-check further down.

  # Wait for the startup marker: at most 60 polls, 10 seconds apart.
  polls=0
  while (( polls < 60 )); do
    if grep -q 'INFO: Server startup in' "$confluence_log/catalina.out"; then
      break
    fi
    polls=$(( polls + 1 ))
    sleep 10
  done

  # Re-verify after the restart.
  returnnode_check
  pp_check
  if [[ $right_times -lt 3 || $heal_num -eq 1 ]]; then
    # Still unhealthy after the restart: ask for manual handling via DingTalk.
    message "ding~wiki服务重启失败,请尽快排查问题"
    log_line "**********服务重启失败*************"
  else
    log_line "**********服务重启成功*************"
  fi
done