根据公司业务编写的节点检测监控脚本:通过计划任务(cron)每分钟执行一次;有节点宕机时,每 20 分钟发送一次钉钉告警,连续 5 次告警后自动将该节点从负载均衡中摘除。
注意:目前尚未实现节点恢复通知。
#!/bin/bash
# Description: yw_node_monitor - checks nodes with JMeter and sends alerts to DingTalk
# Author: xxx
# Version: 1.1

# Load the system profile so that variables defined there are available to
# this script.
source /etc/profile

# DingTalk robot webhook. The access token is a secret, so it can be supplied
# through DINGTALK_WEBHOOK_URL instead of being edited into this file; the
# built-in value is used when that variable is unset or empty.
URL="${DINGTALK_WEBHOOK_URL:-https://oapi.dingtalk.com/robot/send?access_token=xxx}"
# Alternate robot webhook (disabled):
#URL='https://oapi.dingtalk.com/robot/send?access_token=xxx'

# JMeter installation used for the checks; override with JMETER_BASEDIR.
BASEDIR="${JMETER_BASEDIR:-/App/apache-jmeter-jjh-1.0.0}"
#######################################
# Take a failed node out of the load balancer.
# Currently a placeholder: it only prints which node would be removed.
# Globals:   x (read) - IP of the failed node, used when no argument is given
# Arguments: $1 - optional node IP; defaults to $x
# Outputs:   one line on stdout: "auto remove node <ip>"
#######################################
remove_node() {
  local node=${1-${x-}}
  # Quote the value so it is printed verbatim (no word splitting / globbing).
  printf 'auto remove node %s\n' "$node"
}
#######################################
# Record one failed check for the node in $x and decide whether to notify
# and/or remove the node.
#
# Two state files are kept per node:
#   <state_dir>/<ip>      epoch seconds of the most recent failure
#   <state_dir>/<ip>.txt  failures counted since the last notification
#
# Rules:
#   * If the previous failure is more than one hour old (or unknown), this is
#     treated as a new alarm: notify immediately and reset the counter.
#     (Heuristic only - it infers "new" purely from the gap between failures.)
#   * Otherwise increment the counter; once it reaches 5 call remove_node,
#     and once it reaches 20 notify again and reset the counter.
#
# Globals:   x (read)         - IP of the failed node
#            STATE_DIR (read) - optional state directory, defaults to /tmp
# Arguments: none
# Outputs:   seconds elapsed since the previous failure, on stdout
# Returns:   1 if $x is empty; otherwise the status of the last command run
#######################################
jishu() {
  if [[ -z "${x-}" ]]; then
    printf 'jishu: node IP ($x) is empty\n' >&2
    return 1
  fi

  local -r state_dir=${STATE_DIR:-/tmp}
  local -r stamp_file="${state_dir}/${x}"
  local -r count_file="${state_dir}/${x}.txt"
  local -r new_alarm_gap=3600   # seconds without failures => new alarm
  local -r remove_after=5       # counted failures before removing the node
  local -r notify_every=20      # counted failures between repeat notifications
  local now last elapsed count

  now=$(date +%s)

  # Time of the previous failure (first field of the last line).
  last=
  if [[ -f "$stamp_file" ]]; then
    last=$(tail -n 1 -- "$stamp_file" | awk '{print $1}')
  fi
  # No usable timestamp (first failure, or empty/corrupt file): pretend the
  # previous failure was two hours ago so this one counts as a new alarm.
  if [[ ! "$last" =~ ^[0-9]+$ ]]; then
    last=$(( now - 7200 ))
  fi

  printf '%s\n' "$now" > "$stamp_file"
  elapsed=$(( now - 10#$last ))   # 10# forces base 10 even with leading zeros
  printf '%s\n' "$elapsed"

  if (( elapsed > new_alarm_gap )); then
    send_dingtalk
    printf '0\n' > "$count_file"
    return
  fi

  # Ongoing outage: bump the per-node failure counter.
  count=0
  if [[ -f "$count_file" ]]; then
    count=$(< "$count_file")
  fi
  # An empty or garbled counter file restarts the count instead of breaking
  # the arithmetic below.
  [[ "$count" =~ ^[0-9]+$ ]] || count=0
  count=$(( 10#$count + 1 ))
  printf '%s\n' "$count" > "$count_file"

  if (( count >= remove_after )); then
    remove_node
  fi
  if (( count >= notify_every )); then
    send_dingtalk
    printf '0\n' > "$count_file"
  fi
}
#######################################
# Escape a string so it can be embedded inside a JSON string literal.
# Arguments: $1 - raw text
# Outputs:   escaped text on stdout (no trailing newline)
#######################################
_json_escape() {
  local s=$1
  s=${s//\\/\\\\}      # backslash first, so later escapes are not doubled
  s=${s//\"/\\\"}
  s=${s//$'\n'/\\n}
  s=${s//$'\r'/\\r}
  s=${s//$'\t'/\\t}
  printf '%s' "$s"
}

#######################################
# Post a "node down" text message to the DingTalk robot webhook.
# Globals:   URL (read)         - webhook URL
#            falure_name (read) - display name of the failed node
#            x (read)           - IP of the failed node
# Arguments: none
# Outputs:   whatever curl prints (the webhook response) on stdout
# Returns:   1 if URL is empty, otherwise curl's exit status
#######################################
send_dingtalk() {
  if [[ -z "${URL-}" ]]; then
    printf 'send_dingtalk: URL is not set\n' >&2
    return 1
  fi

  local name ip payload
  name=$(_json_escape "${falure_name-}")
  ip=$(_json_escape "${x-}")

  # Build the body as ONE string. "\\n" below becomes the two characters \n,
  # i.e. a JSON newline escape inside the message text.
  payload=$(cat <<EOF
{
  "msgtype": "text",
  "text": {
    "content": "${name} 宕, IP ${ip},请及时处理\\n测试。。测试。。"
  },
  "at": {
    "atMobiles": [
      "xxx"
    ]
  }
}
EOF
)

  # -sS: no progress meter, but still report errors.
  # --max-time: the script runs from cron every minute, so a hung request
  # must not be allowed to pile up.
  curl -sS --max-time 10 "$URL" \
    -H 'Content-Type: application/json' \
    -d "$payload"
}
# ---------------------------------------------------------------------------
# Main flow: rotate the previous JMeter result, run the check plan, extract
# the IPs that appear near failed samples, and run jishu for each of them.
# ---------------------------------------------------------------------------
log_dir="$BASEDIR/log"
result_file="$log_dir/result.ftl"

# Stop if the log directory is unreachable; otherwise the rotation below
# would act on whatever directory the script was started from.
cd "$log_dir" || { printf 'cannot cd to %s\n' "$log_dir" >&2; exit 1; }

# Keep the previous run's result under a timestamped name.
if [[ -f result.ftl ]]; then
  mv -- result.ftl "result.ftl.$(date '+%Y%m%d-%H:%M:%S')"
fi

"$BASEDIR/bin/jmeter" -n -t "$BASEDIR/bin/check.jmx" -l "$result_file"

# Without a result file nothing can be concluded about node health; report it
# instead of silently treating the run as "no failures".
if [[ ! -f "$result_file" ]]; then
  printf 'jmeter produced no result file: %s\n' "$result_file" >&2
  exit 1
fi

# Collect every distinct IPv4 address found within 3 lines of a failed sample.
ip_octet='([0-9]|[1-9][0-9]|1[0-9]{2}|2[0-4][0-9]|25[0-5])'
mapfile -t failed_ips < <(
  grep -C 3 '<failure>true</failure>' "$result_file" \
    | grep -Eo "(${ip_octet}\.){3}${ip_octet}" \
    | sort -u
)

# Record the failed IPs of this run on a single space-separated line.
falure_ip="${failed_ips[*]}"
printf '%s\n' "$falure_ip" > "$BASEDIR/log/falure_ip"

# Process each failed IP. jishu and send_dingtalk read the globals x and
# falure_name, so those names must stay as they are.
for x in "${failed_ips[@]}"; do
  # -F: treat the IP literally (a "." must not match any character).
  # -w: whole-word match, so 10.0.0.1 does not also select 10.0.0.11.
  # The node name is the second comma-separated field; strip Windows CRs.
  falure_name=$(grep -Fw -- "$x" "$BASEDIR/bin/dlg.txt" \
    | awk -F ',' '{print $2}' \
    | tr -d '\r')
  jishu
done
新手,请多多指教!
转载于:https://blog.51cto.com/weifan/2084259