IP | Hostname | 备注 |
---|---|---|
192.168.88.31 | smaster | nagios服务端主节点 |
192.168.88.32 | client | nagios客户端 |
192.168.88.41 | sbackup | nagios服务端备节点 |
环境说明:firewalld关闭,selinux关闭。
软件版本:
nagios版本:4.4.3
nagios-plugins版本:2.2.1
nrpe版本:3.2.1
Nagios相关博客:
1.设置邮件告警次数只通知1次
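在监控服务项添加normal_check_interval 0(注:Nagios官方文档中"只发送一次通知、不重发"对应的指令是notification_interval 0;normal_check_interval即check_interval,设为0可能导致该服务不再被定期主动检查,请结合实际需求确认)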
//例如
[root@sbackup ~]# vi /usr/local/nagios/etc/objects/192.168.88.32.cfg
define service {
    use                     local-service
    host_name               client
    service_description     Nginx port status
    check_command           check_nrpe!check_nginx
    normal_check_interval   0       ; per this guide: 0 makes the alert go out once, with no re-send. NOTE(review): Nagios documents notification_interval 0 as the once-only setting - confirm which is intended
    max_check_attempts      1       ; 1 = alert as soon as a problem is detected, without retrying
    contact_groups          admins
}
[root@sbackup ~]# systemctl restart nagios
//查看日志
[root@sbackup ~]# tail -f /var/log/messages
Mar 27 17:55:44 sbackup nagios: SERVICE ALERT: client;Nginx port status;CRITICAL;HARD;1;CRITICAL,nginx is not working!
Mar 27 17:55:44 sbackup nagios: job 77 (pid=3632): read() returned error 11
//只发送了一次邮件
2.为上次配置的远程客户端监控项添加邮件告警。
//只需要在监控服务项中添加联系组即可
define host {
    use                 linux-server
    host_name           client
    address             192.168.88.32
    contact_groups      admins      ; contact group that receives the alerts for this host
}
define service {
    use                     local-service
    host_name               client
    service_description     Nginx port status
    check_command           check_nrpe!check_nginx
    normal_check_interval   1
    contact_groups          admins      ; contact group that receives the alerts for this service
}
[root@sbackup ~]# systemctl restart nagios
邮件效果图:
3.nagios后期维护
总结如下:
1.如何更快速的增加主机?
2.如何调整合适的阈值?
3.如何保证nagios稳定的运行?
①.快速的增加主机
[root@smaster ~]# cd /usr/local/nagios/etc/objects/
[root@smaster objects]# mkdir servers/
[root@smaster objects]# vi ../nagios.cfg
cfg_dir=/usr/local/nagios/etc/objects/servers //添加
[root@smaster objects]# cd servers/
[root@smaster servers]# vi Create.sh //批量生成客户端脚本
#!/bin/bash
# Write the command-line synopsis to stderr, then terminate with status 1.
usage() {
  # %b expands the embedded \n escapes the same way `echo -e` does, and
  # printf adds no trailing newline of its own (the equivalent of `-n`).
  printf '%b' "USAGE: $0 [host list] or $0 [template] [host list]\nFor example: $0 host.template host.list(Field : [IP] [HOST NAME])\n" >&2
  exit 1
}
# ---------------------------------------------------------------------------
# Generate one "<HOST NAME>-<IP>.cfg" file per entry of a host list by copying
# a template and substituting its HOST_NAME / ADDRESS placeholders.
#
# Arguments:
#   2 args : <template> <host list>
#   1 arg  : <host list>; the template defaults to host.template
#   other  : print the usage text and exit 1
# Host list format: one "<IP> <HOST NAME>" pair per line; blank lines are
# skipped.
# Exit status: 1 on any argument, validation, copy or substitution error.
# ---------------------------------------------------------------------------
case "$#" in
  2)
    template=$1
    host_list=$2
    ;;
  1)
    template='host.template'
    host_list=$1
    ;;
  *)
    usage
    exit 1
    ;;
esac

if [[ ! -f "${template}" ]]; then
  echo "template : ${template} not exist!" 1>&2
  exit 1
fi
if [[ ! -f "${host_list}" ]]; then
  echo "host list : ${host_list} not exist!" 1>&2
  exit 1
fi

# Dotted-quad shape; the numeric range of each octet is checked separately.
readonly ip_re='^[0-9]{1,3}(\.[0-9]{1,3}){3}$'

# The list is redirected into the loop (no `cat |` subshell) so that `exit`
# ends the script itself. `|| [[ -n "${ip}" ]]` keeps a final line that has
# no trailing newline from being dropped.
while read -r ip hostname || [[ -n "${ip}" ]]; do
  # Blank lines carry no entry; skip them instead of aborting the run.
  [[ -z "${ip}" ]] && continue

  ip_ok=1
  if [[ "${ip}" =~ ${ip_re} ]]; then
    IFS=. read -r -a octets <<<"${ip}"
    for octet in "${octets[@]}"; do
      # 10# forces base 10 so an octet such as "08" is not parsed as octal.
      ((10#${octet} <= 255)) || ip_ok=0
    done
  else
    ip_ok=0
  fi
  if ((ip_ok == 0)); then
    echo "${ip} not ip!" 1>&2
    exit 1
  fi

  # Without a host name the output would be "-<IP>.cfg" with an empty
  # host_name directive, so reject the entry.
  if [[ -z "${hostname}" ]]; then
    echo "${ip} : host name missing!" 1>&2
    exit 1
  fi

  # '\', '/' and '&' are special on the replacement side of sed's s command;
  # escape them so the host name is always inserted literally.
  name_for_sed=$(printf '%s' "${hostname}" | sed 's|[\\/&]|\\&|g')

  host_cfg="${hostname}-${ip}.cfg"
  cp -- "${template}" "${host_cfg}" || exit 1
  sed -i "s/HOST_NAME/${name_for_sed}/g;s/ADDRESS/${ip}/g" -- "${host_cfg}" || exit 1
done <"${host_list}"
[root@smaster servers]# vi linux.template //批量生成的模板
# Per-host template consumed by Create.sh. The upper-case placeholder tokens
# below are substituted with each host's name and IP address.

# Host object.
define host {
    use         linux-server
    host_name   HOST_NAME
    alias       HOST_NAME
    address     ADDRESS
}

# PING service check.
define service {
    use                     local-service
    host_name               HOST_NAME
    service_description     PING
    check_command           check_ping!100.0,20%!500.0,60%
}

# Root partition service check.
define service {
    use                     local-service
    host_name               HOST_NAME
    service_description     Root Partition
    check_command           check_local_disk!20%!10%!/
}

# Total process count service check.
define service {
    use                     local-service
    host_name               HOST_NAME
    service_description     Total Processes
    check_command           check_local_procs!250!400!RSZDT
}
[root@smaster servers]# vi list.text //ip和主机列表
192.168.88.41 sbackup
[root@smaster servers]# chmod 755 *
[root@smaster servers]# ./Create.sh linux.template list.text
[root@smaster servers]# ll
总用量 16
-rwxr-xr-x 1 nagios nagios 1218 3月 27 18:38 Create.sh
-rw-r--r-- 1 root root 783 3月 27 18:56 linux.template
-rwxr-xr-x 1 nagios nagios 43 3月 27 18:36 list.text
-rw-r--r-- 1 root root 779 3月 27 18:56 sbackup-192.168.88.41.cfg
[root@smaster servers]# chown -R nagios.nagios ../servers/
[root@smaster servers]# /usr/local/nagios/bin/nagios -v /usr/local/nagios/etc/nagios.cfg
Checking objects...
Checked 14 services.
Checked 3 hosts.
Checked 1 host groups.
Checked 0 service groups.
Checked 1 contacts.
Checked 1 contact groups.
Checked 25 commands.
Checked 5 time periods.
Checked 0 host escalations.
Checked 0 service escalations.
Checking for circular paths...
Checked 3 hosts
Checked 0 service dependencies
Checked 0 host dependencies
Checked 5 timeperiods
Checking global event handlers...
Checking obsessive compulsive processor commands...
Checking misc settings...
Total Warnings: 0
Total Errors: 0
[root@smaster servers]# systemctl restart nagios
Nagios网页监控效果图:
②.调整合适的阈值
使用crontab写入一个定时任务,定期采集进程数,作为调整Total Processes告警阈值的依据
[root@smaster ~]# crontab -e
*/5 * * * * ps -ef | wc -l >> /root/total_procs.txt //每五分钟将进程数追加到该文件中
[root@smaster ~]# crontab -l
*/5 * * * * ps -ef | wc -l >> /root/total_procs.txt
[root@smaster ~]# cat total_procs.txt //可以看到每5分钟采集一次的进程数
138
137
137
132
③. 保证nagios稳定的运行
[root@smaster ~]# vi /usr/sbin/procs.sh //编写一个监控nagios进程的脚本
#!/bin/bash
# Watchdog for the Nagios daemon, meant to be run periodically from cron.
#
# Counts the `ps -ef` lines that mention "nagios". When the count is at or
# below the threshold it restarts the service once, counts again, and sends
# an alert e-mail if the count is still at or below the threshold.
#
# Only POSIX sh syntax is used because the script is invoked as
# `sh /usr/sbin/procs.sh`.

# A count of 1 or less is treated as "the daemon is not running".
threshold=1

# Alert recipient; quoted so the '*' characters are never glob-expanded.
mail_to='577*****@qq.com'

# Print the number of `ps -ef` lines containing "nagios", excluding the
# grep commands of this pipeline.
count_nagios_procs() {
  ps -ef | grep nagios | grep -v grep | wc -l
}

procs=$(count_nagios_procs)
if [ "${procs}" -le "${threshold}" ]; then
  # Count is at or below the threshold: try a single restart.
  systemctl restart nagios

  # Count again to see whether the restart brought the daemon back.
  procs1=$(count_nagios_procs)
  if [ "${procs1}" -le "${threshold}" ]; then
    # Still at or below the threshold after the restart: notify by mail.
    echo "Nagios_procs<=1,The nagios service may shut down" |
      mail -s "Nagios_Procs WARNING!" "${mail_to}"
  fi
fi
[root@sbackup ~]# sh -x /usr/sbin/procs.sh
++ ps -ef
++ grep nagios
++ grep -v grep
++ wc -l
+ procs=1
+ '[' 1 -le 1 ']'
+ systemctl restart nagios
++ ps -ef
++ grep -v grep
++ wc -l
++ grep nagios
+ procs1=6
+ '[' 6 -le 1 ']'
+ exit
//如果nagios配置文件出错的情况下
[root@sbackup ~]# systemctl stop nagios
[root@sbackup ~]# sh -x /usr/sbin/procs.sh
++ ps -ef
++ grep nagios
++ grep -v grep
++ wc -l
+ procs=1
+ '[' 1 -le 1 ']'
+ systemctl restart nagios
Job for nagios.service failed because the control process exited with error code. See "systemctl status nagios.service" and "journalctl -xe" for details.
++ grep nagios
++ grep -v grep
++ ps -ef
++ wc -l
+ procs1=1
+ '[' 1 -le 1 ']'
+ echo 'Nagios_procs<=1,The nagios service may shut down'
+ mail -s 'Nagios_Procs WARING!' 577*****@qq.com
+ exit
[root@sbackup ~]# crontab -e //每5分钟执行一次检测脚本。
*/5 * * * * sh /usr/sbin/procs.sh
邮件告警截图: