搭建nagios
准备:先将主机关机,克隆出一台新的虚拟机;启动克隆出的虚拟机并修改其IP,然后再启动主机
IP | hostname | 备注 |
---|---|---|
192.168.174.128 | localhost | 主机 |
192.168.174.130 | node2 | 备用机 |
文件关联
配置文件 | 作用 |
---|---|
nrpe.cfg | nrpe的主配置文件 |
objects/commands.cfg | nagios监控命令的定义文件 |
objects/localhost.cfg | 定义监控的主机,组,服务,此处引用的模板来源于objects/templates.cfg,监控命令来源于objects/commands.cfg |
objects/templates.cfg | 定义主机和服务的模板文件 |
nagios.cfg | nagios的主配置文件 |
先将上一篇博客中监控的端口服务注释掉(主备都要)
[root@localhost ~]# cat /usr/local/nagios/etc/objects/services.cfg
#define service{
# use generic-service
# host_name node1
# service_description nginx
# check_command check_nrpe!check_nginx
# }
#define service{
# use generic-service
# host_name node1
# service_description mysql
# check_command check_nrpe!check_mysql
# }
#define service{
# use generic-service
# host_name node1
# service_description httpd
# check_command check_nrpe!check_httpd
# }
编辑环境变量,为了方便后面操作(主备都需要执行)
[root@localhost etc]# vi /etc/profile
# 在文件末尾添加以下两行
export nagios=/usr/local/nagios/libexec
export PATH=$PATH:$nagios
[root@localhost etc]# source /etc/profile //生效配置
开放监控(主备都要做)
[root@localhost etc]# vi /usr/local/nagios/etc/nrpe.cfg
allowed_hosts=127.0.0.1,::1,192.168.174.130 //加上对方ip,修改完后重启nrpe
[root@localhost etc]# pkill nrpe
[root@localhost etc]# /usr/local/nagios/bin/nrpe -d -c /usr/local/nagios/etc/nrpe.cfg
[root@localhost etc]# check_nrpe -H 192.168.174.130 //测试能否连接
NRPE v3.2.1
添加命令(主机)
[root@localhost etc]# vi /usr/local/nagios/etc/nrpe.cfg
command[check_nagios]=/usr/local/nagios/libexec/check_nagios -e 5 -F /usr/local/nagios/var/status.dat -C /usr/local/nagios/bin/nagios //添加命令
[root@localhost etc]# pkill nrpe //重启nrpe
[root@localhost etc]# /usr/local/nagios/bin/nrpe -d -c /usr/local/nagios/etc/nrpe.cfg
[root@node2 etc]# check_nrpe -H 192.168.174.128 -c check_nagios //在客户端测试
NAGIOS OK: 6 processes, status log updated 5 seconds ago
备用机配置
复制相关文件并修改
[root@node2 etc]# cd /usr/local/src/nagioscore-nagios-4.4.3/contrib/eventhandlers/
[root@node2 eventhandlers]# cp enable_notifications /usr/local/nagios/libexec/eventhandlers/
[root@node2 eventhandlers]# cp disable_notifications /usr/local/nagios/libexec/eventhandlers/
[root@node2 eventhandlers]# cp redundancy-scenario1/handle-master-host-event /usr/local/nagios/libexec/eventhandlers/
[root@node2 eventhandlers]# cp redundancy-scenario1/handle-master-proc-event /usr/local/nagios/libexec/eventhandlers/
[root@node2 eventhandlers]# sed -i 's/active_service_checks/notifications/g' /usr/local/nagios/libexec/eventhandlers/handle-master-proc-event
配置添加命令 command
[root@node2 eventhandlers]# cd /usr/local/nagios/etc/objects/
[root@node2 objects]# vi /usr/local/nagios/etc/objects/commands.cfg
define command {
command_name handle-master-host-event
command_line $USER1$/eventhandlers/handle-master-host-event $HOSTSTATE$ $HOSTSTATETYPE$ $HOSTATTEMPT$
}
define command {
command_name handle-master-proc-event
command_line $USER1$/eventhandlers/handle-master-proc-event $SERVICESTATE$ $SERVICESTATETYPE$ $SERVICEATTEMPT$
}
define command {
command_name check_nrpe
command_line $USER1$/check_nrpe -H $HOSTADDRESS$ -c $ARG1$
}
修改自己配置的主机文件
[root@node2 objects]# vi /usr/local/nagios/etc/objects/hosts.cfg
define host {
use critical-host
host_name nagiosMaster
alias nagios master
address 192.168.174.128
event_handler handle-master-host-event
}
define service {
use critical-service
host_name nagiosMaster
service_description NAGIOS
check_command check_nrpe!check_nagios
event_handler handle-master-proc-event
}
修改模板文件
[root@node2 objects]# vi /usr/local/nagios/etc/objects/templates.cfg
define host{
name critical-host
use generic-host
check_period 24x7
check_interval 5
retry_interval 1
max_check_attempts 10
check_command check-host-alive
notification_period workhours
notification_interval 120
notification_options d,u,r
contact_groups admins
register 0
}
define service{
name critical-service
active_checks_enabled 1
passive_checks_enabled 1
parallelize_check 1
obsess_over_service 1
check_freshness 0
notifications_enabled 1
event_handler_enabled 1
flap_detection_enabled 1
failure_prediction_enabled 1
process_perf_data 1
retain_status_information 1
retain_nonstatus_information 1
is_volatile 0
check_period 24x7
max_check_attempts 1
normal_check_interval 1
retry_check_interval 1
contact_groups admins
notification_options w,u,c,r
notification_interval 60
notification_period 24x7
register 0
}
修改主配置文件
[root@node2 objects]# vi /usr/local/nagios/etc/nagios.cfg
# Values: 1 = enable notifications, 0 = disable notifications
enable_notifications=0
#改成不发告警
use_retained_program_state=0
#状态保持改成0,否则Nagios在启动和重启时将忽略notifications的设置,并采用最近的一个设置(比如你已经切换过一次发告警的状态)
配置邮件告警
[root@node2 objects]# yum install -y mailx //安装mail
[root@node2 objects]# vi /usr/local/nagios/etc/objects/commands.cfg //修改邮件命令
define command {
command_name notify-host-by-email
command_line /usr/bin/printf "%b" "***** Nagios *****\n\nNotification Type: $NOTIFICATIONTYPE$\nHost: $HOSTNAME$\nState: $HOSTSTATE$\nAddress: $HOSTADDRESS$\nInfo: $HOSTOUTPUT$\n\nDate/Time: $LONGDATETIME$\n" | /bin/mail -s "** $NOTIFICATIONTYPE$ Host Alert: $HOSTNAME$ is $HOSTSTATE$ **" $CONTACTEMAIL$
}
define command {
command_name notify-service-by-email
command_line /usr/bin/printf "%b" "***** Nagios *****\n\nNotification Type: $NOTIFICATIONTYPE$\n\nService: $SERVICEDESC$\nHost: $HOSTALIAS$\nAddress: $HOSTADDRESS$\nState: $SERVICESTATE$\n\nDate/Time: $LONGDATETIME$\n\nAdditional Info:\n\n$SERVICEOUTPUT$\n" | /bin/mail -s "** $NOTIFICATIONTYPE$ Service Alert: $HOSTALIAS$/$SERVICEDESC$ is $SERVICESTATE$ **" $CONTACTEMAIL$
}
最后重启备用机nagios
[root@node2 objects]# nagios -v /usr/local/nagios/etc/nagios.cfg //检查配置文件
[root@node2 objects]# systemctl restart nagios
测试
将主机nagios关闭查看从机日志
[root@node2 objects]# tail -n 20 -f /var/log/messages
Mar 25 13:50:41 node2 nagios: SERVICE ALERT: 192.168.174.128;NAGIOS;CRITICAL;HARD;1;NAGIOS CRITICAL: Cannot open status log for reading!
Mar 25 13:50:41 node2 nagios: SERVICE EVENT HANDLER: 192.168.174.128;NAGIOS;CRITICAL;HARD;1;handle-master-proc-event
Mar 25 13:50:41 node2 nagios: EXTERNAL COMMAND: ENABLE_NOTIFICATIONS;1585158641
...
Mar 25 16:33:00 node2 nagios: SERVICE NOTIFICATION: nagiosadmin;localhost;Total Processes;WARNING;notify-service-by-email;PROCS WARNING: 303 processes with STATE = RSZDT
查看邮件
[root@node2 objects]# cat /var/spool/mail/nagios
From nagios@node2.localdomain Wed Mar 25 16:33:00 2020
Return-Path: <nagios@node2.localdomain>
X-Original-To: nagios@localhost
Delivered-To: nagios@localhost.localdomain
Received: by node2.localdomain (Postfix, from userid 1000)
id 5496840E9409; Wed, 25 Mar 2020 16:33:00 -0400 (EDT)
Date: Wed, 25 Mar 2020 16:33:00 -0400
To: nagios@localhost.localdomain
Subject: ** PROBLEM Service Alert: localhost/Total Processes is WARNING **
User-Agent: Heirloom mailx 12.5 7/5/10
MIME-Version: 1.0
Content-Type: text/plain; charset=us-ascii
Content-Transfer-Encoding: 7bit
Message-Id: <20200325203300.5496840E9409@node2.localdomain>
From: nagios@node2.localdomain
***** Nagios *****
Notification Type: PROBLEM
Service: NAGIOS
Host: nagios master
Address: 192.168.174.128
State: CRITICAL
Date/Time: Wed Mar 25 16:33:00 EDT 2020
Additional Info:
NAGIOS CRITICAL: Cannot open status log for reading!
收到邮件,成功