1.系统:CentOS 5.2

2.安装前准备:
1).安装好apache,GD库(我的GD库安装在/usr/local/libgd)。
2).下载Nagios3.0.5,Nagios-plugins1.4.11,nrpe2.12,nsclient++
# http://www.nagios.org/download/

wget http://osdn.dl.sourceforge.net/sourceforge/nagios/nagios-3.0.5.tar.gz
wget http://osdn.dl.sourceforge.net/sourceforge/nagiosplug/nagios-plugins-1.4.11.tar.gz

# Nagios3.0.5,Nagios-plugins1.4.11,nrpe2.12安装在监控服务器上。
# 去http://www.nagios.org/download/addons/下载nrpe和nsclient++插件。
# nrpe2.12安装在Linux/Unix被监控端。
# nsclient++安装在Windows被监控端。
3).创建帐号及组
# 创建帐号

/usr/sbin/useradd -m nagios
passwd nagios

# 创建组

/usr/sbin/groupadd nagcmd
/usr/sbin/usermod -a -G nagcmd nagios
/usr/sbin/usermod -a -G nagcmd daemon

# daemon为运行apache的帐号。

3.安装nagios

tar xzf nagios-3.0.5.tar.gz
cd nagios-3.0.5
./configure --with-command-group=nagcmd --with-gd-lib=/usr/local/libgd/lib/ --with-gd-inc=/usr/local/libgd/include/
make all
# 使用make install来安装主程序,CGI和HTML文件
make install
# 使用make install-init在/etc/rc.d/init.d安装启动脚本
make install-init
# 使用make install-config来安装示例配置文件,安装的路径是/usr/local/nagios/etc.
make install-config
# 使用make install-commandmode来配置目录权限
make install-commandmode

nagios目录功能的简要说明:

  • bin Nagios执行程序所在目录,nagios文件即为主程序
  • etc Nagios配置文件位置
  • sbin Nagios Cgi文件所在目录,也就是执行外部命令所需文件所在的目录
  • Share Nagios网页文件所在的目录
  • var Nagios日志文件、pid等文件所在的目录
  • var/archives 日志归档目录
  • var/rw 用来存放外部命令文件

配置apache
将下面行加入apache配置文件的alias模块<IfModule alias_module>

----------------------------------------
ScriptAlias /nagios/cgi-bin "/usr/local/nagios/sbin"

<Directory "/usr/local/nagios/sbin">
# SSLRequireSSL
Options ExecCGI
AllowOverride None
Order allow,deny
Allow from all
# Order deny,allow
# Deny from all
# Allow from 127.0.0.1
AuthName "Nagios Access"
AuthType Basic
AuthUserFile /usr/local/nagios/etc/htpasswd.users
Require valid-user
</Directory>

Alias /nagios "/usr/local/nagios/share"

<Directory "/usr/local/nagios/share">
# SSLRequireSSL
Options None
AllowOverride None
Order allow,deny
Allow from all
# Order deny,allow
# Deny from all
# Allow from 127.0.0.1
AuthName "Nagios Access"
AuthType Basic
AuthUserFile /usr/local/nagios/etc/htpasswd.users
Require valid-user
</Directory>
----------------------------------------

# 创建apache目录验证文件

/usr/local/apache/bin/htpasswd -c /usr/local/nagios/etc/htpasswd.users nagiosadmin
New password: (输入密码)
Re-type new password: (再输入一次密码)
Adding password for user nagiosadmin

# 重启apache:

/usr/local/apache/bin/apachectl -k restart

# 配置nagios配置文件

vi /usr/local/nagios/etc/objects/contacts.cfg
# 将里面的email地址改为自己的email地址。

4.安装Nagios插件

tar xzf nagios-plugins-1.4.11.tar.gz
cd nagios-plugins-1.4.11
./configure --with-nagios-user=nagios --with-nagios-group=nagios
make
make install

5.启动Nagios

# 配置机器启动时自动启动Nagios
chkconfig --add nagios
chkconfig nagios on
# 检查Nagios配置文件
/usr/local/nagios/bin/nagios -v /usr/local/nagios/etc/nagios.cfg
# 启动Nagios
service nagios start

6.SELinux及防火墙设置
1).关闭SELinux

vi /etc/selinux/config
# 将下面行设为disabled
SELINUX=disabled

# 重启系统

2).如果开启防火墙,应该允许访问apache(一般为80端口)并允许nagios去抓取被监控机信息(一般nrpe为5666端口)。

7.访问Nagios服务器

http://localhost/nagios/
输入用户名及密码登录。

8.安装nrpe插件,用来监控Linux机器

tar xzvf nrpe-2.12.tar.gz
cd nrpe-2.12
./configure
make all

# 在Nagios服务器端只要安装nrpe监控插件就行

make install-plugin

在/usr/local/nagios/etc/objects/commands.cfg中定义check_nrpe命令

vi /usr/local/nagios/etc/objects/commands.cfg
############################################################
#
# 2008.11.18 add by Stone
# NRPE COMMAND
#
############################################################
# ‘check_nrpe ‘ command definition
define command{
command_name check_nrpe
command_line $USER1$/check_nrpe -H $HOSTADDRESS$ -c $ARG1$
}

9.在被监控服务器(Linux/unix)上安装Nagios-plugins和nrpe

useradd nagios
tar xzvf nagios-plugins-1.4.13.tar.gz
cd nagios-plugins-1.4.13

# Nagios-plugins默认安装到/usr/local/nagios

./configure
make
make install
chown nagios.nagios /usr/local/nagios/
chown -R nagios.nagios /usr/local/nagios/libexec/

tar xzvf nrpe-2.12.tar.gz
cd nrpe-2.12
./configure
make all

# 安装nrpe插件,本监控端可以不装
make install-plugin
# 安装nrpe守护进程
make install-daemon
# 安装nrpe配置文件
make install-daemon-config
# 修改nrpe配置文件,允许Nagios监控服务器(192.168.0.19)监控
vi nrpe.cfg
# 多台机器用逗号隔开
allowed_hosts=127.0.0.1,192.168.0.19
# 以独立守护进程启动nrpe,也可以使用xinetd启动nrpe,具体请查看nrpe官方文档。
/usr/local/nagios/bin/nrpe -c /usr/local/nagios/etc/nrpe.cfg -d
# 开机自动启动nrpe
vi /etc/rc.d/rc.local
# 加入下面行
/usr/local/nagios/bin/nrpe -c /usr/local/nagios/etc/nrpe.cfg -d
# 检查nrpe是否安装正常
[root@wiki etc]# /usr/local/nagios/libexec/check_nrpe -H localhost
NRPE v2.12
# 返回nrpe版本说明安装没问题。

# 查看启动端口

[root@wiki ~]# netstat -tunlp
Active Internet connections (only servers)
Proto Recv-Q Send-Q Local Address Foreign Address State PID/Program name
tcp 0 0 0.0.0.0:5666 0.0.0.0:* LISTEN 27387/nrpe

如果有防火墙应该开放5666端口:

iptables -I INPUT -i eth0 -p tcp -m tcp --dport 5666 -j ACCEPT

*********************************************
注意:我们需要在/usr/local/nagios/etc/nrpe.cfg中定义我们用到的监控本地资源的命令。
下面的命令是默认定义的:

command[check_users]=/usr/local/nagios/libexec/check_users -w 5 -c 10
command[check_load]=/usr/local/nagios/libexec/check_load -w 15,10,5 -c 30,25,20
command[check_hda1]=/usr/local/nagios/libexec/check_disk -w 20% -c 10% -p /dev/hda1
command[check_zombie_procs]=/usr/local/nagios/libexec/check_procs -w 5 -c 10 -s Z
command[check_total_procs]=/usr/local/nagios/libexec/check_procs -w 150 -c 200

下面的命令是自己定义的:
# 监控交换分区的使用情况,空闲空间低于20%时为警告状态,低于10%时为严重状态

command[check_swap]=/usr/local/nagios/libexec/check_swap -w 20% -c 10%

# 监控根分区磁盘使用情况

command[check_disk_root]=/usr/local/nagios/libexec/check_disk -w 20% -c 10% -p /

*********************************************

Linux机器监控及配置文件讲解

10.Nagios如何监控Linux机器

monitor

NRPE总共由两部分组成:
(1).check_nrpe插件,运行在监控主机上。
(2).NRPE daemon,运行在远程的linux主机上(通常就是被监控机)
按照上图,整个的监控过程如下:
当Nagios需要监控某个远程linux主机的服务或者资源情况时:
1).nagios会运行check_nrpe插件,我们要在nagios配置文件中告诉它要检查什么.
2).check_nrpe插件会通过SSL连接到远程的NRPE daemon.
3).NRPE daemon会运行相应的nagios插件来执行检查本地资源或服务.
4).NRPE daemon将检查的结果返回给check_nrpe插件,插件将其递交给nagios做处理.
注意:NRPE daemon需要nagios插件安装在远程被监控linux主机上,否则,daemon不能做任何的监控.

11.Nagios的配置文件
# 控制cgi访问的配置文件
cgi.cfg
# Nagios主配置文件
nagios.cfg
# resource.cfg定义了一些变量,以便被其他文件引用,如$USER1$
resource.cfg
# objects是一个目录,用于定义Nagios对象
objects
# servers是自己创建的一个目录,Nagios可以加载一个目录下面的所有配置文件(需要在nagios.cfg中配置)
servers

./objects:
# 命令定义配置文件,里面定义的命令可以被其他文件引用
commands.cfg
# 联系人和联系人组配置文件
contacts.cfg
# 监控本地机器的配置文件
localhost.cfg
# 监控打印机的一个事例配置文件(默认未启用)
printer.cfg
# 监控路由器的一个事例配置文件(默认未启用)
switch.cfg
# 模板配置文件,在此可以定义模板,在其他文件中引用
templates.cfg
# 定义监控时间段的配置文件
timeperiods.cfg
# 监控Windows的一个事例配置文件(默认未启用)
windows.cfg

./servers:
# 自己创建的主机群组配置文件
hostgroup.cfg
# 自己创建的监控远程Linux主机的配置文件
wiki-l-11.cfg

配置文件是怎样引用的?

config
用nagios主要是监控一台主机的各种信息,包括本机资源以及对外的服务等等.这些在nagios里面都是被定义为一个个的项目(nagios称之为服务,为了与主机提供的服务相区别,我这里用项目这个词),而实现每个监控项目,则需要通过commands.cfg文件中定义的命令。
为了不必重复定义一些项目,Nagios引入了一个模板配置文件(templates.cfg),将一些共性的属性定义成模板,以便于多次引用。
我们现在有一个监控项目是监控一台机器的web服务是否正常, 我们需要哪些元素呢?最重要的有下面三点:首先是监控哪台机器,然后是这个监控要用什么命令实现,最后就是出了问题的时候要通知哪个联系人。
我们首先应该在commands.cfg中定义监控远程服务和资源的命令,以及如何发送邮件的命令。大部分监控远程服务和资源的命令通过/usr/local/nagios/libexec下的脚本实现,如ping命令为check_ping。
/usr/local/nagios/libexec下的脚本命令的使用方法可以通过-h参数查看,如:

—————————————————————————————–
[root@tech ~]# /usr/local/nagios/libexec/check_ping -h
check_ping v1991 (nagios-plugins 1.4.13)
Copyright (c) 1999 Ethan Galstad <nagios@nagios.org>
Copyright (c) 2000-2007 Nagios Plugin Development Team
<nagiosplug-devel@lists.sourceforge.net>

Use ping to check connection statistics for a remote host.

Usage:check_ping -H <host_address> -w <wrta>,<wpl>% -c <crta>,<cpl>%
[-p packets] [-t timeout] [-4|-6]

Options:
-h, --help
Print detailed help screen
-V, --version
Print version information
-4, --use-ipv4
Use IPv4 connection
-6, --use-ipv6
Use IPv6 connection
-H, --hostname=HOST
host to ping
-w, --warning=THRESHOLD
warning threshold pair
-c, --critical=THRESHOLD
critical threshold pair
-p, --packets=INTEGER
number of ICMP ECHO packets to send (Default: 5)
-L, --link
show HTML in the plugin output (obsoleted by urlize)
-t, --timeout=INTEGER
Seconds before connection times out (default: 10)
—————————————————————————————–

然后我们在contacts.cfg文件中定义联系人和联系人组,在timeperiods.cfg中定义监控时间段。最后我们在服务器监控配置文件中引用前面定义的元素来监控服务器状态。

================================================
下面引用配置文件中部分配置做说明:

vi /usr/local/nagios/etc/resource.cfg
# 定义$USER1$变量,设置插件路径
$USER1$=/usr/local/nagios/libexec

vi /usr/local/nagios/etc/objects/commands.cfg
# 定义check-host-alive命令
define command{
command_name check-host-alive ; 命令名称
command_line $USER1$/check_ping -H $HOSTADDRESS$ -w 3000.0,80% -c 5000.0,100% -p 5
}
# 上面的$USER1$和$HOSTADDRESS$引用自已定义的配置文件。变量必须先定义才能被引用。
########################################################################
#
# 2008.11.18 add by Stone
# NRPE COMMAND
# 自己定义check_nrpe命令,此命令后接必需接一个参数,用于告诉远程服务器上的NRPE daemon需要监控的内容,如check_swap参数为监控远程机器的交换分区。
########################################################################
# ‘check_nrpe ‘ command definition
define command{
command_name check_nrpe
command_line $USER1$/check_nrpe -H $HOSTADDRESS$ -c $ARG1$
}
vi /usr/local/nagios/etc/objects/contacts.cfg
# 定义联系人
define contact{
contact_name nagiosadmin ; Short name of user
use generic-contact ; Inherit default values from generic-contact template (defined above)
alias Nagios Admin ; Full name of user

email test@gmaile.com ; <<***** CHANGE THIS TO YOUR EMAIL ADDRESS ******
}
# 上面的generic-contact在templates.cfg中定义。
# 定义联系人组
define contactgroup{
contactgroup_name admins
alias Nagios Administrators
members nagiosadmin ; 在此可以加入多个联系人,中间用逗号隔开
}

vi /usr/local/nagios/etc/objects/timeperiods.cfg
# 定义监控的时间段
define timeperiod{
timeperiod_name 24x7 ; 监控所有时间段(7*24小时)
alias 24 Hours A Day, 7 Days A Week
sunday 00:00-24:00
monday 00:00-24:00
tuesday 00:00-24:00
wednesday 00:00-24:00
thursday 00:00-24:00
friday 00:00-24:00
saturday 00:00-24:00
}

vi /usr/local/nagios/etc/objects/templates.cfg
# 定义generic-contact联系人模板,并非真正的联系人,真正的联系人在contacts.cfg中定义
define contact{
name generic-contact ; The name of this contact template
service_notification_period 24x7 ; service notifications can be sent anytime
host_notification_period 24x7 ; host notifications can be sent anytime
service_notification_options w,u,c,r,f,s ; send notifications for all service states, flapping events, and scheduled downtime events
host_notification_options d,u,r,f,s ; send notifications for all host states, flapping events, and scheduled downtime events
service_notification_commands notify-service-by-email ; send service notifications via email
host_notification_commands notify-host-by-email ; send host notifications via email
register 0 ; DONT REGISTER THIS DEFINITION – ITS NOT A REAL CONTACT, JUST A TEMPLATE!
}
——————————————————————————————————————
service_notification_period 24x7
服务出了状况通知的时间段,这个时间段就是上面在timeperiods.cfg中定义的.
host_notification_period 24x7
主机出了状况通知的时间段, 这个时间段就是上面在timeperiods.cfg中定义的
service_notification_options w,u,c,r
当服务出现w-报警(warning),u-未知(unknown),c-严重(critical),或者r-从异常情况恢复正常,在这四种情况下通知联系人.
host_notification_options d,u,r
当主机出现d-宕机(down),u-返回不可达(unreachable),r-从异常情况恢复正常,在这3种情况下通知联系人
service_notification_commands notify-service-by-email
服务出问题通知采用的命令notify-service-by-email,这个命令是在commands.cfg中定义的,作用是给联系人发邮件.
host_notification_commands notify-host-by-email
同上,主机出问题时采用的也是发邮件的方式通知联系人
——————————————————————————————————————

# 定义generic-host主机模板
define host{
name generic-host ; The name of this host template
notifications_enabled 1 ; Host notifications are enabled
event_handler_enabled 1 ; Host event handler is enabled
flap_detection_enabled 1 ; Flap detection is enabled
failure_prediction_enabled 1 ; Failure prediction is enabled
process_perf_data 1 ; Process performance data
retain_status_information 1 ; Retain status information across program restarts
retain_nonstatus_information 1 ; Retain non-status information across program restarts
notification_period 24x7 ; Send host notifications at any time
register 0 ; DONT REGISTER THIS DEFINITION – ITS NOT A REAL HOST, JUST A TEMPLATE!
}

# 定义Linux主机模板
define host{
name linux-server ; The name of this host template
use generic-host ; This template inherits other values from the generic-host template
check_period 24x7 ; By default, Linux hosts are checked round the clock
check_interval 5 ; Actively check the host every 5 minutes
retry_interval 1 ; Schedule host check retries at 1 minute intervals
max_check_attempts 10 ; Check each Linux host 10 times (max)
check_command check-host-alive ; Default command to check Linux hosts
notification_period workhours ; Linux admins hate to be woken up, so we only notify during the day
; Note that the notification_period variable is being overridden from
; the value that is inherited from the generic-host template!
notification_interval 120 ; Resend notifications every 2 hours
notification_options d,u,r ; Only send notifications for specific host states
contact_groups admins ; Notifications get sent to the admins by default
register 0 ; DONT REGISTER THIS DEFINITION – ITS NOT A REAL HOST, JUST A TEMPLATE!
}

# 在nagios.cfg配置文件中开启对/usr/local/nagios/etc/servers/中配置文件的引用。
cfg_dir=/usr/local/nagios/etc/servers

# 远程Linux主机监控文件,如果监控多台主机只需简单复制修改即可。
#我们应该牢记wiki-l-11.cfg用到的命令在commands.cfg中定义,在commands.cfg中定义的命令用到/usr/local/nagios/libexec下的插件(命令)。
vi /usr/local/nagios/etc/servers/wiki-l-11.cfg
# 定义主机
define host{
use linux-server ; Name of host template to use
; This host definition will inherit all variables that are defined
; in (or inherited by) the linux-server host template definition.
host_name wiki
alias Docs
address 192.168.0.11
}
# 定义Ping远程Linux主机
define service{
use generic-service ; Name of service template to use
host_name wiki
service_description PING
check_command check_ping!100.0,20%!500.0,60% ;check_ping命令在commands.cfg中定义,后跟两个参数,命令及参数间用!分割。
}
#检查远程Linux主机根分区使用情况,check_nrpe命令必须在/usr/local/nagios/etc/objects/commands.cfg中定义(默认未定义)
define service{
use generic-service ; Name of service template to use
host_name wiki
service_description Root Partition
check_command check_nrpe!check_disk_root
}
# 检查远程Linux主机的登录人数
define service{
use generic-service ; Name of service template to use
host_name wiki
service_description Current Users
check_command check_nrpe!check_users
}
# 检查远程Linux的主机的负载
define service{
use generic-service ; Name of service template to use
host_name wiki
service_description Current Load
check_command check_nrpe!check_load
}
# 检查远程Linux主机swap分区使用情况
define service{
use generic-service ; Name of service template to use
host_name wiki
service_description Swap Usage
check_command check_nrpe!check_swap
}
# 检查远程Linux主机的SSH服务
define service{
use generic-service ; Name of service template to use
host_name wiki
service_description SSH
check_command check_ssh
notifications_enabled 0
}
# 检查远程Linux主机的HTTP服务
define service{
use generic-service ; Name of service template to use
host_name wiki
service_description HTTP
check_command check_http
notifications_enabled 0
}

vi /usr/local/nagios/etc/servers/hostgroup.cfg
# 定义主机组(localhost.cfg中有类似的主机组设置,我已将其注释掉,否则可能会有冲突)
define hostgroup{
hostgroup_name linux-servers ; The name of the hostgroup
alias Linux Servers ; Long name of the group
members localhost,wiki ; Comma separated list of hosts that belong to this group
}

#define hostgroup{
# hostgroup_name windows-servers ; The name of the hostgroup
# alias Windows Servers ; Long name of the group
# members print ; Comma separated list of hosts that belong to this group
# }

=========================================

# 检查配置文件

/usr/local/nagios/bin/nagios -v /usr/local/nagios/etc/nagios.cfg

# 确定无误后重启Nagios:

service nagios restart

使用Nagios监控Windows服务器

12.Nagios使用NSClient++监控远程Windows主机

win-monitor

下载NSClient++-Win32-0.3.5.msi并安装。
到安装目录打开NSC.ini文件进行修改:
在[modules]模块,将除CheckWMI.dll和RemoteConfiguration.dll外的所有dll文件名前的注释(;)去掉。
在[Settings]模块可以设置一个连接密码password=PWD,为了简单,在此不设密码。设置allowed_hosts=127.0.0.1/32,192.168.0.19,可以连接的监控服务器的地址,如果写成192.168.0.0/24则表示该子网内的所有机器都可以访问;如果这个地方是空白则表示所有的主机都可以连接上来(注意在[NSClient]有allowed_hosts的同样设置,不要设置错了),最后不要忘记去掉前面的注释符(;)。
运行nsclient++
NSClient++ /install
NSClient++ SysTray install
NSClient++ /start
如果有防火墙,请开放相应端口。
创建监控配置文件,使用check_nt命令监控windows系统信息(此命令默认已定义)。

Windows监控示例配置文件:

[root@tech etc]# cat /usr/local/nagios/etc/servers/print-w-80.cfg
###################################################################
# WINDOWS.CFG – SAMPLE CONFIG FILE FOR MONITORING A WINDOWS MACHINE
#
# Last Modified: 06-13-2007
#
# NOTES: This config file assumes that you are using the sample configuration
# files that get installed with the Nagios quickstart guide.
#
####################################################################

####################################################################
####################################################################
#
# HOST DEFINITIONS
#
####################################################################
####################################################################

# Define a host for the Windows machine we’ll be monitoring
# Change the host_name, alias, and address to fit your situation

define host{
use windows-server ; Inherit default values from a template
host_name print80 ; The name we’re giving to this host
alias Print80 ; A longer name associated with the host
address 192.168.0.80 ; IP address of the host
}

####################################################################
####################################################################
#
# HOST GROUP DEFINITIONS
# 主机组在/usr/local/nagios/etc/servers/hostgroup.cfg中单独配置
####################################################################
####################################################################
# Define a hostgroup for Windows machines
# All hosts that use the windows-server template will automatically be a member of this group

#define hostgroup{
# hostgroup_name windows-servers ; The name of the hostgroup
# alias Windows Servers ; Long name of the group
# }
#####################################################################
#####################################################################
#
# SERVICE DEFINITIONS
#
#####################################################################
#####################################################################
# Create a service for monitoring the version of NSCLient++ that is installed
# Change the host_name to match the name of the host you defined above

define service{
use generic-service
host_name print80
service_description NSClient++ Version
check_command check_nt!CLIENTVERSION
}

# Create a service for monitoring the uptime of the server
# Change the host_name to match the name of the host you defined above

define service{
use generic-service
host_name print80
service_description Uptime
check_command check_nt!UPTIME
}

# Create a service for monitoring CPU load
# Change the host_name to match the name of the host you defined above

define service{
use generic-service
host_name print80
service_description CPU Load
check_command check_nt!CPULOAD!-l 5,80,90
}

# Create a service for monitoring
# Change the host_name to match the name of the host you defined above

define service{
use generic-service
host_name print80
service_description Memory Usage
check_command check_nt!MEMUSE!-w 80 -c 90
}

# Create a service for monitoring C:\ disk usage
# Change the host_name to match the name of the host you defined above

define service{
use generic-service
host_name print80
service_description C_Drive_Space
check_command check_nt!USEDDISKSPACE!-l c -w 80 -c 90
}

# Create a service for monitoring the W3SVC service
# Change the host_name to match the name of the host you defined above

define service{
use generic-service
host_name print80
service_description W3SVC
check_command check_nt!SERVICESTATE!-d SHOWALL -l W3SVC
}

# Create a service for monitoring the Explorer.exe process
# Change the host_name to match the name of the host you defined above

define service{
use generic-service
host_name print80
service_description Explorer
check_command check_nt!PROCSTATE!-d SHOWALL -l Explorer.exe
}

#主机组配置文件
[root@tech etc]# cat /usr/local/nagios/etc/servers/hostgroup.cfg
define hostgroup{
hostgroup_name linux-servers ; The name of the hostgroup
alias Linux Servers ; Long name of the group
members localhost,wiki ; Comma separated list of hosts that belong to this group
}

define hostgroup{
hostgroup_name windows-servers ; The name of the hostgroup
alias Windows Servers ; Long name of the group
members print80 ; Comma separated list of hosts that belong to this group
}

使用NDOUtils将Nagios监控信息存入数据库

1.NDOUtils安装需求:
Nagios正确安装运行,版本>=2.0安装目录:/usr/local/nagios
MySQL正确安装并运行,我用的是二进制安装包,安装目录:/usr/local/mysql

ndoutils

2.安装
下载安装包:http://www.nagios.org/download/
# 确认DBD-mysql包安装,如没有安装,请先安装。
# 如果DBI包没安装,安装DBD-mysql前需要先安装DBI包

tar xzvf DBI-1.605.tar.gz
cd DBI-1.605
perl Makefile.PL
make
make install

tar xzvf DBD-mysql-3.0008.tar.gz
cd DBD-mysql-3.0008
unset LANG
perl Makefile.PL --libs="-L/usr/local/mysql/lib/mysql -lmysqlclient -L/usr/lib -lz" --cflags="-I/usr/local/mysql/include" --testhost=127.0.0.1 --mysql_config=/usr/local/mysql/bin/mysql_config --testuser=root --testpassword=baihe.com
make
make install

# 做好MySQL库文件lib和include文件的连接(这点很重要)。

ln -s /usr/local/mysql/include/* /usr/include/
mkdir /usr/include/mysql
ln -s /usr/local/mysql/include/* /usr/include/mysql/
ln -s /usr/local/mysql/lib/* /usr/lib/
mkdir /usr/lib/mysql
ln -s /usr/local/mysql/lib/* /usr/lib/mysql

tar xzvf ndoutils-1.4b7.tar.gz
cd ndoutils-1.4b7
./configure --enable-mysql
# 执行完成后,注意向上看看打印出的信息,看看是否找到了MySQL的库文件和include文件。
make
cd src/
# 我的Nagios是3.0.5,所以拷贝的是ndomod-3x.o、ndo2db-3x,如果你的是2.X.x请拷贝ndomod-2x.o、ndo2db-2x
cp ndomod-3x.o ndo2db-3x log2ndo file2sock /usr/local/nagios/bin
cd ../db
# 创建数据库,使用root用户,密码是baihe.com
mysqladmin -u root -pbaihe.com create nagios
./installdb -u root -p baihe.com -d nagios
# 拷贝配置文件
cd ../config
cp ndo* /usr/local/nagios/etc/
# 修改/usr/local/nagios/etc/ndo2db.cfg文件的数据库信息。
vi /usr/local/nagios/etc/ndo2db.cfg
vi nagios.cfg
# 复制下面内容粘贴到/usr/local/nagios/etc/nagios.cfg配置文件的#broker_module=…下面。
# Uncomment the line below if you’re running Nagios 3.x
broker_module=/usr/local/nagios/bin/ndomod-3x.o config_file=/usr/local/nagios/etc/ndomod.cfg
# 修改/usr/local/nagios/etc/nagios.cfg配置文件下面参数的值为-1(一般默认如此)。
event_broker_options=-1

# 启动ndo2db

/usr/local/nagios/bin/ndo2db-3x -c /usr/local/nagios/etc/ndo2db.cfg

# 查看系统日志是否有错误信息:

tail -30 /var/log/messages
Nov 27 14:09:26 tech nagios: ndomod: NDOMOD 1.4b7 (10-31-2007) Copyright (c) 2005-2007 Ethan Galstad (nagios@nagios.org)
Nov 27 14:09:26 tech nagios: ndomod: Successfully connected to data sink. 0 queued items to flush.
Nov 27 14:09:26 tech nagios: Event broker module ‘/usr/local/nagios/bin/ndomod-3x.o’ initialized successfully.
Nov 27 14:09:26 tech nagios: Finished daemonizing… (New PID=18848)

# 重启Nagios

service nagios restart

# 查看Nagios日志,看是否正常启动。

tail -20 /usr/local/nagios/var/nagios.log
[1227766166] ndomod: NDOMOD 1.4b7 (10-31-2007) Copyright (c) 2005-2007 Ethan Galstad (nagios@nagios.org)
[1227766166] ndomod: Successfully connected to data sink. 0 queued items to flush.
[1227766166] Event broker module ‘/usr/local/nagios/bin/ndomod-3x.o’ initialized successfully.
[1227766166] Finished daemonizing… (New PID=18848)

3.如何用NDOUtils将多个Nagios实例的性能数据写入一个数据库?

nagios-to-db

配置方法类似于上面的单实例单数据库模式,只是在装第二个实例时省去安装DB一步,并需要修改/usr/local/nagios/etc/ndomod.cfg中的实例名。
实例名默认为instance_name=default,如果有多个实例必需修改成不同的实例名。
如:instance_name=nagios1
注意修改/usr/local/nagios/etc/ndo2db.cfg中数据库连接的相关配置。

安装使用Nagvis插件

1.安装需求:
1).Nagios已正常运行。
2).使用NDOUtils将Nagios监控信息存入数据库(见NDOUtils安装)。
3).PHP5.0以上版本,并需要安装php-gd、php-mysql、php5-mbstring、php5-session、php5-xml模块。
4).安装Graphviz >= 2.14,下载http://www.graphviz.org/graphviz-rhel.repo文件放到/etc/yum.repos.d/目录下,yum list available 'graphviz*';yum install 'graphviz*'或者安装webdot:yum install 'webdot'。

2.Nagvis安装
1).下载Nagvis,http://www.nagvis.org。
2).tar xvzf nagvis-1.3.x.tar.gz
3).mv nagvis /usr/local/nagios/share
4).配置nagvis

cd /usr/local/nagios/share/nagvis
cp etc/nagvis.ini.php-sample etc/nagvis.ini.php
vi etc/nagvis.ini.php

在[paths]模块修改URL path。
在[backend_ndomy_1]模块修改连接数据库的以下信息(如用户名、密码等)。
注意:修改过的参数需要去掉前面的分号注释符。
5).将/usr/local/nagios/share/nagvis设为运行apache的用户(daemon)所有。

chown daemon:nagios /usr/local/nagios/share/nagvis -R
chmod 664 /usr/local/nagios/share/nagvis/etc/nagvis.ini.php
chmod 775 /usr/local/nagios/share/nagvis/nagvis/images/maps
chmod 664 /usr/local/nagios/share/nagvis/nagvis/images/maps/*
chmod 775 /usr/local/nagios/share/nagvis/etc/maps
chmod 664 /usr/local/nagios/share/nagvis/etc/maps/*
chmod 775 /usr/local/nagios/share/nagvis/var
chmod 664 /usr/local/nagios/share/nagvis/var/*

3.访问图形配置工具:http://<nagiosserver>/<path-to-nagvis>/config.php
4.访问监控图: http://<nagiosserver>/<path-to-nagvis>/index.php?map=<mapname>

nagvis-img

nagvis-img1

baihe1

5.Nagvis如何使用多个Nagios实例的性能数据作图?
《如何用NDOUtils将多个Nagios实例的性能数据写入一个数据库》见NDOUtils的安装。
修改/usr/local/nagios/share/nagvis/etc/nagvis.ini.php,定义多个backend_ndomy_x

[backend_ndomy_2]
; type of backend - MUST be set
;backendtype="ndomy"
; hostname for NDO-db
;dbhost="localhost"
; portname for NDO-db
;dbport=3306
; database-name for NDO-db
;dbname="nagios"
; username for NDO-db
;dbuser="root"
; password for NDO-db
dbpass="111111"
; prefix for tables in NDO-db
;dbprefix="nagios_"
; instance-name for tables in NDO-db
dbinstancename="nagios1"
; maximum delay of the NDO Database in Seconds
;maxtimewithoutupdate=180
; path to the cgi-bin of this backend
;htmlcgi="/nagios/cgi-bin"

注意上面的数据库连接设置和dbinstancename设置。
在作图时如需引用nagios1实例的性能数据,backend_id栏需要选择ndomy_2。

nagvis

Nagios使用飞信短信报警

1.下载安装飞信机器人
下载地址:http://www.it-adv.net/
注意:RedHat和CentOS用户应使用LINUX X86/32(REDHAT ES4X32)版。
其中支持库和安装包内容如下:
支持库:libraryrh4x32.tar.gz
最新飞信机器人为fetion20080522004-linrh4.tar.gz

cd /usr/local
# 将飞信解压到/usr/local下
tar zxvf fetion20080522004-linrh4.tar.gz

mv install fetion
# 创建飞信支持库目录
mkdir /usr/local/fetion/lib
tar zxvf libraryrh4x32.tar.gz
cd libraryrh4x32
cp lib*so* /usr/local/fetion/lib
ln -s /usr/local/fetion/lib/libcrypto.so.0.9.7a /usr/local/fetion/lib/libcrypto.so.4
ln -s /usr/local/fetion/lib/libssl.so.0.9.7a /usr/local/fetion/lib/libssl.so.4
vi /etc/ld.so.conf
# 加入下面行
/usr/local/fetion/lib
# 保存退出,并执行ldconfig命令使配置生效。

2.测试安装是否成功

[root@web74 fetion]# ./fetion
************************ IMPORTANT STATEMENT ************************
** **
** PLEASE DON’T USE THIS SOFTWARE TO SEND JUNK SHORT MESSAGES. **
** OTHERWISE PLEASE BEAR YOUR OWN CONSEQUENCES. **
** **
** Version:[20080522004-linrh4] **
*********************************************************************
This program is the console version of China Fetion!
It’s free for personal user.
Fetion official website: http://www.fetion.com.cn/
This project website: http://www.it-adv.net/

AUTHOR:KelvinH MSN/EMAIL:shichangguo@msn.com

Usage:
fetion -h
-h: help
fetion -u mobile -p pwd [-b batchfile] [-EN] [-d]
fetion -u mobile -p pwd [-b batchfile] [-EN] [-d]
-u: Fetion user account(only supports mobile phone No.)
-p: Account password
-b: Batch file name
-d: Debug on and write logs to [mobile]-debug.log
-EN: English

3.编写发送脚本

vi /usr/local/fetion/sendsms.sh
#!/bin/bash
#
# sendsms.sh - send one Fetion SMS to every recipient in phonelist.txt.
#
# Usage: sendsms.sh "message text"
#
# Arguments:
#   $1 - the message body to send.
#
# Files (all under /usr/local/fetion):
#   phonelist.txt - recipients, separated by whitespace or newlines.
#                   Everything from a word starting with '#' to the end of
#                   the line is ignored, so a recipient can be disabled by
#                   commenting the number out.
#   msg.txt       - scratch batch file, rewritten for every recipient and
#                   handed to the fetion binary via -b.

fetion_dir=/usr/local/fetion
# Fetion account (mobile number) and login password used for sending.
user=158xxxxxxxx
password=xxxxxx

message=$1
phone_list="$fetion_dir/phonelist.txt"
batch_file="$fetion_dir/msg.txt"

# Run from the fetion directory, as the original script did.
cd "$fetion_dir" || {
  printf 'sendsms.sh: cannot cd to %s\n' "$fetion_dir" >&2
  exit 1
}

# The list is read on fd 3 so the fetion binary keeps the caller's stdin and
# cannot swallow the remaining recipients.
while read -r -u 3 -a fields || [[ ${#fields[@]} -gt 0 ]]; do
  for phone in "${fields[@]}"; do
    case "$phone" in
      '#'*)
        # Comment: skip the rest of this line.
        break
        ;;
      1[0-9][0-9][0-9][0-9][0-9][0-9][0-9][0-9][0-9][0-9]*)
        # Looks like a mobile number: '1' followed by at least ten digits.
        ;;
      *)
        continue
        ;;
    esac

    # One batch file per recipient: send the message, then log out.
    printf 'sms %s %s\nquit\n' "$phone" "$message" > "$batch_file"
    "$fetion_dir/fetion" -u "$user" -p "$password" -b "$batch_file"
  done
done 3< "$phone_list"

# phonelist.txt为接收短信号码列表,必须是发送飞信用户的好友
# 在此控制短信接收者而不是在nagios联系人中,好处是修改接收者后不需要重启nagios
# 可以使用#号注释接收者的号码以屏蔽接收者

vi /usr/local/fetion/phonelist.txt
# Phone NO. 1
138xxxxxxxx
# Phone NO. 2
158xxxxxxxx

# 修改飞信目录权限(否则运行Nagios的用户不能写入信息内容到/usr/local/fetion/msg.txt)

chown -R nagios.nagios /usr/local/fetion
chmod +x /usr/local/fetion/sendsms.sh
chmod +x /usr/local/fetion/fetion

4.配置Nagios

vi /usr/local/nagios/etc/objects/commands.cfg
#################################################################
#
# notify-host-by-sendmsg/notify-service-by-sendmsg //定义发送飞信报警的命令
#
#################################################################
define command {
command_name notify-host-by-sendmsg
command_line /usr/local/fetion/sendsms.sh "Host $HOSTSTATE$ alert for $HOSTNAME$($HOSTADDRESS$) on $TIME$."
}

define command {
command_name notify-service-by-sendmsg
command_line /usr/local/fetion/sendsms.sh "$TIME$:$SERVICEDESC$($HOSTADDRESS$) is $SERVICESTATE$."
}

定义联系人模板
define contact{
name sendmsg-contact ; The name of this contact template
service_notification_period 24x7 ; service notifications can be sent anytime
host_notification_period 24x7 ; host notifications can be sent anytime
service_notification_options w,u,c,s ; send notifications for all service states, flapping events, and scheduled downtime events
host_notification_options d,u,s ; send notifications for all host states, flapping events, and scheduled downtime events
service_notification_commands notify-service-by-sendmsg ; 使用定义的飞信报警命令
host_notification_commands notify-host-by-sendmsg ; 使用定义的飞信报警命令
register 0 ; DONT REGISTER THIS DEFINITION – ITS NOT A REAL CONTACT, JUST A TEMPLATE!
}

定义联系人及联系人组
vi /usr/local/nagios/etc/objects/contacts.cfg
define contact{
contact_name msgreceiver ; Short name of user
use sendmsg-contact ; 使用上面定义的联系人模板
alias MsgReceiver ; Full name of user

email alert@xxxxxx.com
}
define contactgroup{
contactgroup_name admins-sendmsg
alias BaiHe Monitor
members msgreceiver
}

5.测试故障时是否能够触发短信报警

Nagios事件处理

我们可以使用事件处理来在任何人收到通知之前由Nagios来做一些前期故障修复。
事件处理会在下面情况触发:
1).主机或服务处于一个软态故障状态时
2).主机或服务初始进入一个硬态故障时
3).主机或服务从软态或硬态的故障状态中初始恢复时
通过在主配置文件(nagios.cfg)中设置enable_event_handlers=1来打开全局事件处理,特定主机的和服务的事件处理可用主机和服务对象里的event_handler_enabled域来开关。如果全局的enable_event_handlers域是关闭的,那么特定主机的和服务的事件处理也不会运行。
事件处理命令可以用shell或是perl脚本,脚本中应该处理以下宏:
对服务的:$SERVICESTATE$、$SERVICESTATETYPE$和$SERVICEATTEMPT$;
对主机的:$HOSTSTATE$、$HOSTSTATETYPE$和$HOSTATTEMPT$。
脚本须检测这些作为命令行参数传入的值,并采取必要动作来处理这些值。
事件处理命令通常是与运行于本机上的Nagios程序的权限是相同的(下面例子中Nagios服务是以nagios用户运行的)。这可能会有问题,如果你想写成一个用于系统服务重启的命令,它需要有root权限才能执行一系列命令与任务。你或许会尝试使用sudo命令来实现它。
本例通过Nagios检测远程机器上的MySQL服务,当服务出现问题时通过Nagios的事件处理逻辑来重启远程机器上的MySQL服务。

1.配置在Nagios服务器(192.168.0.200)上无密码登录远程机器(MySQL服务运行在上面-192.168.0.210)

[root@nagios ~]# su - nagios
[nagios@nagios ~]$ ssh-keygen -t rsa
# 下面一直回车,不要设置密码
Generating public/private rsa key pair.
Enter file in which to save the key (/home/nagios/.ssh/id_rsa):
Enter passphrase (empty for no passphrase):
Enter same passphrase again:
Your identification has been saved in /home/nagios/.ssh/id_rsa.
Your public key has been saved in /home/nagios/.ssh/id_rsa.pub.
The key fingerprint is:
d2:82:61:12:53:f9:53:75:77:8d:32:c0:ca:c8:20:60 nagios@nagios.itech.com

# 将生成的密钥拷贝到要远程登录的机器上
[nagios@nagios ~]$ scp .ssh/id_rsa.pub 192.168.0.210:/home/nagios/
nagios@192.168.0.210’s password:
id_rsa.pub 100% 233 0.2KB/s 00:00

# 在要远程登录的机器上配置公钥
[nagios@nagios ~]$ ssh nagios@192.168.0.210
nagios@192.168.0.210’s password:
Last login: Sat Nov 29 22:30:55 2008 from 192.168.0.200
[nagios@nagios1 ~]$ cat id_rsa.pub >> .ssh/authorized_keys
[nagios@nagios1 ~]$ chmod 600 .ssh/authorized_keys
[nagios@nagios1 ~]$ exit
logout
Connection to 192.168.0.210 closed.

# 测试无密码登录

[nagios@nagios ~]$ ssh nagios@192.168.0.210
Last login: Sat Nov 29 22:35:27 2008 from 192.168.0.200

2.在远程机器上配置sudo

使nagios用户可以以root身份运行/usr/local/nagios/libexec/eventhandlers/restart-mysql脚本

[root@MySQL ~]# visudo
nagios ALL=(root) NOPASSWD:/usr/local/nagios/libexec/eventhandlers/restart-mysql

3.在远程机器上编写MySQL重启脚本

[root@MySQL ~]# vi /usr/local/nagios/libexec/eventhandlers/restart-mysql
#!/bin/sh
#
# Event handler script for restarting the MySQL server on the remote machine.
#
# Arguments:
#   $1 - service state: OK, WARNING, UNKNOWN or CRITICAL
#   $2 - state type: SOFT or HARD
#   $3 - current check attempt number
#
# Note: MySQL is restarted only when the service is CRITICAL and either
#       on its 2nd check attempt in a "soft" state, or has fallen into a
#       "hard" error state. Every other combination is a no-op.
#
# Always exits 0, whatever the restart command returns.

# What state is the MySQL service in?
case "$1" in
OK | WARNING | UNKNOWN)
  # Nothing to repair in these states.
  ;;
CRITICAL)
  # Is this a "soft" or a "hard" state?
  case "$2" in
  SOFT)
    # What check attempt are we on? We don't want to restart the MySQL
    # server on the first check, because it may just be a fluke!
    case "$3" in
    2)
      printf '%s' "Restarting MySQL service..."
      /sbin/service mysqld restart
      ;;
    esac
    ;;
  HARD)
    # Soft-state retries are exhausted: try a restart.
    printf '%s' "Restarting MySQL service..."
    /sbin/service mysqld restart
    ;;
  esac
  ;;
esac
exit 0

上面的脚本只会在MySQL处于软状态,且第二次检查出现故障时或者进入硬状态时重启MySQL。

4.配置Nagios服务器上的配置文件

[root@nagios ~]# cd /usr/local/nagios/etc/

# 检查全局事件处理是否打开

[root@nagios etc]# vi nagios.cfg
enable_event_handlers=1

# 在命令配置文件中定义重启MySQL的命令
[root@nagios etc]# vi objects/commands.cfg
# restart the service on a remote server
define command{
command_name restart-mysql
command_line /usr/bin/ssh nagios@$HOSTADDRESS$ "sudo /usr/local/nagios/libexec/eventhandlers/restart-mysql $SERVICESTATE$ $SERVICESTATETYPE$ $SERVICEATTEMPT$"
}

# 配置主机监控文件
[root@nagios etc]# vi servers/mysql.cfg
# 省略主机定义和其他服务定义
define service{
use generic-service ; Name of service template to use
host_name MySQL
service_description MySQL
check_command check_nrpe!check_mysql
notifications_enabled 1
event_handler_enabled 1
event_handler restart-mysql
}

这个脚本理论上在服务转入硬态故障之前可以重启MySQL服务以修复故障,这里包含了首次重启没有成功的情况。须注意的是事件处理将只是第一次进入硬态紧急状态时才会被触发,这将阻止Nagios在服务一直处于硬态故障状态时反复地重启MySQL服务。