公司的服务器越来越多,原本只用一个脚本做检测,现在改用 Nagios 进行监控。


Ubuntu 客户端安装脚本

#!/bin/bash
#
# Builds and installs the Nagios client side on an Ubuntu host:
#   * nagios-plugins 2.0.1 and NRPE 2.15, compiled from source
#   * /usr/local/nagios/etc/nrpe.cfg, generated below
#   * a start line for the NRPE daemon in /etc/rc.local
# The script creates users, installs packages and writes under /etc and
# /usr/local, so it has to be run as root.
set -euo pipefail

# Nagios server that is allowed to query this client's NRPE daemon.
nagios_ser="192.168.1.3"

readonly nrpe_tarball="nrpe-2.15.tar.gz"
readonly plugins_tarball="nagios-plugins-2.0.1.tar.gz"
readonly nrpe_url="http://downloads.sourceforge.net/project/nagios/nrpe-2.x/nrpe-2.15/${nrpe_tarball}"
readonly plugins_url="http://nagios-plugins.org/download/${plugins_tarball}"
readonly nrpe_cfg="/usr/local/nagios/etc/nrpe.cfg"
readonly nrpe_start="/usr/local/nagios/bin/nrpe -c ${nrpe_cfg} -d"
readonly rc_local="/etc/rc.local"

die() {
  printf '%s\n' "$*" >&2
  exit 1
}

[[ "${EUID}" -eq 0 ]] || die "this installer must be run as root"

# Private scratch directory; removed on every exit path, including failures.
tmp_dir=$(mktemp -d /tmp/nagios.XXXXXX) || die "mktemp failed"
trap 'rm -rf -- "${tmp_dir}"' EXIT

# Only create the account when it is missing, so the script can be re-run.
getent group nagios >/dev/null || groupadd nagios
id -u nagios >/dev/null 2>&1 || useradd -g nagios -s /sbin/nologin nagios

cd "${tmp_dir}"
# -O pins the local file name regardless of download-mirror redirects.
wget -O "${nrpe_tarball}" "${nrpe_url}"
wget -O "${plugins_tarball}" "${plugins_url}"

#---- install
apt-get -y --force-yes install openssl ruby1.9.1 build-essential
apt-get -y --force-yes install libssl-dev lm-sensors

tar xf "${plugins_tarball}"
cd "${tmp_dir}/nagios-plugins-2.0.1"
./configure --with-nagios-user=nagios --with-nagios-group=nagios
make
make install

cd "${tmp_dir}"
tar xf "${nrpe_tarball}"
cd "${tmp_dir}/nrpe-2.15"
./configure --with-ssl-lib=/usr/lib/x86_64-linux-gnu
make all
make install-plugin
make install-daemon
make install-daemon-config
#mv ./check_* /usr/local/nagios/libexec
#chmod 755 -R /usr/local/nagios/libexec
chown -R nagios:nagios /usr/local/nagios/

# The here-document delimiter is unquoted on purpose: $nagios_ser must expand.
cat >"${nrpe_cfg}" <<EOF
log_facility=daemon
pid_file=/var/run/nrpe.pid
server_port=5666
nrpe_user=nagios
nrpe_group=nagios
allowed_hosts=127.0.0.1,$nagios_ser
                                             
dont_blame_nrpe=0
allow_bash_command_substitution=0
debug=0
command_timeout=60
connection_timeout=300
command[check_users]=/usr/local/nagios/libexec/check_users -w 5 -c 10
command[check_load]=/usr/local/nagios/libexec/check_load -w 15,10,5 -c 30,25,20
command[check_zombie_procs]=/usr/local/nagios/libexec/check_procs -w 5 -c 10 -s Z
command[check_total_procs]=/usr/local/nagios/libexec/check_procs -w 150 -c 200
command[check_procs]=/usr/local/nagios/libexec/check_procs -w 150 -c 200
command[check_alldisk]=/usr/local/nagios/libexec/check_alldisk -w 90 -c 95
command[check_http]=/usr/local/nagios/libexec/check_http -H 127.0.0.1 -w 5 -c 10 
command[check_ping]=/usr/local/nagios/libexec/check_ping -H 127.0.0.1 -w 3000.0,80% -c 5000.0,100% -p 5 
command[check_ssh]=/usr/local/nagios/libexec/check_ssh -4 127.0.0.1 
command[check_swap]=/usr/local/nagios/libexec/check_swap  -w 30% -c 10%
command[check_sensors]=/usr/local/nagios/libexec/check_sensors
command[check_mdadm]=/usr/local/nagios/libexec/check_mdadm
command[check_smart]=/usr/local/nagios/libexec/check_smart
command[check_drbd]=/usr/local/nagios/libexec/check_drbd
EOF

# Register the daemon for boot exactly once. A line appended after a final
# "exit 0" in rc.local would never be reached, so insert in front of it when
# such a line exists; otherwise append as before.
if [[ -f "${rc_local}" ]] && grep -qxF -- "${nrpe_start}" "${rc_local}"; then
  : # already registered by an earlier run
elif [[ -f "${rc_local}" ]] && grep -qx 'exit 0' "${rc_local}"; then
  sed -i "/^exit 0\$/i ${nrpe_start}" "${rc_local}"
else
  printf '%s\n' "${nrpe_start}" >>"${rc_local}"
fi

/usr/local/nagios/bin/nrpe -c "${nrpe_cfg}" -d



下面是自己折腾的几个 Ruby 检测脚本:

1:check_smart 磁盘状态检测

#!/usr/bin/env ruby
# Nagios plugin: reports the SMART overall-health result of every whole-disk
# /dev/sd* device, one "<device>  OK|Fail" line per disk.
#
# Exit codes: 0 ok; 1 warning; 2 critical; 3 unknown
#
# smartctl is invoked through sudo, so the account running the plugin needs a
# sudoers entry, e.g.:
#   echo "nagios ALL=NOPASSWD:/usr/sbin/smartctl" >>/etc/sudoers
#   CentOS: sed -i "s:Defaults  requiretty:Defaults:nagios !requiretty:" /etc/sudoers
# Nagios invocation: check_nrpe!check_smart

SMARTCTL = '/usr/sbin/smartctl'

# Whole disks only: names ending in a letter (sda, sdb, ...); partitions such
# as sda1 end in a digit and are skipped.
disks = Dir.glob('/dev/sd[a-z]*').grep(/[a-z]\z/).sort

# Nothing to inspect is not the same as "healthy"; say so instead of exiting
# 0 with empty output.
if disks.empty?
  puts 'UNKNOWN: no /dev/sd* disks found'
  exit 3
end

# Returns true when smartctl's "... result: PASSED" line is present for disk.
def smart_passed?(disk)
  output = begin
    # Argument-array form: no shell is involved, so no quoting issues.
    IO.popen(['sudo', SMARTCTL, '-H', disk], &:read)
  rescue SystemCallError
    '' # sudo could not be started; treated like a missing result (Fail)
  end
  output.lines.grep(/result/).any? do |line|
    # Same field the original awk -F: '{print $2}' looked at.
    line.split(':')[1].to_s.include?('PASSED')
  end
end

results = disks.map { |disk| [disk, smart_passed?(disk)] }

results.each do |disk, passed|
  puts "#{disk}  #{passed ? 'OK' : 'Fail'}"
end

exit(results.all? { |_, passed| passed } ? 0 : 2)

2:check_mdadm 软阵列检测

#!/usr/bin/env ruby
# Nagios plugin: checks Linux software RAID (md) arrays using /proc/mdstat.
#
# Exit codes: 0 ok; 1 warning; 2 critical; 3 unknown
#
# An array is considered broken when it is listed as inactive or when its
# member map (e.g. "[UU]", "[U_]", "[UUU]") contains "_". Arrays of any member
# count are supported, as are levels that print no member map at all.

begin
  mdstat = File.read('/proc/mdstat')
rescue SystemCallError => e
  puts "Soft Raid Unknown: #{e.message}"
  exit 3
end

# "md0 : active raid1 sdb1[1] sda1[0]" -> captures the state word ("active").
array_states = mdstat.scan(/^md\S*\s*:\s*(\S+)/).flatten

# "[2/2] [UU]" -> captures "UU"; "[2/2]" and "[raid1]" do not match the class.
member_maps = mdstat.scan(/\[([U_]+)\]/).flatten

inactive = array_states.any? { |state| state.start_with?('inactive') }
degraded = member_maps.any? { |map| map.include?('_') }

if inactive || degraded
  puts "Soft Raid Fail"
  exit 2
end

puts "Soft Raid OK"
exit 0

3:check_drbd DRBD检测

#!/usr/bin/ruby
# Nagios plugin: DRBD health check.
#
# Exit codes: 0 ok; 1 warning; 2 critical; 3 unknown
#
# The check passes when the number of "UpToDate" occurrences in /proc/drbd is
# exactly twice the number of block-device entries under /dev whose listing
# line contains "drbd".
uptodate_hits = `cat /proc/drbd`.scan("UpToDate").count
drbd_devices = `ls -la /dev/ | grep ^b | grep drbd | wc -l`.to_i

unless uptodate_hits == drbd_devices * 2
  puts "DRBD Critical"
  exit 2
end

puts "DRBD OK"
exit 0

4:check_alldisk 检测磁盘空间

#!/usr/bin/env ruby
# Nagios plugin: checks the usage of every local filesystem reported by df
# (tmpfs and devtmpfs excluded) and prints one line per filesystem:
#   "<device>  <use%>  <mount point>  OK|Warning|Critical"
#
# Usage: check_alldisk -w <warning percent> -c <critical percent>
#        e.g. check_alldisk -w 90 -c 95
#
# Exit codes: 0 ok; 1 warning; 2 critical; 3 unknown

DEFAULT_WARNING = 90
DEFAULT_CRITICAL = 95

# Looks a threshold up by flag name; falls back to the fixed argument
# position used historically (ARGV[1] / ARGV[3]), then to the default.
def threshold(flag, legacy_position, default)
  flag_at = ARGV.index(flag)
  raw = flag_at ? ARGV[flag_at + 1] : ARGV[legacy_position]
  raw.nil? ? default : raw.to_i
end

min_use = threshold('-w', 1, DEFAULT_WARNING)
max_use = threshold('-c', 3, DEFAULT_CRITICAL)

df_output = begin
  # -P keeps each filesystem on a single line; LC_ALL=C keeps the header and
  # number format independent of the system locale.
  IO.popen({ 'LC_ALL' => 'C' }, ['df', '-P', '-l', '-x', 'tmpfs', '-x', 'devtmpfs'], &:read)
rescue SystemCallError
  ''
end

# device, three numeric columns, use%, mount point (which may contain spaces).
DF_ROW = /\A(\S.*?)\s+-?\d+\s+-?\d+\s+-?\d+\s+(\d+)%\s+(.+)\z/

# The first line is the column header.
rows = df_output.lines.to_a.drop(1).map { |line| DF_ROW.match(line.chomp) }.compact

if rows.empty? #unknown
  puts "UNKNOWN"
  exit 3
end

worst = 0
report = rows.map do |row|
  device, current_use, mount_point = row[1], row[2].to_i, row[3]
  label, code =
    if current_use > max_use
      ['Critical', 2]
    elsif current_use > min_use
      ['Warning', 1]
    else
      ['OK', 0]
    end
  # Track severity numerically instead of searching the report text.
  worst = code if code > worst
  "#{device}  #{current_use}%  #{mount_point}  #{label}"
end

puts report
exit worst


服务器安装参考