通过nagios监控总出口流量异常时触发抓包行为20130515
by lai
原理:nagios通过mrtg监控交换机上联端口流量,上联端口的in、out流量镜像到连接网管机的交换机端口,nagios检测到流量超出设定值,通过event handler远程触发网管机的抓包脚本,并记录log,发送报警邮件,然后通过wireshark、tcpdump等分析抓到的数据包。
1、nagios安装mrtg,这里不详细介绍了
#yum install mrtg
设置mrtg每分钟取一次数据
# more /etc/cron.d/mrtg
# Every minute, as root and with the C locale, run mrtg against
# /etc/mrtg/mrtg.cfg using the given lock file and config cache file.
*/1 * * * * root LANG=C LC_ALL=C /usr/bin/mrtg /etc/mrtg/mrtg.cfg --lock-file /var/lock/mrtg/mrtg_l --confcache-file /var/lib/mrtg/mrtg.ok
生成的mrtg图像在/var/www/mrtg/目录
2、定义命令
# more /usr/local/nagios/etc/objects/commands.cfg
###########################################################
# Define auto capture packets command
###########################################################
# Event-handler command: runs auto-capture-packet-eh.sh with the state,
# state type and check attempt of the service that triggered it, followed by
# three more arguments (capture host, host downtime, service downtime).
define command{
command_name auto-capture-packet-eh
# Alternative kept for reference: takes arguments 4-6 from Nagios macros
# instead of fixed values.
#command_line /usr/local/nagios/libexec/eventhandlers/auto-capture-packet-eh.sh $SERVICESTATE$ $SERVICESTATETYPE$ $SERVICEATTEMPT$ $HOSTADDRESS$ $HOSTDOWNTIME$ $SERVICEDOWNTIME$
# Active form: arguments 4-6 are fixed to "192.168.0.19 0 0".
# NOTE(review): with "0 0" hard-coded, the handler's downtime check
# ("$5$6" must equal "00") always passes - confirm this is intended.
command_line /usr/local/nagios/libexec/eventhandlers/auto-capture-packet-eh.sh $SERVICESTATE$ $SERVICESTATETYPE$ $SERVICEATTEMPT$ 192.168.0.19 0 0
}
3、监控机定义交换机上联端口,设置in/out流量的报警值(交换机的端口ID需要通过snmpwalk获取)
# more /usr/local/nagios/etc/objects/wby_nagios/gateway_switch.cfg
# Define the switch that we'll be monitoring
# Host object for the gateway switch; the bandwidth service for
# gateway_switch_h3c254 refers to it through host_name.
define host{
use generic-switch ; Inherit default values from a template
host_name gateway_switch_h3c254 ; The name we're giving to this switch
alias gateway_switch_h3c254 ; A longer name associated with the switch
address 192.168.0.254 ; IP address of the switch
hostgroups switches ; Host groups this switch is associated with
}
# Monitor bandwidth via MRTG logs
# Service that checks the uplink port's MRTG log and runs the
# auto-capture-packet-eh event handler on state changes.
define service{
use generic-service ; Inherit values from a template
host_name gateway_switch_h3c254
service_description Port 24 uplink_port Bandwidth Usage
# Arguments: MRTG log file, AVG, two "in,out" value pairs, and 3.
# NOTE(review): presumably warning thresholds, critical thresholds and a
# maximum log age, as in the stock check_local_mrtgtraf command - its
# definition is not shown here, so confirm against commands.cfg.
check_command check_local_mrtgtraf!/var/www/mrtg/192.168.0.254_3462.log!AVG!2000000,2000000!3000000,3000000!3
# With 4 attempts, the event handler's "SOFT, attempt 3" branch fires one
# check before the state turns HARD.
max_check_attempts 4
# Name of the command object to run as this service's event handler.
event_handler auto-capture-packet-eh
}
4、把交换机上联端口的所有流量镜像到连接网管机的网卡的端口
5、设置从nagios监控机到抓包网管机的ssh信任,以便在抓包条件触发时,监控机不需要输入密码即可执行网管机上的抓包脚本(监控机nagios-->网管机nagios)
网管机nagios设置ssh信任
# Run as root on the capture host: create the nagios account and install the
# monitoring host's public key so it can log in without a password.
useradd nagios
# -p: do not fail if the directory already exists.
mkdir -p /home/nagios/.ssh/
# Paste the public key of the monitoring host's nagios user into this file.
vim /home/nagios/.ssh/authorized_keys
# Keep the key directory and key file private to the nagios account.
chmod 700 /home/nagios/.ssh/
chmod 600 /home/nagios/.ssh/authorized_keys
# Use the "user:group" separator; the "user.group" form is obsolete.
chown -R nagios:nagios /home/nagios/
6、网管机nagios用户增加sudo 权限
# more /etc/sudoers
# Nagios commands
# Command alias covering the sendmail init script.
Cmnd_Alias SERVICE=/etc/init.d/sendmail
# User alias containing the nagios account.
User_Alias NAGIOSUSERS = nagios
# Members of NAGIOSUSERS may run SERVICE through sudo without a password.
NAGIOSUSERS ALL = NOPASSWD: SERVICE
# nagios may run the capture script as root without a password.
# NOTE(review): the event handler in this document starts
# auto-capture-packet1.sh / auto-capture-packet2.sh without sudo, and the
# capture script itself calls "/usr/bin/sudo /usr/sbin/tcpdump"; neither is
# covered by this rule - confirm which commands really need NOPASSWD.
nagios ALL=(root) NOPASSWD:/usr/local/nagios/libexec/eventhandlers/auto-capture-packet.sh
如果报错:sudo: sorry, you must have a tty to run sudo
需要注释掉/etc/sudoers requiretty行
#Defaults requiretty
7、nagios监控机上的事件控制脚本,需要可执行权限
# more /usr/local/nagios/libexec/eventhandlers/auto-capture-packet-eh.sh
#!/bin/sh
#
# Nagios event handler: when the monitored service is CRITICAL, start a packet
# capture script on a remote host over ssh and mail an alert.
#
# Adapted from an httpd-restart event handler template credited to
# Matthew Harman, May 2012 - matthew@harmanweb.co.uk
#
# Usage:
#   auto-capture-packet-eh.sh STATE STATETYPE ATTEMPT CAPTURE_HOST \
#       HOSTDOWNTIME SERVICEDOWNTIME
#
#   $1  service state       (OK | WARNING | UNKNOWN | CRITICAL)
#   $2  service state type  (SOFT | HARD)
#   $3  current check attempt number
#   $4  host to ssh into to run the capture script
#   $5  host downtime value    (must be 0 for the handler to act)
#   $6  service downtime value (must be 0 for the handler to act)
#
# Behaviour:
#   CRITICAL, SOFT, attempt 3 -> run auto-capture-packet1.sh on $4 and send a
#                                "Warning" mail
#   CRITICAL, HARD            -> run auto-capture-packet2.sh on $4 and send a
#                                "Critical" mail
#   anything else             -> do nothing
#
# The script always finishes with status 0.

# Directory on the capture host that holds the capture scripts.
SCRIPT_DIR=/usr/local/nagios/libexec/eventhandlers

# Comma-separated alert recipients; quoted so the list reaches mutt as one
# argument and the "*" characters are never treated as a glob.
MAIL_TO='362560**@qq.com,lai***@126.com'

# Start a capture script on the capture host without waiting for it.
#   $1 - capture host
#   $2 - script file name under SCRIPT_DIR
# NOTE(review): only auto-capture-packet.sh is shown alongside this handler;
# confirm auto-capture-packet1.sh / auto-capture-packet2.sh exist on the host.
start_remote_capture() {
  # -f puts ssh in the background before the remote command runs and -T
  # disables pseudo-terminal allocation; the trailing & keeps the handler from
  # blocking while the connection is being set up.
  ssh -f -T "$1" "$SCRIPT_DIR/$2" &
}

# Mail an alert to MAIL_TO.
#   $1 - subject prefix
#   $2 - message body
send_alert() {
  printf '%s\n' "$2" \
    | mutt -s "$1:总出口流量异常$(date +%Y%m%d%H%M)" "$MAIL_TO"
}

# Decide from the handler arguments whether to capture, then do it.
# Takes the same six arguments as the script; always returns 0.
main() {
  # Only a CRITICAL state triggers a capture.
  [ "$1" = "CRITICAL" ] || return 0

  # Act only when both downtime values are 0.
  [ "$5$6" = "00" ] || return 0

  case "$2" in
    SOFT)
      # Ignore the first soft failures; act on the 3rd attempt only.
      if [ "$3" = "3" ]; then
        printf '%s' "Starting packet capture (3rd soft critical state)..."
        start_remote_capture "$4" auto-capture-packet1.sh
        send_alert "Warning" \
          "总出口流量异常,已连续3分钟超过警戒值,系统已自动抓包保存,请检查"
      fi
      ;;
    HARD)
      # The problem persisted into a hard state: capture again.
      printf '%s' "Starting packet capture (hard critical state)..."
      start_remote_capture "$4" auto-capture-packet2.sh
      send_alert "Critical" \
        "总出口流量异常,已连续4分钟超过警戒值,系统已自动抓包保存,请检查"
      ;;
  esac

  return 0
}

# main always returns 0, so the script's exit status is 0.
main "$@"
8、网管机上的执行抓包的脚本,需要可执行权限
# more /usr/local/nagios/libexec/eventhandlers/auto-capture-packet.sh
#!/bin/sh
#
# Capture 2000000 packets from eth1 into a timestamped file under
# /home/nagios, gzip the capture, and log start/end times to
# /home/nagios/log.txt.
#
#sudo /etc/init.d/sendmail restart

# Build the file name once: the capture can run past a minute boundary, and
# gzip must be given the same name tcpdump wrote to.
capture_file="/home/nagios/$(date +%Y%m%d%H%M)_tcpdump_eth1.cap"

# No newline here, so "end at:" lands on the same log line.
printf 'tcpdump start at:%s ' "$(date)" >>/home/nagios/log.txt
/usr/bin/sudo /usr/sbin/tcpdump -i eth1 -c 2000000 -w "$capture_file"
/bin/gzip "$capture_file"
printf 'end at:%s\n' "$(date)" >>/home/nagios/log.txt
9、模拟测试:
/usr/local/nagios/libexec/eventhandlers/auto-capture-packet-eh.sh CRITICAL HARD 0 192.168.0.19 0 0
10、实际测试
ok