通过nagios监控网络总出口流量异常时自动触发抓包行为20130515

通过nagios监控总出口流量异常时触发抓包行为20130515

by lai

原理:nagios通过mrtg监控交换机上联端口流量,上联端口的in、out流量被镜像到连接网管机的交换机端口;当nagios检测到流量超出设定值时,通过event handler远程触发网管机上的抓包脚本,记录log并发送报警邮件,之后可用wireshark、tcpdump等工具分析抓到的数据包。

1、nagios安装mrtg,这里不详细介绍了
#yum install mrtg

设置mrtg每分钟取一次数据
# more /etc/cron.d/mrtg
*/1 * * * * root LANG=C LC_ALL=C /usr/bin/mrtg /etc/mrtg/mrtg.cfg --lock-file /var/lock/mrtg/mrtg_l --confcache-file /var/lib/mrtg/mrtg.ok

生成的mrtg图像在/var/www/mrtg/目录


2、定义命令
# more /usr/local/nagios/etc/objects/commands.cfg
###########################################################
# Define auto capture packets command
###########################################################
define command{
        command_name auto-capture-packet-eh
        # Event-handler command. The handler script receives six arguments:
        # service state, state type, check attempt, target host, host
        # downtime depth and service downtime depth.
        # The macro-driven variant below is kept for reference only.
        #command_line /usr/local/nagios/libexec/eventhandlers/auto-capture-packet-eh.sh $SERVICESTATE$ $SERVICESTATETYPE$ $SERVICEATTEMPT$ $HOSTADDRESS$ $HOSTDOWNTIME$ $SERVICEDOWNTIME$
        # Active variant: the 4th argument is hard-coded to 192.168.0.19 (the
        # host the handler connects to over ssh) rather than $HOSTADDRESS$,
        # which would be the address of the monitored switch. Both downtime
        # arguments are fixed at 0, so the handler's downtime check always
        # passes.
command_line /usr/local/nagios/libexec/eventhandlers/auto-capture-packet-eh.sh $SERVICESTATE$ $SERVICESTATETYPE$ $SERVICEATTEMPT$ 192.168.0.19 0 0
}


3、监控机定义交换机上联端口,设置in/out流量的报警值(交换机的端口ID需要通过snmpwalk获取)
# more /usr/local/nagios/etc/objects/wby_nagios/gateway_switch.cfg
# Host object for the gateway switch whose uplink port is monitored.
# The bandwidth service defined for this host refers to it by host_name.
define host{
        use             generic-switch          ; Inherit default values from a template
        host_name       gateway_switch_h3c254           ; The name we're giving to this switch
        alias           gateway_switch_h3c254   ; A longer name associated with the switch
        address         192.168.0.254           ; IP address of the switch
        hostgroups      switches                ; Host groups this switch is associated with
        }

# Monitor uplink bandwidth via the MRTG log and trigger a packet capture
# through the event handler when the in/out thresholds are exceeded.
#
# check_command arguments, assuming the stock check_local_mrtgtraf definition
# (TODO confirm against commands.cfg):
#   1. MRTG log file for the switch port (the port ID in the file name has to
#      be looked up with snmpwalk)
#   2. AVG - evaluate average rather than maximum values
#   3. warning thresholds  as in,out
#   4. critical thresholds as in,out
#   5. presumably the maximum age of the log, in minutes, before it is treated
#      as stale
#
# max_check_attempts is 4 so that the handler sees a 3rd SOFT CRITICAL attempt
# before the state turns HARD on the 4th.
define service{
        use                     generic-service ; Inherit values from a template
        host_name               gateway_switch_h3c254
        service_description     Port 24 uplink_port Bandwidth Usage
        check_command           check_local_mrtgtraf!/var/www/mrtg/192.168.0.254_3462.log!AVG!2000000,2000000!3000000,3000000!3
        max_check_attempts      4
        event_handler           auto-capture-packet-eh
        }

4、把交换机上联端口的所有流量镜像到连接网管机的网卡的端口


5、设置从nagios监控机到抓包网管机的ssh信任,以便在抓包条件触发时,监控机不需要输入密码即可执行网管机上的抓包脚本(监控机nagios-->网管机nagios)
网管机nagios设置ssh信任
# Create the account the monitoring host will log in as.
useradd nagios
# -p: do not fail if the directory already exists (safe to re-run).
mkdir -p /home/nagios/.ssh/
# Paste the public key of the monitoring host's nagios user into this file.
vim /home/nagios/.ssh/authorized_keys
# The .ssh directory and the key file must not be accessible to other users.
chmod 700 /home/nagios/.ssh/
chmod 600 /home/nagios/.ssh/authorized_keys
# Use the user:group form; the "user.group" separator is deprecated.
chown -R nagios:nagios /home/nagios/


6、网管机nagios用户增加sudo 权限
# more /etc/sudoers
# Nagios commands
# Allow the nagios user to run the sendmail init script without a password.
Cmnd_Alias SERVICE=/etc/init.d/sendmail
User_Alias NAGIOSUSERS = nagios
NAGIOSUSERS ALL = NOPASSWD: SERVICE
# Allow the nagios user to run the capture script as root without a password.
# NOTE(review): this rule covers the script path only. The capture script
# itself invokes "sudo /usr/sbin/tcpdump", and the event handler starts the
# script without sudo - confirm that tcpdump is permitted by another rule or
# that the script is actually launched through sudo.
nagios ALL=(root) NOPASSWD:/usr/local/nagios/libexec/eventhandlers/auto-capture-packet.sh


如果报错:sudo: sorry, you must have a tty to run sudo
需要注释掉/etc/sudoers requiretty行
#Defaults    requiretty


7、nagios监控机上的事件控制脚本,需要可执行权限
# more /usr/local/nagios/libexec/eventhandlers/auto-capture-packet-eh.sh
#!/bin/sh
#
# Nagios event handler: start a packet capture on a remote host and send an
# alert mail when the monitored service is CRITICAL.
#
# Derived from Matthew Harman's httpd-restart event handler
# (May 2012 - matthew@harmanweb.co.uk).
#
# Arguments:
#   $1  service state       (OK | WARNING | UNKNOWN | CRITICAL)
#   $2  service state type  (SOFT | HARD)
#   $3  current check attempt number
#   $4  host to connect to over ssh to start the capture
#   $5  host downtime depth    ("0" when not in downtime)
#   $6  service downtime depth ("0" when not in downtime)
#
# Behaviour:
#   - CRITICAL + SOFT on the 3rd attempt: run the first capture script
#     remotely and mail a warning.
#   - CRITICAL + HARD: run the second capture script remotely and mail a
#     critical alert.
#   - Anything else, or when $5$6 is not "00": do nothing.
#   - Always exits 0.

# Alert recipients, comma-separated. The addresses are masked with '*' in this
# listing; substitute the real ones. The value is always used quoted so the
# shell never treats the '*' characters as a glob pattern.
mail_to='362560**@qq.com,lai***@126.com'

# Directory holding the capture scripts on the remote host.
# NOTE(review): the script names used below end in 1.sh / 2.sh, while the
# capture script shown for the remote host and allowed in sudoers is
# auto-capture-packet.sh - confirm that both names exist on the remote host.
handler_dir=/usr/local/nagios/libexec/eventhandlers

# Start the remote capture and send the alert mail.
#   $1 - progress message printed without a trailing newline
#   $2 - host to connect to
#   $3 - path of the capture script on that host
#   $4 - mail subject prefix (a timestamp is appended)
#   $5 - mail body
capture_and_notify() {
   # printf instead of "echo -n": -n is not portable under /bin/sh.
   printf '%s' "$1"
   # Run the capture detached so the handler returns immediately.
   ssh -f -T "$2" "$3" &
   echo "$5" | mutt -s "$4$(date +%Y%m%d%H%M)" "$mail_to"
}

case "$1" in
   OK|WARNING|UNKNOWN)
      # Only CRITICAL triggers a capture; nothing to do for these states.
      ;;
   CRITICAL)
      case "$2" in
         SOFT)
            # Nagios is still retrying the check. Ignore the first attempts
            # (the spike may be a fluke) and act on the 3rd one only.
            case "$3" in
               3)
                  # Act only when neither host nor service is in downtime.
                  case "$5$6" in
                     00)
                        capture_and_notify \
                           "Restarting service (3rd soft critical state)..." \
                           "$4" \
                           "$handler_dir/auto-capture-packet1.sh" \
                           "Waring:总出口流量异常" \
                           "总出口流量异常,已连续3分钟超过警戒值,系统已自动抓包保存,请检查"
                        ;;
                  esac
                  ;;
            esac
            ;;
         HARD)
            # The state turned HARD, i.e. the traffic is still above the
            # critical threshold after all retries. Contacts have already been
            # notified by Nagios at this point (unless notifications are
            # disabled for the service).
            case "$5$6" in
               00)
                  capture_and_notify \
                     "Restarting Service..." \
                     "$4" \
                     "$handler_dir/auto-capture-packet2.sh" \
                     "Critical:总出口流量异常" \
                     "总出口流量异常,已连续4分钟超过警戒值,系统已自动抓包保存,请检查"
                  ;;
            esac
            ;;
      esac
      ;;
esac
exit 0

 


8、网管机上的执行抓包的脚本,需要可执行权限
# more  /usr/local/nagios/libexec/eventhandlers/auto-capture-packet.sh   
#!/bin/sh
#
# Capture up to 2,000,000 packets on eth1 into a timestamped file under
# /home/nagios, compress the capture with gzip, and record the start and end
# times in /home/nagios/log.txt.
#
#sudo /etc/init.d/sendmail restart

log_file=/home/nagios/log.txt

# Build the capture file name once. The capture can run past a minute
# boundary, so evaluating date again for gzip could produce a different name
# and leave the capture uncompressed.
cap_file="/home/nagios/$(date +%Y%m%d%H%M)_tcpdump_eth1.cap"

# printf instead of "echo -n": -n is not portable under /bin/sh.
printf 'tcpdump start at:%s  ' "$(date)" >>"$log_file"
/usr/bin/sudo /usr/sbin/tcpdump -i eth1 -c 2000000 -w "$cap_file"
/bin/gzip "$cap_file"
echo "end at:$(date)" >>"$log_file"

 

9、模拟测试:
/usr/local/nagios/libexec/eventhandlers/auto-capture-packet-eh.sh  CRITICAL HARD 0 192.168.0.19 0 0


10、实际测试
ok


 

  • 0
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值