通过nagios监控网络总出口流量异常时自动触发抓包行为20130515

最新推荐文章于 2024-05-16 02:51:42 发布

运维-Frank

最新推荐文章于 2024-05-16 02:51:42 发布

阅读量2.6k

点赞数

分类专栏：技术文档

本文链接：https://blog.csdn.net/xuyaqun/article/details/8941204

版权

技术文档专栏收录该内容

32 篇文章 1 订阅

订阅专栏

通过nagios监控总出口流量异常时触发抓包行为20130515

by lai

原理：nagios通过mrtg监控交换机上联端口流量，上联端口的in、out流量镜像到连接网管机的交换机端口；nagios检测到流量超出设定值时，通过event handler远程触发网管机上的抓包脚本，并记录log、发送报警邮件，之后可通过wireshark、tcpdump等工具分析抓到的数据包。

1、nagios安装mrtg，这里不详细介绍了
#yum install mrtg

设置mrtg每分钟取一次数据
# more /etc/cron.d/mrtg
*/1 * * * * root LANG=C LC_ALL=C /usr/bin/mrtg /etc/mrtg/mrtg.cfg --lock-file /var/lock/mrtg/mrtg_l --confcache-file /var/lib/mrtg/mrtg.ok

生成的mrtg图像在/var/www/mrtg/目录

2、定义命令
# more /usr/local/nagios/etc/objects/commands.cfg
###########################################################
# Define auto capture packets command
#
# Event-handler command: runs auto-capture-packet-eh.sh with six positional
# arguments. The first three come from Nagios macros (service state, state
# type, check attempt). In the active command_line the last three are
# hard-coded: 192.168.0.19 as the host argument and 0 0 as the two downtime
# arguments.
###########################################################
define command{
command_name auto-capture-packet-eh
# Fully macro-driven variant, kept for reference (disabled):
#command_line /usr/local/nagios/libexec/eventhandlers/auto-capture-packet-eh.sh $SERVICESTATE$ $SERVICESTATETYPE$ $SERVICEATTEMPT$ $HOSTADDRESS$ $HOSTDOWNTIME$ $SERVICEDOWNTIME$
# Active variant: fixed host address and fixed "0 0" downtime values.
command_line /usr/local/nagios/libexec/eventhandlers/auto-capture-packet-eh.sh $SERVICESTATE$ $SERVICESTATETYPE$ $SERVICEATTEMPT$ 192.168.0.19 0 0
}

3、监控机定义交换机上联端口，设置in/out流量的报警值（交换机的端口ID需要通过snmpwalk获取）
# more /usr/local/nagios/etc/objects/wby_nagios/gateway_switch.cfg
# Define the switch that we'll be monitoring
# (host object for the gateway switch at 192.168.0.254; the bandwidth
# service for this switch refers to it by host_name)
define host{
        use             generic-switch          ; Inherit default values from a template
        host_name       gateway_switch_h3c254           ; The name we're giving to this switch
        alias           gateway_switch_h3c254   ; A longer name associated with the switch
        address         192.168.0.254           ; IP address of the switch
        hostgroups      switches                ; Host groups this switch is associated with
        }

# Monitor bandwidth via MRTG logs
# Reads the MRTG log /var/www/mrtg/192.168.0.254_3462.log and attaches the
# auto-capture-packet-eh event handler to this service.
# NOTE(review): the two value pairs passed to check_local_mrtgtraf are the
# in/out traffic alarm values; presumably 2000000,2000000 is the warning pair
# and 3000000,3000000 the critical pair - confirm against the
# check_local_mrtgtraf command definition, which is not shown here.
define service{
        use                     generic-service ; Inherit values from a template
        host_name               gateway_switch_h3c254
        service_description     Port 24 uplink_port Bandwidth Usage
        check_command           check_local_mrtgtraf!/var/www/mrtg/192.168.0.254_3462.log!AVG!2000000,2000000!3000000,3000000!3
        max_check_attempts      4
        event_handler           auto-capture-packet-eh
        }

4、把交换机上联端口的所有流量镜像到连接网管机的网卡的端口

5、设置从nagios监控机到抓包网管机的ssh信任，以便在抓包条件触发时，监控机不需要输入密码即可执行网管机上的抓包脚本（监控机nagios-->网管机nagios）
网管机nagios设置ssh信任
# Create the nagios account on the capture host.
useradd nagios
# Create the per-user ssh directory.
mkdir /home/nagios/.ssh/
# Interactive step: edit authorized_keys by hand.
# NOTE(review): presumably the public key of the monitoring host's nagios
# user is pasted here - confirm.
vim /home/nagios/.ssh/authorized_keys
# Restrict the ssh directory to its owner.
chmod 700 /home/nagios/.ssh/
# Hand the whole home directory (recursively) to user and group nagios.
chown nagios.nagios /home/nagios/ -R

6、网管机nagios用户增加sudo 权限
# more /etc/sudoers
# Nagios commands
# SERVICE alias: the sendmail init script.
Cmnd_Alias SERVICE=/etc/init.d/sendmail
# NAGIOSUSERS alias: the nagios account.
User_Alias NAGIOSUSERS = nagios
# Members of NAGIOSUSERS may run SERVICE without a password.
NAGIOSUSERS ALL = NOPASSWD: SERVICE
# nagios may run the capture script as root without a password.
# NOTE(review): this rule covers the script path only; the capture script
# itself invokes "sudo /usr/sbin/tcpdump", which is not listed here - verify
# that the intended invocation actually matches a rule.
nagios ALL=(root) NOPASSWD:/usr/local/nagios/libexec/eventhandlers/auto-capture-packet.sh

如果报错：sudo: sorry, you must have a tty to run sudo
需要注释掉/etc/sudoers requiretty行
#Defaults requiretty

7、nagios监控机上的事件控制脚本，需要可执行权限
# more /usr/local/nagios/libexec/eventhandlers/auto-capture-packet-eh.sh
#!/bin/sh
#
# Nagios event handler: when the monitored service is CRITICAL, start a
# packet-capture script on a remote host over ssh and send an alert mail.
#
# Based on the httpd-restart event handler by
# Matthew Harman May 2012 - matthew@harmanweb.co.uk
#
# Arguments:
#   $1  service state       (OK | WARNING | UNKNOWN | CRITICAL)
#   $2  service state type  (SOFT | HARD)
#   $3  service check attempt number
#   $4  ssh destination of the host that runs the capture script
#   $5  host downtime value
#   $6  service downtime value
#
# Behaviour:
#   * OK, WARNING and UNKNOWN states are ignored.
#   * CRITICAL + SOFT acts only on the 3rd check attempt.
#   * CRITICAL + HARD acts on every invocation.
#   * Nothing is done unless "$5$6" is exactly "00", i.e. neither the host
#     nor the service is reported as being in downtime.
#   * The script always exits 0.

# Alert recipients, comma-separated, passed to mutt as a single argument.
# Single-quoted so the '*' characters (the addresses are masked) can never
# be subject to pathname expansion.
mail_to='362560**@qq.com,lai***@126.com'

# Directory on the remote host that holds the capture scripts.
# NOTE(review): the scripts invoked below are auto-capture-packet1.sh and
# auto-capture-packet2.sh - confirm both exist on the remote host under
# exactly these names.
remote_dir=/usr/local/nagios/libexec/eventhandlers

# Combined downtime flag; "00" means no downtime for host and service.
servicestatus="$5""$6"

case "$1" in
   CRITICAL)
      case "$2" in
         SOFT)
            # Skip the first two soft attempts (a short spike may be a
            # fluke); act once on the 3rd attempt, before the state can
            # turn HARD.
            if [ "$3" = "3" ] && [ "$servicestatus" = "00" ]; then
               printf '%s' "Restarting service (3rd soft critical state)..."
               # ssh -f detaches after authentication; the capture keeps
               # running on the remote side without blocking Nagios.
               ssh -f -T "$4" "$remote_dir/auto-capture-packet1.sh" &
               # NOTE(review): "Waring" looks like a typo for "Warning";
               # left unchanged in case mail filters match on the subject.
               printf '%s\n' "总出口流量异常，已连续3分钟超过警戒值，系统已自动抓包保存，请检查" \
                  | mutt -s "Waring:总出口流量异常$(date +%Y%m%d%H%M)" "$mail_to"
            fi
            ;;
         HARD)
            # The service reached a hard CRITICAL state: capture again and
            # send the higher-severity mail.
            if [ "$servicestatus" = "00" ]; then
               printf '%s' "Restarting Service..."
               ssh -f -T "$4" "$remote_dir/auto-capture-packet2.sh" &
               printf '%s\n' "总出口流量异常，已连续4分钟超过警戒值，系统已自动抓包保存，请检查" \
                  | mutt -s "Critical:总出口流量异常$(date +%Y%m%d%H%M)" "$mail_to"
            fi
            ;;
      esac
      ;;
   *)
      # OK / WARNING / UNKNOWN (or anything else): nothing to do.
      ;;
esac
exit 0

8、网管机上的执行抓包的脚本，需要可执行权限
# more /usr/local/nagios/libexec/eventhandlers/auto-capture-packet.sh
#!/bin/sh
#
# Capture up to 2,000,000 packets on eth1 into a timestamped file under
# /home/nagios, gzip the capture file, and record start/end times in
# /home/nagios/log.txt.
#
#sudo /etc/init.d/sendmail restart

log_file=/home/nagios/log.txt

# Compute the file name exactly once. The capture can run past a minute
# boundary, and re-evaluating the timestamp for gzip would then point at a
# file that does not exist.
cap_file=/home/nagios/$(date +%Y%m%d%H%M)_tcpdump_eth1.cap

# Start marker without a trailing newline, so the end marker written below
# lands on the same log line.
printf '%s' "tcpdump start at:$(date) " >>"$log_file"

# tcpdump stops by itself after -c packets have been captured.
/usr/bin/sudo /usr/sbin/tcpdump -i eth1 -c 2000000 -w "$cap_file"
/bin/gzip "$cap_file"

printf '%s\n' "end at:$(date)" >>"$log_file"

9、模拟测试：
/usr/local/nagios/libexec/eventhandlers/auto-capture-packet-eh.sh CRITICAL HARD 0 192.168.0.19 0 0

10、实际测试
ok