Keepalived双主模型中vrrp_script中权重改变故障排查-CSDN博客

故障重现

keepalived配置如下

 
          # vi /etc/keepalived/keepalived.conf

          ! Configuration File for keepalived

          # Global settings: notification mail addresses, SMTP relay and router id.
          global_defs {
              notification_email {
                  root@localhost
              }
              notification_email_from admin@lnmmp.com
              smtp_connect_timeout 3
              smtp_server 127.0.0.1
              router_id LVS_DEVEL
          }

          # Manual maintenance switch: the check fails (exit 1) while the file
          # /etc/keepalived/down exists and succeeds (exit 0) otherwise.
          # It runs every second; with a positive weight, a successful check
          # adds 2 to the priority of every instance that tracks this script.
          vrrp_script chk_maintaince_down {
              script "[[ -f /etc/keepalived/down ]] && exit 1 || exit 0"
              interval 1
              weight 2
          }

          # haproxy liveness check: "killall -0" only probes for a running
          # haproxy process, so the check succeeds while haproxy is up.
          # It runs every second; a successful check adds 2 to the priority.
          vrrp_script chk_haproxy {
              script "killall -0 haproxy"
              interval 1
              weight 2
          }

          # First virtual router (VIP 172.16.25.10): this node starts as MASTER.
          vrrp_instance VI_1 {
              interface eth0
              state MASTER
              priority 100
              virtual_router_id 125
              garp_master_delay 1

              authentication {
                  auth_type PASS
                  auth_pass 1e3459f77aba4ded
              }

              track_interface {
                  eth0
              }

              virtual_ipaddress {
                  172.16.25.10/16 dev eth0 label eth0:0
              }

              # Both checks influence the effective priority of this instance.
              track_script {
                  chk_haproxy
                  chk_maintaince_down
              }

              # State-transition hooks: new state and VIP are passed to notify.sh.
              notify_master "/etc/keepalived/notify.sh master 172.16.25.10"
              notify_backup "/etc/keepalived/notify.sh backup 172.16.25.10"
              notify_fault "/etc/keepalived/notify.sh fault 172.16.25.10"
          }

          # Second virtual router (VIP 172.16.25.11): this node starts as BACKUP.
          vrrp_instance VI_2 {
              interface eth0
              state BACKUP
              priority 99
              virtual_router_id 126
              garp_master_delay 1

              authentication {
                  auth_type PASS
                  auth_pass 7615c4b7f518cede
              }

              track_interface {
                  eth0
              }

              virtual_ipaddress {
                  172.16.25.11/16 dev eth0 label eth0:1
              }

              # Same tracked checks as VI_1, so a check result changes the
              # priority of both instances on this node at the same time.
              track_script {
                  chk_haproxy
                  chk_maintaince_down
              }

              # State-transition hooks: new state and VIP are passed to notify.sh.
              notify_master "/etc/keepalived/notify.sh master 172.16.25.11"
              notify_backup "/etc/keepalived/notify.sh backup 172.16.25.11"
              notify_fault "/etc/keepalived/notify.sh fault 172.16.25.11"
          }
         
          # vi /etc/keepalived/notify.sh

          #!/bin/bash
          # Author: Jason.Yu <admin@lnmmp.com>
          # description: An example of notify script
          #
          # Usage: notify.sh {master|backup|fault} <vip>
          # Called from the notify_master / notify_backup / notify_fault hooks
          # of keepalived.conf with the new VRRP state and the instance's VIP.

          contact='root@localhost'

          # Mail the contact address about a VRRP state transition.
          #   $1 - new state (master, backup or fault)
          #   $2 - floating IP of the instance that changed state
          notify() {
              local mailsubject mailbody
              mailsubject="$(hostname) to be $1: $2 floating"
              mailbody="$(date '+%F %H:%M:%S'): vrrp transition, $(hostname) changed to be $1"
              printf '%s\n' "$mailbody" | mail -s "$mailsubject" "$contact"
          }

          case "$1" in
              master)
                  notify master "$2"
                  /etc/rc.d/init.d/haproxy start
                  exit 0
                  ;;
              backup)
                  notify backup "$2"
                  # NOTE: stopping haproxy here is the behaviour this article
                  # identifies as the cause of the collective VIP drift.
                  /etc/rc.d/init.d/haproxy stop
                  exit 0
                  ;;
              fault)
                  notify fault "$2"
                  /etc/rc.d/init.d/haproxy stop
                  exit 0
                  ;;
              *)
                  # Double quotes so that the script name is actually expanded.
                  echo "Usage: $(basename "$0") {master|backup|fault}" >&2
                  exit 1
                  ;;
          esac

引发的故障1：keepalived宕机恢复后VIP集体漂移故障

引发的故障2：haproxy服务停止后重启VIP集体漂移故障

原因

每次主备状态切换时，都会触发notify_backup；而notify.sh脚本的backup部分会执行/etc/rc.d/init.d/haproxy stop，haproxy停止后该节点上的chk_haproxy检测（killall -0 haproxy）随即失败，所有跟踪该脚本的instance都会失去weight加成。这样权重在2个节点上都会改变一次，结果单一节点上所有instance的优先级同时处于最大或者最小，故VIP集体漂移也就不奇怪了。

解决方法

修改notify.sh脚本：在处理backup（以及fault）状态的部分只发送通知邮件，不再刻意停止haproxy服务。

 
          # vi /etc/keepalived/notify.sh

          #!/bin/bash
          # Author: Jason.Yu <admin@lnmmp.com>
          # description: An example of notify script
          #
          # Usage: notify.sh {master|backup|fault} <vip>
          # Called from the notify_master / notify_backup / notify_fault hooks
          # of keepalived.conf with the new VRRP state and the instance's VIP.

          contact='root@localhost'

          # Mail the contact address about a VRRP state transition.
          #   $1 - new state (master, backup or fault)
          #   $2 - floating IP of the instance that changed state
          notify() {
              local mailsubject mailbody
              mailsubject="$(hostname) to be $1: $2 floating"
              mailbody="$(date '+%F %H:%M:%S'): vrrp transition, $(hostname) changed to be $1"
              printf '%s\n' "$mailbody" | mail -s "$mailsubject" "$contact"
          }

          case "$1" in
              master)
                  notify master "$2"
                  /etc/rc.d/init.d/haproxy start
                  exit 0
                  ;;
              backup)
                  # Only send the notification; haproxy is left running so
                  # that the chk_haproxy check keeps succeeding on this node.
                  notify backup "$2"
                  # /etc/rc.d/init.d/haproxy stop  # comment out or delete this line
                  exit 0
                  ;;
              fault)
                  notify fault "$2"
                  # /etc/rc.d/init.d/haproxy stop  # same as above
                  exit 0
                  ;;
              *)
                  # Double quotes so that the script name is actually expanded.
                  echo "Usage: $(basename "$0") {master|backup|fault}" >&2
                  exit 1
                  ;;
          esac

调整后的正常权重改变流程

vrrp_script中节点权重改变算法

vrrp_script 里的script返回值为0时认为检测成功，其它值都会当成检测失败；

weight 为正时，脚本检测成功时此weight会加到priority上，检测失败时不加；
1. 主失败:
  1. 主 priority < 从 priority + weight 时会切换。
2. 主成功：
  1. 主 priority + weight > 从 priority + weight 时，主依然为主
weight 为负时，脚本检测成功时此weight不影响priority，检测失败时priority减去abs(weight)；
1. 主失败:
  1. 主 priority - abs(weight) < 从 priority 时会切换主从
2. 主成功:
  1. 主 priority > 从priority 主依然为主