故障重现
keepalived配置如下
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
|
# vi /etc/keepalived/keepalived.conf
! Configuration File for keepalived
# Global settings: mail notification parameters and this router's identifier.
global_defs {
    # Recipient list for notification mails.
    notification_email {
        root@localhost
    }
    # Sender address and the SMTP server used to deliver the mails.
    notification_email_from admin@lnmmp.com
    smtp_connect_timeout 3
    smtp_server 127.0.0.1
    router_id LVS_DEVEL
}
# Maintenance switch: the check fails (exit 1) while /etc/keepalived/down
# exists and succeeds (exit 0) otherwise.
vrrp_script chk_maintaince_down {
    # The keyword and its quoted command must be on the same line.
    script "[[ -f /etc/keepalived/down ]] && exit 1 || exit 0"
    interval 1
    # Positive weight: added to the instance priority while the check succeeds
    # (see the weight rules at the end of this article).
    weight 2
}
# haproxy liveness check: "killall -0" sends no signal, it only reports
# whether a process named haproxy exists.
vrrp_script chk_haproxy {
    # The keyword and its quoted command must be on the same line.
    script "killall -0 haproxy"
    interval 1
    # Positive weight: added to the instance priority while the check succeeds
    # (see the weight rules at the end of this article).
    weight 2
}
# Instance VI_1: starts as MASTER with base priority 100 and carries
# VIP 172.16.25.10 on eth0.
vrrp_instance VI_1 {
    interface eth0
    state MASTER
    priority 100
    virtual_router_id 125
    garp_master_delay 1

    authentication {
        auth_type PASS
        auth_pass 1e3459f77aba4ded
    }

    track_interface {
        eth0
    }

    # Address, prefix length and device options form one entry on one line.
    virtual_ipaddress {
        172.16.25.10/16 dev eth0 label eth0:0
    }

    # Both checks adjust this instance's priority through their weight.
    track_script {
        chk_haproxy
        chk_maintaince_down
    }

    # State-change hooks: notify.sh receives the new state and the VIP.
    # Each keyword and its quoted command must be on the same line.
    notify_master "/etc/keepalived/notify.sh master 172.16.25.10"
    notify_backup "/etc/keepalived/notify.sh backup 172.16.25.10"
    notify_fault "/etc/keepalived/notify.sh fault 172.16.25.10"
}
# Instance VI_2: starts as BACKUP with base priority 99 and carries
# VIP 172.16.25.11 on eth0.
vrrp_instance VI_2 {
    interface eth0
    state BACKUP
    priority 99
    virtual_router_id 126
    garp_master_delay 1

    authentication {
        auth_type PASS
        auth_pass 7615c4b7f518cede
    }

    track_interface {
        eth0
    }

    # Address, prefix length and device options form one entry on one line.
    virtual_ipaddress {
        172.16.25.11/16 dev eth0 label eth0:1
    }

    # Both checks adjust this instance's priority through their weight.
    track_script {
        chk_haproxy
        chk_maintaince_down
    }

    # State-change hooks: notify.sh receives the new state and the VIP.
    # Each keyword and its quoted command must be on the same line.
    notify_master "/etc/keepalived/notify.sh master 172.16.25.11"
    notify_backup "/etc/keepalived/notify.sh backup 172.16.25.11"
    notify_fault "/etc/keepalived/notify.sh fault 172.16.25.11"
}
# vi /etc/keepalived/notify.sh
#!/bin/bash
# Author: Jason.Yu <admin@lnmmp.com>
# description: An example of notify script
#
# Recipient of the state-change notification mails. The value must follow
# "=" on the same line; otherwise contact is empty and the address is run
# as a command.
contact='root@localhost'
#######################################
# Mail a VRRP state-change notice.
# Globals:   contact (read) - recipient address
# Arguments: $1 - new state (master|backup|fault)
#            $2 - floating VIP named in the subject
# Outputs:   pipes the message body to mail(1)
# Returns:   exit status of the mail pipeline
#######################################
notify() {
  local mailsubject mailbody
  mailsubject="$(hostname) to be $1: $2 floating"
  mailbody="$(date '+%F %H:%M:%S'): vrrp transition, $(hostname) changed to be $1"
  # Quote every expansion so the subject and recipient stay single arguments.
  printf '%s\n' "$mailbody" | mail -s "$mailsubject" "$contact"
}
# Dispatch on the VRRP state passed as $1; $2 is the VIP of the instance.
case "$1" in
  master)
    notify master "$2"
    /etc/rc.d/init.d/haproxy start
    exit 0
    ;;
  backup)
    notify backup "$2"
    # Deliberately kept: this stop is what the article identifies as the
    # cause of the collective VIP drift being reproduced here.
    /etc/rc.d/init.d/haproxy stop
    exit 0
    ;;
  fault)
    notify fault "$2"
    /etc/rc.d/init.d/haproxy stop
    exit 0
    ;;
  *)
    # The script name must be expanded outside single quotes, otherwise the
    # command substitution is printed literally.
    printf 'Usage: %s {master|backup|fault}\n' "${0##*/}" >&2
    exit 1
    ;;
esac
|
引发的故障1:keepalived宕机恢复后VIP集体漂移故障
引发的故障2:haproxy服务停止后重启VIP集体漂移故障
原因
每次主备状态切换时,都会触发 notify_backup;而 notify.sh 脚本的 backup 分支会执行 /etc/rc.d/init.d/haproxy stop,haproxy 停止后 chk_haproxy 检测失败,导致两个节点上的优先级(priority)又各自改变一次。结果是某一个节点上所有 instance 的优先级同时处于最高或最低,因此所有 VIP 会一起漂移。
解决方法
修改 notify.sh 脚本:在 backup(以及 fault)分支中只发送通知邮件,不再主动停止 haproxy 服务。
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
|
# vi /etc/keepalived/notify.sh
#!/bin/bash
# Author: Jason.Yu <admin@lnmmp.com>
# description: An example of notify script
#
# Recipient of the state-change notification mails. The value must follow
# "=" on the same line; otherwise contact is empty and the address is run
# as a command.
contact='root@localhost'
#######################################
# Mail a VRRP state-change notice.
# Globals:   contact (read) - recipient address
# Arguments: $1 - new state (master|backup|fault)
#            $2 - floating VIP named in the subject
# Outputs:   pipes the message body to mail(1)
# Returns:   exit status of the mail pipeline
#######################################
notify() {
  local mailsubject mailbody
  mailsubject="$(hostname) to be $1: $2 floating"
  mailbody="$(date '+%F %H:%M:%S'): vrrp transition, $(hostname) changed to be $1"
  # Quote every expansion so the subject and recipient stay single arguments.
  printf '%s\n' "$mailbody" | mail -s "$mailsubject" "$contact"
}
# Dispatch on the VRRP state passed as $1; $2 is the VIP of the instance.
# Only the master branch touches haproxy; backup and fault just send mail.
case "$1" in
  master)
    notify master "$2"
    /etc/rc.d/init.d/haproxy start
    exit 0
    ;;
  backup)
    notify backup "$2"
    # /etc/rc.d/init.d/haproxy stop   # comment out or delete this line
    exit 0
    ;;
  fault)
    notify fault "$2"
    # /etc/rc.d/init.d/haproxy stop   # same as above
    exit 0
    ;;
  *)
    # The script name must be expanded outside single quotes, otherwise the
    # command substitution is printed literally.
    printf 'Usage: %s {master|backup|fault}\n' "${0##*/}" >&2
    exit 1
    ;;
esac
|
调整后的正常权重改变流程
vrrp_script中节点权重改变算法
vrrp_script 里的script返回值为0时认为检测成功,其它值都会当成检测失败;
- weight 为正时,脚本检测成功时此 weight 会加到 priority 上,检测失败时不加;
  - 主失败:主 priority < 从 priority + weight 时会切换。
  - 主成功:主 priority + weight > 从 priority + weight 时,主依然为主。
- weight 为负时,脚本检测成功时此 weight 不影响 priority,检测失败时 priority 变为 priority - abs(weight);
  - 主失败:主 priority - abs(weight) < 从 priority 时会切换主从。
  - 主成功:主 priority > 从 priority 时,主依然为主。