1 LVS(DR)
DR模型中各主机上均需要配置VIP,解决地址冲突的方式有三种:
(1) 在前端网关做静态绑定
(2) 在各RS使用arptables
(3) 在各RS修改内核参数,来限制arp响应和通告的级别
限制响应级别:arp_ignore
-
0:默认值,表示可使用本地任意接口上配置的任意地址进行响应
-
1:仅在请求的目标IP配置在本地主机的接收到请求报文的接口上时,才给予响应
限制通告级别:arp_announce
-
0:默认值,把本机所有接口的所有信息向每个接口的网络进行通告
-
1:尽量避免将接口信息向非直接连接网络进行通告
-
2:必须避免将接口信息向非本网络进行通告
配置要点
-
Director 服务器采用双IP桥接网络,一个是 VIP,一个 DIP
-
Web 服务器采用和 DIP 相同的网段和 Director 连接
-
每个 Web 服务器配置VIP
-
每个 web 服务器可以应答client的请求
-
每个 web 服务器的网关不能指向 DIP
LVS(DR)架构图
Router 设置
# 需要开启路由转发功能,实际生产中使用真实的路由器则不需此设置
[root@Router ~]# grep net.ipv4.ip_forward /etc/sysctl.conf
net.ipv4.ip_forward = 1
[root@Router ~]# sysctl -p
net.ipv4.ip_forward = 1
[root@Router ~]#
LVS 设置
# 在LVS主机运行的脚本
# 注意:VIP如果配置在LO网卡上,必须使用32bit子网掩码
# 如果VIP绑定在eth0上,可以使用其它netmask
[root@LVS ~]# cat Set_Lvs.sh
#!/bin/bash
#
# Configure this host as an LVS-DR director: bind the VIP on lo:1 and
# create the ipvsadm virtual service pointing at two real servers.
# Usage: Set_Lvs.sh start|START|up|UP | stop|STOP|down|DOWN
VIP="192.168.60.60"
PORT="80"
RS1="192.168.60.100"
RS2="192.168.60.200"
NET_INTERFACE="lo:1"
NETMASK=32                 # VIP bound on lo MUST use a 32-bit mask
MODE="-g"                  # -g = gateway (direct-routing / DR) mode
SCHEDULER="wrr"
# Address currently bound on lo:1 (empty when the alias does not exist)
Lo_Addr=$(ifconfig "${NET_INTERFACE}" | awk '/inet/{print $2}')
# Install ipvsadm if missing, and abort if it is still unavailable
# (the original silently ignored a failed install and then called it)
rpm -q ipvsadm &> /dev/null || yum -y install ipvsadm &> /dev/null
command -v ipvsadm &> /dev/null || { echo "ipvsadm is not installed" >&2; exit 1; }
case "$1" in
start|START|up|UP)
    ifconfig "${NET_INTERFACE}" "${VIP}/${NETMASK}"
    iptables -F
    ipvsadm -A -t "${VIP}:${PORT}" -s "${SCHEDULER}"
    ipvsadm -a -t "${VIP}:${PORT}" -r "${RS1}" "${MODE}" -w 1
    ipvsadm -a -t "${VIP}:${PORT}" -r "${RS2}" "${MODE}" -w 1
    echo -e "\033[1;33mThe LVS Server is Ready!\033[0m"
    ;;
stop|STOP|down|DOWN)
    # Only tear down when the VIP really is bound on lo:1
    if [[ "$VIP" == "$Lo_Addr" ]]; then
        ifconfig "${NET_INTERFACE}" down
        ipvsadm -C
        echo -e "\033[1;31mThe LVS Server is Canceled!\033[0m"
    else
        echo -e "\033[1;31mvip:$VIP address not exist,don't stop!\033[0m"
        exit 1
    fi
    ;;
*)
    echo -e "\033[1;32mUsage: $(basename "$0") start|START|up|UP|stop|STOP|down|DOWN\033[0m"
    exit 1
    ;;
esac
[root@LVS ~]#
RealServer 设置
注意:两台 RealServer 都要运行此脚本
root@RS2:~# cat Set_RealServer.sh
#!/bin/bash
#
# Configure this host as an LVS-DR real server: bind the VIP on lo:1
# and tune kernel ARP parameters so this host neither answers nor
# announces ARP for the VIP (avoids the DR-mode address conflict).
# Usage: Set_RealServer.sh start|START|up|UP | stop|STOP|down|DOWN
VIP="192.168.60.60"
NET_INTERFACE="lo:1"
NETMASK=32                          # VIP bound on lo MUST use a 32-bit mask
SET_ARP="/proc/sys/net/ipv4/conf"
# Address currently bound on lo:1 (empty when the alias does not exist)
Lo_Addr=$(ifconfig "${NET_INTERFACE}" | awk '/inet/{print $2}')
case "$1" in
start|START|up|UP)
    ifconfig "${NET_INTERFACE}" "${VIP}/${NETMASK}"
    # arp_ignore=1: reply only when the target IP is configured on the
    # receiving interface; arp_announce=2: always pick the best local
    # address for ARP announcements (never announce the VIP)
    echo 1 > ${SET_ARP}/all/arp_ignore
    echo 2 > ${SET_ARP}/all/arp_announce
    echo 1 > ${SET_ARP}/lo/arp_ignore
    echo 2 > ${SET_ARP}/lo/arp_announce
    echo -e "\033[1;33mThe RealServer is Ready!\033[0m"
    ;;
stop|STOP|down|DOWN)
    # Verify the VIP is actually bound BEFORE resetting anything; the
    # original reset the ARP sysctls unconditionally, leaving a
    # half-torn-down state when the VIP check then failed.
    if [[ "$VIP" == "$Lo_Addr" ]]; then
        echo 0 > ${SET_ARP}/all/arp_ignore
        echo 0 > ${SET_ARP}/all/arp_announce
        echo 0 > ${SET_ARP}/lo/arp_ignore
        echo 0 > ${SET_ARP}/lo/arp_announce
        ifconfig "${NET_INTERFACE}" down
        # Fixed message: this host is the RealServer, not the LVS server
        echo -e "\033[1;31mThe RealServer is Canceled!\033[0m"
    else
        echo -e "\033[1;31mvip:$VIP address not exist,don't stop!\033[0m"
        exit 1
    fi
    ;;
*)
    echo -e "\033[1;32mUsage: $(basename "$0") start|START|up|UP|stop|STOP|down|DOWN\033[0m"
    exit 1
    ;;
esac
root@RS2:~#
RealServer1 安装配置 nginx
服务
root@RS1:~# apt install nginx
root@RS1:~# cat /etc/nginx/conf.d/pc.conf
server {
listen 80;
server_name localhost;
location /{
root /data/nginx/pc;
index index.html;
}
}
root@RS1:~# mkdir -p /data/nginx/pc
root@RS1:~# echo "<h1>RS1 192.168.60.100</h1>" > /data/nginx/pc/index.html
root@RS1:~# cat /data/nginx/pc/index.html
<h1>RS1 192.168.60.100</h1>
root@RS1:~# systemctl enable --now nginx
RealServer2 安装配置 nginx
服务
root@RS2:~# apt install nginx
root@RS2:~# cat /etc/nginx/conf.d/pc.conf
server {
listen 80;
server_name localhost;
location /{
root /data/nginx/pc;
index index.html;
}
}
root@RS2:~# mkdir -p /data/nginx/pc
root@RS2:~# echo "<h1>RS2 192.168.60.200</h1>" > /data/nginx/pc/index.html
root@RS2:~# cat /data/nginx/pc/index.html
<h1>RS2 192.168.60.200</h1>
root@RS2:~# systemctl enable --now nginx
因为 LVS-DR
模式是通过为请求报文重新封装一个 MAC
首部进行转发,不会修改请求和应答 IP
,所以在 Realserver
上我们是可以看到客户端(Client)的请求 IP
# Client IP
[root@client ~]# ip a
1: lo: <LOOPBACK,UP,LOWER_UP> mtu 65536 qdisc noqueue state UNKNOWN group default qlen 1000
link/loopback 00:00:00:00:00:00 brd 00:00:00:00:00:00
inet 127.0.0.1/8 scope host lo
valid_lft forever preferred_lft forever
inet6 ::1/128 scope host
valid_lft forever preferred_lft forever
2: eth0: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 qdisc pfifo_fast state UP group default qlen 1000
link/ether 00:50:56:82:e0:18 brd ff:ff:ff:ff:ff:ff
inet 172.18.8.17/16 brd 172.18.255.255 scope global noprefixroute eth0
valid_lft forever preferred_lft forever
inet6 fe80::250:56ff:fe82:e018/64 scope link noprefixroute
valid_lft forever preferred_lft forever
[root@client ~]# curl 192.168.60.60
<h1>RS2 192.168.60.200</h1>
[root@client ~]# curl 192.168.60.60
<h1>RS1 192.168.60.100</h1>
[root@client ~]# curl 192.168.60.60
<h1>RS2 192.168.60.200</h1>
[root@client ~]# curl 192.168.60.60
<h1>RS1 192.168.60.100</h1>
[root@client ~]#
# 在 Realserver 上查看请求日志
root@RS2:/etc/nginx# tail -fn5 /var/log/nginx/access.log
172.18.8.17 - - [06/Oct/2022:10:06:28 +0000] "GET / HTTP/1.1" 200 28 "-" "curl/7.29.0"
172.18.8.17 - - [06/Oct/2022:10:06:57 +0000] "GET / HTTP/1.1" 200 28 "-" "curl/7.29.0"
172.18.8.17 - - [06/Oct/2022:10:09:16 +0000] "GET / HTTP/1.1" 200 28 "-" "curl/7.29.0"
172.18.8.17 - - [06/Oct/2022:10:09:18 +0000] "GET / HTTP/1.1" 200 28 "-" "curl/7.29.0"
172.18.8.17 - - [06/Oct/2022:10:17:04 +0000] "GET / HTTP/1.1" 200 28 "-" "curl/7.29.0"
我们通过使用 tcpdump
抓包,结合 wireshark
解读 LVS-DR
的三次握手过程
# 在 RealServer 上抓包
root@RS2:~# tcpdump -i eth1 -nn port 80 and host 172.18.8.17 -w lvs_dr.pcap
tcpdump: listening on eth1, link-type EN10MB (Ethernet), capture size 262144 bytes
^C10 packets captured
10 packets received by filter
0 packets dropped by kernel
root@RS2:~# du -sh lvs_dr.pcap
4.0K lvs_dr.pcap
将 lvs_dr.pcap
导入 wireshark
上图展示的是 Client
和 RealServer
建立握手的过程,你可能会有疑问,LVS
和 RealServer
都配置了 VIP
,你如何确认是和 RealServer
建立连接,而不是和 LVS
?
那是因为 LVS
是运行在此处就相当于一个路由器,不参与握手
如图:
可以通过 IP + MAC
来确认,握手的是 RealServer
,而不是 LVS
查看网络连接
我们可以使用 dd
命令来生成一个大文件,在客户端通过 wget
来下载这个文件,可以确认 Client
是直接与 RealServer
建立连接
RealServer1(192.168.60.100)
root@RS1:~# dd if=/dev/zero of=/data/nginx/pc/testfile bs=1M count=100
100+0 records in
100+0 records out
104857600 bytes (105 MB, 100 MiB) copied, 0.217004 s, 483 MB/s
root@RS1:~# ll /data/nginx/pc/
total 102412
drwxr-xr-x 2 root root 4096 Oct 13 19:26 ./
drwxr-xr-x 4 root root 4096 Jun 19 2021 ../
-rw-r--r-- 1 root root 28 Jun 19 2021 index.html
-rw-r--r-- 1 root root 104857600 Oct 13 19:26 testfile
root@RS1:~#
客户端(172.18.8.17)
[root@client ~]# wget --limit-rate 10k http://192.168.60.60/testfile
--2021-07-01 18:40:54-- http://192.168.60.60/testfile
正在连接 192.168.60.60:80... 已连接。
已发出 HTTP 请求,正在等待回应... 200 OK
长度:104857600 (100M) [application/octet-stream]
正在保存至: “testfile.1”
36% [======================> ] 38,035,456 10.0KB/s 剩余 1h 48m
# 另开一个 Client 窗口查看
[root@client ~]# ip a
1: lo: <LOOPBACK,UP,LOWER_UP> mtu 65536 qdisc noqueue state UNKNOWN group default qlen 1000
link/loopback 00:00:00:00:00:00 brd 00:00:00:00:00:00
inet 127.0.0.1/8 scope host lo
valid_lft forever preferred_lft forever
inet6 ::1/128 scope host
valid_lft forever preferred_lft forever
2: eth0: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 qdisc pfifo_fast state UP group default qlen 1000
link/ether 00:50:56:82:e0:18 brd ff:ff:ff:ff:ff:ff
inet 172.18.8.17/16 brd 172.18.255.255 scope global noprefixroute eth0
valid_lft forever preferred_lft forever
inet6 fe80::250:56ff:fe82:e018/64 scope link noprefixroute
valid_lft forever preferred_lft forever
[root@client ~]# ss -ant
State Recv-Q Send-Q Local Address:Port Peer Address:Port
LISTEN 0 128 *:22 *:*
LISTEN 0 100 127.0.0.1:25 *:*
ESTAB 0 0 172.18.8.17:22 172.18.60.171:49776
ESTAB 252040 0 172.18.8.17:38434 192.168.60.60:80 # CIP 和 VIP 的连接
LISTEN 0 128 [::]:22 [::]:*
LISTEN 0 100 [::1]:25 [::]:*
[root@client ~]#
RealServer1(192.168.60.100)的网络连接
root@RS1:~# ss -nt
State Recv-Q Send-Q Local Address:Port Peer Address:Port
ESTAB 0 0 192.168.60.100:22 172.18.60.171:50015
ESTAB 0 232104 192.168.60.60:80 172.18.8.17:38434 # VIP 和 CIP 的连接
SYN-SENT 0 1 192.168.60.100:42240 223.5.5.5:53
root@RS1:~#
因为没有调度到 RealServer2(192.168.60.200)
上,所以没有和 Client
的网络连接
root@RS2:~# ss -nt
State Recv-Q Send-Q Local Address:Port Peer Address:Port
ESTAB 0 0 192.168.60.200:22 172.18.60.171:50238
SYN-SENT 0 1 192.168.60.200:43766 180.76.76.76:53
root@RS2:~#
2 LVS(DR)+ Keepalive
在上面的架构中,我们的业务服务器(RealServer)做到了负载均衡和高可用
即,我们 DOWN
掉 RS1
后,LVS
不会再往 RS1
调度了
# Down 掉 RS1 的 Nginx 服务
root@RS1:~# systemctl stop nginx
root@RS1:~# ps -ef|grep nginx
root 15893 15514 0 16:39 pts/0 00:00:00 grep --color=auto nginx
root@RS1:~#
# 客户端访问测试
[root@client ~]# curl 192.168.60.60
<h1>RS2 192.168.60.200</h1>
[root@client ~]# curl 192.168.60.60
<h1>RS2 192.168.60.200</h1>
[root@client ~]# curl 192.168.60.60
<h1>RS2 192.168.60.200</h1>
[root@client ~]#
启动 RS1
的 Nginx
后,又做到了轮询
# 启动 RS1 的 Nginx 服务
root@RS1:~# systemctl start nginx
root@RS1:~# ps -ef|grep nginx |grep -v grep
root 15908 1 0 16:41 ? 00:00:00 nginx: master process /usr/sbin/nginx -g daemon on; master_process on;
www-data 15910 15908 0 16:41 ? 00:00:00 nginx: worker process
www-data 15912 15908 0 16:41 ? 00:00:00 nginx: worker process
www-data 15914 15908 0 16:41 ? 00:00:00 nginx: worker process
www-data 15916 15908 0 16:41 ? 00:00:00 nginx: worker process
root@RS1:~#
# 客户端访问测试
[root@client ~]# curl 192.168.60.60
<h1>RS1 192.168.60.100</h1>
[root@client ~]# curl 192.168.60.60
<h1>RS2 192.168.60.200</h1>
[root@client ~]# curl 192.168.60.60
<h1>RS1 192.168.60.100</h1>
[root@client ~]# curl 192.168.60.60
<h1>RS2 192.168.60.200</h1>
[root@client ~]#
但是,如果我们的 LVS
机器 DOWN
掉之后呢?
很显然,我们的业务将彻底无法访问
那么,有没有办法解决 LVS
的单点问题呢?
答案就是引入 Keepalive
高可用服务
架构如下
Keepalive设置
我们在部署一台 LVS
服务器,并且将 Keepalive
和 LVS
部署在一起
# LVS-1 配置
[root@LVS-1 ~]# dnf -y install keepalived
[root@LVS-1 ~]# cat /etc/keepalived/keepalived.conf
! Configuration File for keepalived
global_defs {
notification_email {
root@localhost
}
notification_email_from keepalived@localhost
smtp_server 127.0.0.1
smtp_connect_timeout 30
router_id lvs1
vrrp_mcast_group4 224.0.100.10
}
vrrp_instance VI_1 {
state MASTER
interface eth1
virtual_router_id 66
priority 100
advert_int 1
authentication {
auth_type PASS
auth_pass 123456
}
virtual_ipaddress {
#192.168.60.60 dev lo label lo:1
192.168.60.60/24 dev eth1 label eth1:1
}
notify_master "/etc/keepalived/notify.sh master"
notify_backup "/etc/keepalived/notify.sh backup"
notify_fault "/etc/keepalived/notify.sh fault"
}
virtual_server 192.168.60.60 80 {
delay_loop 3
lb_algo rr
lb_kind DR
protocol TCP
sorry_server 127.0.0.1 80
real_server 192.168.60.100 80 {
weight 1
HTTP_GET {
url {
path /
status_code 200
}
connect_timeout 1
nb_get_retry 3
delay_before_retry 1
}
}
real_server 192.168.60.200 80 {
weight 1
TCP_CHECK {
connect_timeout 5
nb_get_retry 3
delay_before_retry 3
connect_port 80
}
}
}
[root@LVS-1 ~]# systemctl start keepalived
# LVS-2 配置
[root@LVS-2 ~]# dnf -y install keepalived
[root@LVS-2 ~]# cat /etc/keepalived/keepalived.conf
! Configuration File for keepalived
global_defs {
notification_email {
root@localhost
}
notification_email_from keepalived@localhost
smtp_server 127.0.0.1
smtp_connect_timeout 30
router_id lvs2
vrrp_mcast_group4 224.0.100.10
}
vrrp_instance VI_1 {
state BACKUP
interface eth1
virtual_router_id 66
priority 80
advert_int 1
authentication {
auth_type PASS
auth_pass 123456
}
virtual_ipaddress {
192.168.60.60/24 dev eth1 label eth1:1
}
notify_master "/etc/keepalived/notify.sh master"
notify_backup "/etc/keepalived/notify.sh backup"
notify_fault "/etc/keepalived/notify.sh fault"
}
virtual_server 192.168.60.60 80 {
delay_loop 3
lb_algo rr
lb_kind DR
protocol TCP
sorry_server 127.0.0.1 80
real_server 192.168.60.100 80 {
weight 1
HTTP_GET {
url {
path /
status_code 200
}
connect_timeout 1
nb_get_retry 3
delay_before_retry 1
}
}
real_server 192.168.60.200 80 {
weight 1
TCP_CHECK {
connect_timeout 5
nb_get_retry 3
delay_before_retry 3
connect_port 80
}
}
}
[root@LVS-2 ~]# systemctl start keepalived
设置完 lvs + keepalive
如果没有设置过 RealServer
的话,我们还需要做如下设置
RealServer 设置
注意:两台 RealServer 都要运行此脚本
root@RS2:~# cat Set_RealServer.sh
#!/bin/bash
#
# Configure this host as an LVS-DR real server: bind the VIP on lo:1
# and tune kernel ARP parameters so this host neither answers nor
# announces ARP for the VIP (avoids the DR-mode address conflict).
# Usage: Set_RealServer.sh start|START|up|UP | stop|STOP|down|DOWN
VIP="192.168.60.60"
NET_INTERFACE="lo:1"
NETMASK=32                          # VIP bound on lo MUST use a 32-bit mask
SET_ARP="/proc/sys/net/ipv4/conf"
# Address currently bound on lo:1 (empty when the alias does not exist)
Lo_Addr=$(ifconfig "${NET_INTERFACE}" | awk '/inet/{print $2}')
case "$1" in
start|START|up|UP)
    ifconfig "${NET_INTERFACE}" "${VIP}/${NETMASK}"
    # arp_ignore=1: reply only when the target IP is configured on the
    # receiving interface; arp_announce=2: always pick the best local
    # address for ARP announcements (never announce the VIP)
    echo 1 > ${SET_ARP}/all/arp_ignore
    echo 2 > ${SET_ARP}/all/arp_announce
    echo 1 > ${SET_ARP}/lo/arp_ignore
    echo 2 > ${SET_ARP}/lo/arp_announce
    echo -e "\033[1;33mThe RealServer is Ready!\033[0m"
    ;;
stop|STOP|down|DOWN)
    # Verify the VIP is actually bound BEFORE resetting anything; the
    # original reset the ARP sysctls unconditionally, leaving a
    # half-torn-down state when the VIP check then failed.
    if [[ "$VIP" == "$Lo_Addr" ]]; then
        echo 0 > ${SET_ARP}/all/arp_ignore
        echo 0 > ${SET_ARP}/all/arp_announce
        echo 0 > ${SET_ARP}/lo/arp_ignore
        echo 0 > ${SET_ARP}/lo/arp_announce
        ifconfig "${NET_INTERFACE}" down
        # Fixed message: this host is the RealServer, not the LVS server
        echo -e "\033[1;31mThe RealServer is Canceled!\033[0m"
    else
        echo -e "\033[1;31mvip:$VIP address not exist,don't stop!\033[0m"
        exit 1
    fi
    ;;
*)
    echo -e "\033[1;32mUsage: $(basename "$0") start|START|up|UP|stop|STOP|down|DOWN\033[0m"
    exit 1
    ;;
esac
root@RS2:~#
至此,设置完成
因为 LVS-1
是主机(优先级是 100,高于 LVS-2)我们可以登录 LVS-1
查看 VIP
绑定以及 lvs-dr
规则
# 查看 VIP:192.168.60.60 是否存在
[root@LVS-1 ~]# ip a
1: lo: <LOOPBACK,UP,LOWER_UP> mtu 65536 qdisc noqueue state UNKNOWN group default qlen 1000
link/loopback 00:00:00:00:00:00 brd 00:00:00:00:00:00
inet 127.0.0.1/8 scope host lo
valid_lft forever preferred_lft forever
inet6 ::1/128 scope host
valid_lft forever preferred_lft forever
2: eth0: <NO-CARRIER,BROADCAST,MULTICAST,UP> mtu 1500 qdisc mq state DOWN group default qlen 1000
link/ether 00:50:56:82:98:c3 brd ff:ff:ff:ff:ff:ff
3: eth1: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 qdisc mq state UP group default qlen 1000
link/ether 00:50:56:a3:e8:d2 brd ff:ff:ff:ff:ff:ff
inet 192.168.60.80/24 brd 192.168.60.255 scope global noprefixroute eth1
valid_lft forever preferred_lft forever
inet 192.168.60.60/32 scope global eth1:1
valid_lft forever preferred_lft forever
inet6 fe80::6c06:303c:a126:d35b/64 scope link noprefixroute
valid_lft forever preferred_lft forever
[root@LVS-1 ~]#
# 查看 lvs-dr 规则
[root@LVS-1 ~]# ipvsadm -Ln
IP Virtual Server version 1.2.1 (size=4096)
Prot LocalAddress:Port Scheduler Flags
-> RemoteAddress:Port Forward Weight ActiveConn InActConn
TCP 192.168.60.60:80 rr
-> 192.168.60.100:80 Route 1 0 0
-> 192.168.60.200:80 Route 1 0 0
[root@LVS-1 ~]#
此时 LVS-2
没有 VIP
,虽说有 lvs-dr
规则,但并不会生效
[root@LVS-2 ~]# ip a
1: lo: <LOOPBACK,UP,LOWER_UP> mtu 65536 qdisc noqueue state UNKNOWN group default qlen 1000
link/loopback 00:00:00:00:00:00 brd 00:00:00:00:00:00
inet 127.0.0.1/8 scope host lo
valid_lft forever preferred_lft forever
inet6 ::1/128 scope host
valid_lft forever preferred_lft forever
2: eth0: <NO-CARRIER,BROADCAST,MULTICAST,UP> mtu 1500 qdisc mq state DOWN group default qlen 1000
link/ether 00:50:56:a3:07:c1 brd ff:ff:ff:ff:ff:ff
3: eth1: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 qdisc mq state UP group default qlen 1000
link/ether 00:50:56:a3:4c:74 brd ff:ff:ff:ff:ff:ff
inet 192.168.60.88/24 brd 192.168.60.255 scope global noprefixroute eth1
valid_lft forever preferred_lft forever
inet6 fe80::5745:92ae:e725:b669/64 scope link noprefixroute
valid_lft forever preferred_lft forever
[root@LVS-2 ~]# ipvsadm -Ln
IP Virtual Server version 1.2.1 (size=4096)
Prot LocalAddress:Port Scheduler Flags
-> RemoteAddress:Port Forward Weight ActiveConn InActConn
TCP 192.168.60.60:80 rr
-> 192.168.60.100:80 Route 1 0 0
-> 192.168.60.200:80 Route 1 0 0
[root@LVS-2 ~]#
如果 LVS-1
发生故障,则 VIP
会飘向 LVS-2
并且 lvs-dr
规则生效
# LVS-1 操作
[root@LVS-1 ~]# systemctl stop keepalived
[root@LVS-1 ~]# ip a
1: lo: <LOOPBACK,UP,LOWER_UP> mtu 65536 qdisc noqueue state UNKNOWN group default qlen 1000
link/loopback 00:00:00:00:00:00 brd 00:00:00:00:00:00
inet 127.0.0.1/8 scope host lo
valid_lft forever preferred_lft forever
inet6 ::1/128 scope host
valid_lft forever preferred_lft forever
2: eth0: <NO-CARRIER,BROADCAST,MULTICAST,UP> mtu 1500 qdisc mq state DOWN group default qlen 1000
link/ether 00:50:56:82:98:c3 brd ff:ff:ff:ff:ff:ff
3: eth1: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 qdisc mq state UP group default qlen 1000
link/ether 00:50:56:a3:e8:d2 brd ff:ff:ff:ff:ff:ff
inet 192.168.60.80/24 brd 192.168.60.255 scope global noprefixroute eth1
valid_lft forever preferred_lft forever
inet6 fe80::6c06:303c:a126:d35b/64 scope link noprefixroute
valid_lft forever preferred_lft forever
[root@LVS-1 ~]# ipvsadm -Ln
IP Virtual Server version 1.2.1 (size=4096)
Prot LocalAddress:Port Scheduler Flags
-> RemoteAddress:Port Forward Weight ActiveConn InActConn
[root@LVS-1 ~]#
# 查看 LVS-2(确认 VIP:192.168.60.60已存在)
[root@LVS-2 ~]# ip a
1: lo: <LOOPBACK,UP,LOWER_UP> mtu 65536 qdisc noqueue state UNKNOWN group default qlen 1000
link/loopback 00:00:00:00:00:00 brd 00:00:00:00:00:00
inet 127.0.0.1/8 scope host lo
valid_lft forever preferred_lft forever
inet6 ::1/128 scope host
valid_lft forever preferred_lft forever
2: eth0: <NO-CARRIER,BROADCAST,MULTICAST,UP> mtu 1500 qdisc mq state DOWN group default qlen 1000
link/ether 00:50:56:a3:07:c1 brd ff:ff:ff:ff:ff:ff
3: eth1: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 qdisc mq state UP group default qlen 1000
link/ether 00:50:56:a3:4c:74 brd ff:ff:ff:ff:ff:ff
inet 192.168.60.88/24 brd 192.168.60.255 scope global noprefixroute eth1
valid_lft forever preferred_lft forever
inet 192.168.60.60/24 scope global secondary eth1:1
valid_lft forever preferred_lft forever
inet6 fe80::5745:92ae:e725:b669/64 scope link noprefixroute
valid_lft forever preferred_lft forever
[root@LVS-2 ~]# ipvsadm -Ln
IP Virtual Server version 1.2.1 (size=4096)
Prot LocalAddress:Port Scheduler Flags
-> RemoteAddress:Port Forward Weight ActiveConn InActConn
TCP 192.168.60.60:80 rr
-> 192.168.60.100:80 Route 1 0 0
-> 192.168.60.200:80 Route 1 0 0
[root@LVS-2 ~]#
因为 Keepalived
默认采用的是抢占式,LVS-1
服务器恢复并启动 Keepalived
服务后,VIP
会被 LVS-1 重新抢回,此时再次由 LVS-1
提供服务
3 Zabbix 监控 Keepalive 脑裂
我们在生产中使用了一段时间的 Keepalive
,发现了一个问题,就是出现了两台 Keepalive
上都有 VIP
的存在,即我们经常所说的“脑裂”问题,我们排查之后发现,这样被动的发现问题的方法很蠢,所以领导要求我们主动出击,第一时间发现问题,避免对线上业务造成不可估量的损失,为此,我们发挥主观能动性,引入 Zabbix
监控
3.1 Zabbix Server 的安装
注:Zabbix Server
和 Route
部署在一台机器上,IP:172.18.8.18
3.2 Zabbix Agent 安装
在两台 LVS + Keepalive
机器上部署 zabbix_agent
# 我在编译安装 zabbix_server 的时候,也将 agent 启用了
# 所以,直接将在 zabbix_server 编译好的 agent 拷贝过来就可以使用
# 或者你直接 yum 安装
# 或者单独编译 zabbix_agentd
[root@LVS-1 ~]# mkdir -p /apps
[root@LVS-1 ~]# cd /apps/
[root@LVS-1 apps]# scp -r 172.18.8.18:/apps/zabbix ./
root@172.18.8.18's password:
zabbix_agentd 100% 2265KB 46.4MB/s 00:00
zabbix_server 100% 14MB 57.1MB/s 00:00
zabbix_server.conf 100% 24KB 245.3KB/s 00:00
zabbix_agentd.conf 100% 15KB 3.9MB/s 00:00
zabbix_get 100% 1090KB 9.0MB/s 00:00
zabbix_sender 100% 1127KB 75.2MB/s 00:00
zabbix_js 100% 4056KB 92.3MB/s 00:00
zabbix_get.1 100% 4929 1.0MB/s 00:00
zabbix_sender.1 100% 14KB 4.1MB/s 00:00
zabbix_agentd.8 100% 3927 46.7KB/s 00:00
zabbix_server.8 100% 3775 2.8MB/s 00:00
zabbix_server.log 100% 365KB 11.2MB/s 00:00
zabbix_agentd.log 100% 212KB 10.0MB/s 00:00
zabbix_server.pid 100% 5 4.0KB/s 00:00
zabbix_agentd.pid 100% 5 4.9KB/s 00:00
[root@LVS-1 apps]# scp 172.18.8.18:/usr/lib/systemd/system/zabbix_agent.service /usr/lib/systemd/system/
root@172.18.8.18's password:
zabbix_agent.service 100% 391 23.0KB/s 00:00
[root@LVS-1 apps]#
启动 zabbix_agent
[root@LVS-1 ~]# groupadd --system zabbix
[root@LVS-1 ~]# useradd --system -g zabbix -d /usr/lib/zabbix -s /sbin/nologin -c "Zabbix Monitoring System" zabbix
[root@LVS-1 ~]# chown -R zabbix.zabbix /apps/zabbix/
[root@LVS-1 ~]# hostName=$(hostname -I|awk '{print $1}')
[root@LVS-1 ~]# sed -i '/^Hostname=Zabbix Agent/c Hostname='${hostName}'' /apps/zabbix/etc/zabbix_agentd.conf
[root@LVS-1 ~]# sed -i '/^Server=127.0.0.1/c Server=172.18.8.18,192.168.60.1' /apps/zabbix/etc/zabbix_agentd.conf
[root@LVS-1 ~]# sed -i '/^ServerActive=127.0.0.1/c ServerActive=172.18.8.18,192.168.60.1' /apps/zabbix/etc/zabbix_agentd.conf
[root@LVS-1 ~]# systemctl daemon-reload
[root@LVS-1 ~]# systemctl start zabbix_agent
[root@LVS-1 ~]# ps -ef|grep zabbix|grep -v grep
zabbix 6625 1 0 23:03 ? 00:00:00 /apps/zabbix/sbin/zabbix_agentd -c /apps/zabbix/etc/zabbix_agentd.conf
zabbix 6626 6625 0 23:03 ? 00:00:00 /apps/zabbix/sbin/zabbix_agentd: collector [idle 1 sec]
zabbix 6627 6625 0 23:03 ? 00:00:00 /apps/zabbix/sbin/zabbix_agentd: listener #1 [waiting for connection]
zabbix 6628 6625 0 23:03 ? 00:00:00 /apps/zabbix/sbin/zabbix_agentd: listener #2 [waiting for connection]
zabbix 6629 6625 0 23:03 ? 00:00:00 /apps/zabbix/sbin/zabbix_agentd: listener #3 [waiting for connection]
zabbix 6630 6625 0 23:03 ? 00:00:00 /apps/zabbix/sbin/zabbix_agentd: active checks #1 [idle 1 sec]
[root@LVS-1 ~]#
3.3 将 agent 加入到 Zabbix Server 的设置步骤如下
添加hosts
添加监控模板
3.4 在 Zabbix 中添加监控项和报警设置
自定义报警的key
[root@Router ~]# cat /apps/zabbix/etc/zabbix_agentd.conf.d/check_keepalived.conf
UserParameter=check_keepalived[*],/bin/bash /apps/zabbix/etc/zabbix_agentd.conf.d/check_vip.sh
自定义监控脚本
备机(LVS-2)上存在VIP
有两种情况
第一种:主机(LVS-1)真实发生了宕机或者服务故障,导致VIP
飘到了备机(LVS-2)上
第二种:因为KeepAlive
配置问题或防火墙(iptables)等原因导致主机和备机之间的心跳链路通信出现了故障,即,Keepalive
产生了“脑裂”
网上有些方法是监控备机出现了VIP
的情况就判断发生了“脑裂”,这种方法是容易产生误报的
我们要监控的其实就只有第二种情况,此时我们将脚本部署在第三台中立的机器上,采用arping
命令,如果解析出两个MAC
,即,确认产生了“脑裂”。
[root@Router ~]# cat /apps/zabbix/etc/zabbix_agentd.conf.d/check_vip.sh
#!/bin/bash
#
# Zabbix item: detect keepalived split-brain from a neutral third host.
# arping the VIP once and count how many DISTINCT MAC addresses reply:
# more than one responder means both keepalived nodes hold the VIP.
# Output: 1 = split brain detected, 0 = normal.
vip="192.168.60.60"
net="eth0"
# sort -u counts unique MACs, so duplicate replies from the same MAC
# (the original counted raw reply lines) cannot trigger a false alarm
checkMac=$(arping -c 1 -I "$net" "$vip" | awk -F"[][]" '/Unicast/{print $2}' | sort -u | wc -l)
if [[ $checkMac -gt 1 ]]; then
    result=1
else
    result=0
fi
echo "$result"
添加监控项
报警阈值设置
3.5 模拟 keepalive
“脑裂”
在备机(LVS-2)的机器上设置防火墙,拒绝接收主机(LVS-1)的数据包,VIP(192.168.60.60)
飘到了 LVS-2
上,产生了“脑裂”
[root@LVS-2 ~]# iptables -t filter -A INPUT -s 192.168.60.80 -j DROP
[root@LVS-2 ~]# ip a
1: lo: <LOOPBACK,UP,LOWER_UP> mtu 65536 qdisc noqueue state UNKNOWN group default qlen 1000
link/loopback 00:00:00:00:00:00 brd 00:00:00:00:00:00
inet 127.0.0.1/8 scope host lo
valid_lft forever preferred_lft forever
inet6 ::1/128 scope host
valid_lft forever preferred_lft forever
2: eth0: <BROADCAST,MULTICAST> mtu 1500 qdisc mq state DOWN group default qlen 1000
link/ether 00:50:56:a3:07:c1 brd ff:ff:ff:ff:ff:ff
3: eth1: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 qdisc mq state UP group default qlen 1000
link/ether 00:50:56:a3:4c:74 brd ff:ff:ff:ff:ff:ff
inet 192.168.60.88/24 brd 192.168.60.255 scope global noprefixroute eth1
valid_lft forever preferred_lft forever
inet 192.168.60.60/24 scope global secondary eth1:1 # VIP(192.168.60.60) 也飘到了 LVS-2 上面,此时发生了“脑裂”
valid_lft forever preferred_lft forever
inet6 fe80::5745:92ae:e725:b669/64 scope link noprefixroute
valid_lft forever preferred_lft forever
此时会收到一封 keepalive
“脑裂”的邮件
收到邮件后,我们再清除 iptables
规则,在 LVS-2
上已经没有了 VIP(192.168.60.60)
[root@LVS-2 ~]# iptables -F
[root@LVS-2 ~]# ip a
1: lo: <LOOPBACK,UP,LOWER_UP> mtu 65536 qdisc noqueue state UNKNOWN group default qlen 1000
link/loopback 00:00:00:00:00:00 brd 00:00:00:00:00:00
inet 127.0.0.1/8 scope host lo
valid_lft forever preferred_lft forever
inet6 ::1/128 scope host
valid_lft forever preferred_lft forever
2: eth0: <BROADCAST,MULTICAST> mtu 1500 qdisc mq state DOWN group default qlen 1000
link/ether 00:50:56:a3:07:c1 brd ff:ff:ff:ff:ff:ff
3: eth1: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 qdisc mq state UP group default qlen 1000
link/ether 00:50:56:a3:4c:74 brd ff:ff:ff:ff:ff:ff
inet 192.168.60.88/24 brd 192.168.60.255 scope global noprefixroute eth1
valid_lft forever preferred_lft forever
inet6 fe80::5745:92ae:e725:b669/64 scope link noprefixroute
valid_lft forever preferred_lft forever
[root@LVS-2 ~]#
此时会收到一封恢复的邮件