[root@vm511 ~]# kubectl get pods --all-namespaces -o wide
NAMESPACE     NAME                            READY   STATUS             RESTARTS   AGE   IP              NODE    NOMINATED NODE   READINESS GATES
kube-system   calico-node-f4228               2/2     Running            7          79m   192.168.2.106   vm511   <none>           <none>
kube-system   calico-node-gbqpd               2/2     Running            0          56m   192.168.2.109   vm514   <none>           <none>
kube-system   calico-node-mqr5x               1/2     CrashLoopBackOff   8          17m   192.168.2.107   vm512   <none>           <none>
kube-system   calico-node-nsj6q               2/2     Running            0          62m   192.168.2.108   vm513   <none>           <none>
kube-system   calico-node-vf2tn               1/2     CrashLoopBackOff   8          17m   192.168.2.110   vm515   <none>           <none>
kube-system   calico-typha-666749994b-hrxsx   1/1     Running            0          79m   192.168.2.107   vm512   <none>           <none>
kube-system   coredns-8567978547-5xm6x        1/1     Running            0          96m   172.22.0.2      vm511   <none>           <none>
Two of the nodes keep going into CrashLoopBackOff. Following the logs:
[root@vm511 ~]# kubectl log -f -n kube-system calico-node-mqr5x -c calico-node
log is DEPRECATED and will be removed in a future version. Use logs instead.
2020-07-03 03:31:48.926 [INFO][8] startup.go 251: Early log level set to info
2020-07-03 03:31:48.927 [INFO][8] startup.go 267: Using NODENAME environment for node name
2020-07-03 03:31:48.927 [INFO][8] startup.go 279: Determined node name: vm512
2020-07-03 03:31:48.931 [INFO][8] startup.go 302: Checking datastore connection
2020-07-03 03:31:48.945 [INFO][8] startup.go 326: Datastore connection verified
2020-07-03 03:31:48.945 [INFO][8] startup.go 99: Datastore is ready
2020-07-03 03:31:48.961 [INFO][8] startup.go 564: Using autodetected IPv4 address on interface br-2433bb129402: 172.18.0.1/16
2020-07-03 03:31:48.961 [INFO][8] startup.go 432: Node IPv4 changed, will check for conflicts
2020-07-03 03:31:48.966 [WARNING][8] startup.go 861: Calico node 'vm511' is already using the IPv4 address 172.18.0.1.
2020-07-03 03:31:48.966 [INFO][8] startup.go 205: Clearing out-of-date IPv4 address from this node IP="172.18.0.1/16"
2020-07-03 03:31:48.978 [WARNING][8] startup.go 1058: Terminating
Calico node failed to start
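For a pod stuck in CrashLoopBackOff the container may already have exited by the time you attach, so it can also help to describe the pod and pull the previous container's output (a diagnostic sketch, using the pod name from the listing above):
kubectl describe pod -n kube-system calico-node-mqr5x
kubectl logs -n kube-system calico-node-mqr5x -c calico-node --previous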
Analysis: the node has a stray br-xxxx bridge interface, and Calico's IPv4 autodetection picked up its address 172.18.0.1/16 instead of the host NIC's address. Because vm511 has already registered 172.18.0.1, the node sees an address conflict and terminates. The interfaces on vm512:
[root@vm512 kubernetes-ha-kubeadm]# ip a
1: lo: <LOOPBACK,UP,LOWER_UP> mtu 65536 qdisc noqueue state UNKNOWN group default qlen 1000
    link/loopback 00:00:00:00:00:00 brd 00:00:00:00:00:00
    inet 127.0.0.1/8 scope host lo
       valid_lft forever preferred_lft forever
    inet6 ::1/128 scope host
       valid_lft forever preferred_lft forever
2: ens33: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 qdisc pfifo_fast state UP group default qlen 1000
    link/ether 00:0c:29:d5:08:21 brd ff:ff:ff:ff:ff:ff
    inet 192.168.2.107/24 brd 192.168.2.255 scope global noprefixroute ens33
       valid_lft forever preferred_lft forever
    inet6 fe80::6151:3703:6be0:de94/64 scope link noprefixroute
       valid_lft forever preferred_lft forever
3: br-2433bb129402: <NO-CARRIER,BROADCAST,MULTICAST,UP> mtu 1500 qdisc noqueue state DOWN group default
    link/ether 02:42:fe:8f:e0:bd brd ff:ff:ff:ff:ff:ff
    inet 172.18.0.1/16 brd 172.18.255.255 scope global br-2433bb129402
       valid_lft forever preferred_lft forever
4: docker0: <NO-CARRIER,BROADCAST,MULTICAST,UP> mtu 1500 qdisc noqueue state DOWN group default
    link/ether 02:42:67:49:b3:8d brd ff:ff:ff:ff:ff:ff
    inet 172.17.0.1/16 brd 172.17.255.255 scope global docker0
       valid_lft forever preferred_lft forever
5: docker_gwbridge: <NO-CARRIER,BROADCAST,MULTICAST,UP> mtu 1500 qdisc noqueue state DOWN group default
    link/ether 02:42:2e:f1:9e:30 brd ff:ff:ff:ff:ff:ff
    inet 172.19.0.1/16 brd 172.19.255.255 scope global docker_gwbridge
       valid_lft forever preferred_lft forever
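A bridge named br-<hex> is normally created by Docker for a user-defined network, with the hex suffix being the prefix of the network ID. Assuming Docker is still managing such a network on this node, it can be identified before removing anything:
docker network ls
docker network inspect 2433bb129402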
Remove the stray bridge interface.
1. Check the bridge status:
brctl show
[root@vm512 kubernetes-ha-kubeadm]# brctl show
bridge name       bridge id           STP enabled   interfaces
br-2433bb129402   8000.0242fe8fe0bd   no
docker0           8000.02426749b38d   no
docker_gwbridge   8000.02422ef19e30   no
2. Bring down the br-2433bb129402 bridge:
ifconfig br-2433bb129402 down
3. Delete the bridge:
brctl delbr br-2433bb129402
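brctl ships with the bridge-utils package; on hosts where it is not available, the equivalent iproute2 commands are (a sketch, same interface name assumed):
ip link set br-2433bb129402 down
ip link delete br-2433bb129402 type bridge
Note that if the bridge still belongs to a defined Docker network, Docker may recreate it later; removing the network itself (docker network rm <network-id>) is the more durable cleanup.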
Clean up the stray interfaces on the other affected nodes in the same way. If any calico-node pods are still stuck in CrashLoopBackOff afterwards, delete them with a command like the following:
kubectl delete pod calico-node-xxxxx -n kube-system
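calico-node is managed by a DaemonSet, so a deleted pod is recreated automatically. To watch the replacements come up (assuming the standard k8s-app=calico-node label from the Calico manifest):
kubectl get pods -n kube-system -l k8s-app=calico-node -o wide -w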
Then check the cluster status again:
[root@vm511 ~]# kubectl get pods --all-namespaces -o wide
NAMESPACE     NAME                            READY   STATUS    RESTARTS   AGE    IP              NODE    NOMINATED NODE   READINESS GATES
kube-system   calico-node-f4228               2/2     Running   7          103m   192.168.2.106   vm511   <none>           <none>
kube-system   calico-node-gbqpd               2/2     Running   0          80m    192.168.2.109   vm514   <none>           <none>
kube-system   calico-node-mqr5x               2/2     Running   11         42m    192.168.2.107   vm512   <none>           <none>
kube-system   calico-node-n96tl               1/2     Running   0          6s     192.168.2.110   vm515   <none>           <none>
kube-system   calico-node-nsj6q               2/2     Running   0          87m    192.168.2.108   vm513   <none>           <none>
kube-system   calico-typha-666749994b-hrxsx   1/1     Running   0          103m   192.168.2.107   vm512   <none>           <none>
kube-system   coredns-8567978547-5xm6x        1/1     Running   0          121m   172.22.0.2      vm511   <none>           <none>
All calico-node pods are back to normal.
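As a follow-up hardening step (a sketch, assuming the DaemonSet is named calico-node and the host NIC is ens33 as above), Calico's IP autodetection can be pinned to the real interface so that a stray bridge cannot be picked up again:
kubectl set env daemonset/calico-node -n kube-system IP_AUTODETECTION_METHOD=interface=ens33
Calico also supports other autodetection methods, such as can-reach=<destination>, if the interface name differs across nodes.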