
1. Cluster Environment
k8s version: v1.15.4
Cluster composition: 3 controller + 5 worker + 2 edge nodes
2. Error Information
Discovery scenario: the cluster was being upgraded to a new k8s version, node by node, in the order controller -> worker -> edge. Before upgrading a node, its traffic must first be drained. The controller nodes were upgraded successfully, but the upgrade got stuck while draining worker01.
[root@controller01 ~]$ kubectl get nodes
NAME           STATUS                     ROLES    AGE   VERSION
controller01   Ready                      <none>   69d   v1.16.4
controller02   Ready                      <none>   69d   v1.16.4
controller03   Ready                      <none>   69d   v1.16.4
edge01         Ready                      <none>   69d   v1.15.4
edge02         Ready                      <none>   69d   v1.15.4
worker01       Ready,SchedulingDisabled   <none>   69d   v1.15.4
worker02       Ready                      <none>   69d   v1.15.4
worker03       Ready                      <none>   69d   v1.15.4
worker04       Ready                      <none>   69d   v1.15.4
worker05       Ready                      <none>   21d   v1.15.4
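For reference, the per-node procedure being followed is roughly the sequence below (a sketch: the drain flags match the command used in this cluster, while the actual package/binary upgrade step depends on the installer):
# cordon + drain the node before upgrading it
kubectl drain --force --ignore-daemonsets --delete-local-data worker01
# ... upgrade the kubelet/runtime on worker01 (installer-specific) ...
# put the node back into rotation after the upgrade
kubectl uncordon worker01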
Error message: evicting pod acp-etcdv2-2 fails, and k8s complains that the eviction would violate the pod's PDB (pod disruption budget).
[root@controller01 ~]$ /usr/local/bin/kubectl drain --force --ignore-daemonsets --delete-local-data worker01
node/worker01 already cordoned
WARNING: ignoring DaemonSet-managed Pods: kube-system/kube-proxy-zjbwp, kube-system/local-volume-provisioner-ccvwz, ncms/bcmt-ip-man-agent-2v4rv, ncms/bcmt-operator-tlk5k, paas/belk-efkc-belk-fluentd-daemonset-bkqsb, paas/csf-compaas-cluster-cpro-node-exporter-n9mw4, rook-ceph/csi-rbdplugin-z6xch, rook-ceph/rook-discover-65gs9
evicting pod "acp-etcdv2-2"
error when evicting pod "acp-etcdv2-2" (will retry after 5s): Cannot evict pod as it would violate the pod's disruption budget.
evicting pod "acp-etcdv2-2"
error when evicting pod "acp-etcdv2-2" (will retry after 5s): Cannot evict pod as it would violate the pod's disruption budget.
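When an eviction is rejected like this, a useful first step is to find out which PDB selects the pod and how many disruptions it currently allows (a sketch; the namespace and pod name are the ones identified later in this post):
# list every PDB and its currently allowed disruptions
kubectl get pdb --all-namespaces
# check which PDB's selector matches the labels of the pod being evicted
kubectl get pod acp-etcdv2-2 -n dis-apcore --show-labels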
3. Debugging Process
For background on the PDB concept, see the Kubernetes documentation on Disruptions (kubernetes.io). In short, a PDB defines how many pods of a replicated application are allowed to be down at a time, or equivalently how many must stay running, for the application to keep providing normal service.
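As a minimal, hypothetical illustration (the names here are made up, not from this cluster): a PDB with minAvailable: 2 over a 3-replica application allows at most one pod to be voluntarily disrupted at a time, and only while the others are healthy.
# hypothetical example: keep at least 2 pods with label app=my-app available
kubectl apply -f - <<'EOF'
apiVersion: policy/v1beta1
kind: PodDisruptionBudget
metadata:
  name: my-app-pdb
spec:
  minAvailable: 2
  selector:
    matchLabels:
      app: my-app
EOF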
First, check the user-defined PDB: spec.minAvailable = 2, while the total number of pods is 3.
[root@controller03 /var/log]$ kubectl get pdb -ndis-apcore acp-etcdv2 -o yaml
apiVersion: policy/v1beta1
kind: PodDisruptionBudget
metadata:
  creationTimestamp: "2020-08-20T11:25:53Z"
  generation: 1
  labels:
    app: acp-etcdv2
    chart: acp-etcdv2-20.3.300
    heritage: Tiller
    release: acp-etcdv2
  name: acp-etcdv2
  namespace: dis-apcore
  resourceVersion: "134546556"
  selfLink: /apis/policy/v1beta1/namespaces/dis-apcore/poddisruptionbudgets/acp-etcdv2
  uid: b9401269-e194-4daa-ade8-6271078f8ea8
spec:
  minAvailable: 2
  selector:
    matchLabels:
      app: acp-etcdv2
      release: acp-etcdv2
status:
  currentHealthy: 2
  desiredHealthy: 2
  disruptionsAllowed: 0
  expectedPods: 3
  observedGeneration: 1
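Note the arithmetic behind the status: disruptionsAllowed = currentHealthy - minAvailable = 2 - 2 = 0, so no voluntary eviction of a matching pod is allowed even though expectedPods is 3. A quicker way to read just these counters is a jsonpath query (a sketch):
# print only the PDB status counters
kubectl get pdb -n dis-apcore acp-etcdv2 \
  -o jsonpath='{.status.currentHealthy}/{.status.expectedPods} healthy, {.status.disruptionsAllowed} disruptions allowed{"\n"}'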
Check the pods: all three show a Running status:
[root@controller02 ~]$ kubectl get po -n dis-apcore -l app=acp-etcdv2
NAME           READY   STATUS    RESTARTS   AGE
acp-etcdv2-0   1/1     Running   0          56d
acp-etcdv2-1   1/1     Running   0          45h
acp-etcdv2-2   1/1     Running   0          56d
However, the StatefulSet acp-etcdv2 that owns these pods reports only 2 of the 3 pods as ready.
[root@controller02 ~]$ kubectl get sts -ndis-apcore
NAME         READY   AGE
acp-etcdv2   2/3     69d
Using kubectl get pod -o yaml, we found that pod acp-etcdv2-2 has its Ready condition set to False. This is abnormal: according to the kubernetes/kubernetes source code, PodReady is set to True if ContainersReady is True and all readiness gates are ready (no pod readiness gates are used here), and in this case ContainersReady is True.
[root@controller01 ~]$ kubectl get pod acp-etcdv2-2 -n dis-apcore -o yaml
apiVersion: v1
kind: Pod
metadata:
  annotations:
    kubernetes.io/psp: privileged
    seccomp.security.alpha.kubernetes.io/pod: docker/default
  creationTimestamp: "2020-09-02T11:55:43Z"
  generateName: acp-etcdv2-
  labels:
    app: acp-etcdv2
    controller-revision-hash: acp-etcdv2-659df99d8b
    release: acp-etcdv2
    statefulset.kubernetes.io/pod-name: acp-etcdv2-2
  name: acp-etcdv2-2
  namespace: dis-apcore
  ownerReferences:
  - apiVersion: apps/v1
    blockOwnerDeletion: true
    controller: true
    kind: StatefulSet
    name: acp-etcdv2
    uid: 443dca9f-77d1-41be-9446-e5e42d0b7c9e
  resourceVersion: "130598356"
  selfLink: /api/v1/namespaces/dis-apcore/pods/acp-etcdv2-2
  uid: d46bb2cd-0cfe-4ce2-a84e-b48f340cc4ed
spec:
  affinity:
    podAntiAffinity:
      requiredDuringSchedulingIgnoredDuringExecution:
      - labelSelector:
          matchLabels:
            app: acp-etcdv2
            release: acp-etcdv2
        topologyKey: kubernetes.io/hostname
  containers:
  - command:
    - /bin/sh
    - -ec
    - |
      for i in 0 1 2; do
        h=$(PEER_PREFIX)-${i}.$(PEER_SUBDOMAIN)
        until ping -W 1 -c 1 $h > /dev/null; do
          echo "Waiting for $h to come up"
          sleep 1
        done
        echo "$h is up"
      done
      exec /usr/local/bin/etcd --cert-file=/etc/apaas/cert.d/tls.crt --key-file=/etc/apaas/cert.d/tls.key
    env:
    - name: ETCD_NAME
      valueFrom:
        fieldRef:
          apiVersion: v1
          fieldPath: metadata.name
    - name: PEER_PREFIX
      value: acp-etcdv2
    - name: PEER_SUBDOMAIN
      value: acp-etcdv2-peer
    - name: ETCD_LISTEN_PEER_URLS
      value: http://$(ETCD_NAME).$(PEER_SUBDOMAIN):2380
    - name: ETCD_DATA_DIR
      value: /var/lib/httpetcd
    - name: ETCD_LISTEN_CLIENT_URLS
      value: http://localhost:2379,http://$(ETCD_NAME):2379
    - name: ETCD_INITIAL_CLUSTER
      value: acp-etcdv2-0=http://acp-etcdv2-0.$(PEER_SUBDOMAIN):2380,acp-etcdv2-1=http://acp-etcdv2-1.$(PEER_SUBDOMAIN):2380,acp-etcdv2-2=http://acp-etcdv2-2.$(PEER_SUBDOMAIN):2380
    - name: ETCD_INITIAL_ADVERTISE_PEER_URLS
      value: http://$(ETCD_NAME).$(PEER_SUBDOMAIN):2380
    - name: ETCD_ADVERTISE_CLIENT_URLS
      value: http://$(PEER_PREFIX):2379
    - name: ETCD_INITIAL_CLUSTER_STATE
      value: new
    - name: ETCD_CLUSTER_TOKEN
      value: acp-etcdv2
    - name: ETCD_NUM_SERVERS
      value: "3"
    - name: ETCD_LOG_LEVEL
      value: INFO
    image: bcmt-registry:5000/3pp/etcdv2:v2.3.7iputils
    imagePullPolicy: IfNotPresent
    name: acp-etcdv2-server
    ports:
    - containerPort: 2379
      name: client
      protocol: TCP
    - containerPort: 2380
      name: peer
      protocol: TCP
    resources:
      limits:
        cpu: "1"
        memory: 4Gi
      requests:
        cpu: 100m
        memory: 512Mi
    terminationMessagePath: /dev/termination-log
    terminationMessagePolicy: File
    volumeMounts:
    - mountPath: /var/lib/httpetcd
      name: datadir
      subPath: data
    - mountPath: /var/run/secrets/kubernetes.io/serviceaccount
      name: acp-etcdv2-token-qwk7q
      readOnly: true
  dnsPolicy: ClusterFirst
  enableServiceLinks: true
  hostname: acp-etcdv2-2
  imagePullSecrets:
  - name: harbor-docker-registry
  nodeName: worker01
  priority: 0
  restartPolicy: Always
  schedulerName: default-scheduler
  securityContext:
    fsGroup: 1000
    runAsUser: 1000
  serviceAccount: acp-etcdv2
  serviceAccountName: acp-etcdv2
  subdomain: acp-etcdv2-peer
  terminationGracePeriodSeconds: 30
  volumes:
  - name: datadir
    persistentVolumeClaim:
      claimName: datadir-acp-etcdv2-2
  - name: acp-etcdv2-token-qwk7q
    secret:
      defaultMode: 420
      secretName: acp-etcdv2-token-qwk7q
status:
  conditions:
  - lastProbeTime: null
    lastTransitionTime: "2020-09-02T11:55:43Z"
    status: "True"
    type: Initialized
  - lastProbeTime: null
    lastTransitionTime: "2020-10-27T09:05:27Z"
    status: "False"
    type: Ready
  - lastProbeTime: null
    lastTransitionTime: "2020-09-02T11:56:10Z"
    status: "True"
    type: ContainersReady
  - lastProbeTime: null
    lastTransitionTime: "2020-09-02T11:55:43Z"
    status: "True"
    type: PodScheduled
  containerStatuses:
  - containerID: docker://13804be64b2e305de664d6b700c50a2242dabcbc121c6c70535da06f211e2819
    image: bcmt-registry:5000/3pp/etcdv2:v2.3.7iputils
    imageID: docker-pullable://bcmt-registry:5000/3pp/etcdv2@sha256:24fac7fb77d240d1265469f7824d40ae8f37220e3dcaf67bc33c72e9985c8c87
    lastState: {}
    name: acp-etcdv2-server
    ready: true
    restartCount: 0
    state:
      running:
        startedAt: "2020-09-02T11:56:10Z"
  hostIP: 40.40.40.154
  phase: Running
  podIP: 192.168.5.61
  podIPs:
  - ip: 192.168.5.61
  qosClass: Burstable
  startTime: "2020-09-02T11:55:43Z"
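To see the inconsistency without dumping the whole object, the pod-level conditions and the container-level ready flag can be pulled out directly (a sketch using jsonpath; compare it with kubectl describe pod, which, as noted below, reported the same pod as ready):
# pod-level Ready/ContainersReady conditions vs. the container's ready flag
kubectl get pod acp-etcdv2-2 -n dis-apcore -o jsonpath='Ready={.status.conditions[?(@.type=="Ready")].status} ContainersReady={.status.conditions[?(@.type=="ContainersReady")].status} container.ready={.status.containerStatuses[0].ready}{"\n"}'
# the Conditions section in describe output is the other side of the comparison
kubectl describe pod acp-etcdv2-2 -n dis-apcore | grep -A 6 '^Conditions'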
From this we can infer:
- The abnormal state of pod acp-etcdv2-2 is the direct cause of the drain failure: the PDB defines minAvailable=2 and currentHealthy is only 2, so evicting any matching pod during the drain is rejected. (Of course, k8s does not recognize that the pod being evicted is precisely the unhealthy one; that is a separate issue not covered here.)
- The abnormal state of acp-etcdv2-2 itself comes from the apiserver not fully syncing the pod's latest status (kubectl describe pod shows the pod as ready, while kubectl get pod shows it as not ready). Presumably some internal k8s state was not updated, which is most likely a k8s bug.
Next, searching the kubernetes GitHub project for related issues turned up issue 82405, whose fix was merged into v1.15.8.
Finally, what workaround resolves the problem?
- Restarting the apiserver on all three controllers: no effect.
- Restarting the kubelet service on the host where the pod runs: this worked. In practice, restarting kubelet caused the pod to be re-created, so deleting the pod to trigger a recreate should also solve the problem (see the sketch below).
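The two workaround variants, as concrete commands (a sketch; the systemctl command has to be run on worker01 itself, the node hosting the stuck pod):
# option 1: restart kubelet on the node hosting the stuck pod (run on worker01)
systemctl restart kubelet
# option 2: delete the pod and let the StatefulSet controller recreate it
kubectl delete pod acp-etcdv2-2 -n dis-apcore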