k8s默认的健康检查机制是:kubelet监控每个容器的主进程,如果主进程退出时返回码非零,则认为容器发生故障。
存活探测
监测pod是否处于运行状态,当liveness probe探测失败的时候,根据重启策略判断是否需要重启。适用于容器发生故障时需要立即重启的场景。
用指定的方式(exec,tcp,http等)监测pod中的容器是否正常运行
yaml文件如下:
[root@master yam_files]# cat live-http.yaml
apiVersion: v1
kind: Pod
metadata:
name: liveness-http
namespace: default
labels:
app: nginx
spec:
containers:
- name: liveness
image: nginx
imagePullPolicy: IfNotPresent
ports:
- containerPort: 80
livenessProbe:
httpGet:
path: /index.html
port: 80
initialDelaySeconds: 5
periodSeconds: 10
readinessProbe:
httpGet:
path: /index.html
port: 80
initialDelaySeconds: 5
periodSeconds: 10
restartPolicy: Always
新建的pod里面运行一个nginx,通过http方式探测。存活探测和就绪探测的初始延迟(initialDelaySeconds)均为5s,探测周期(periodSeconds)均为10s。
启动pod后,破坏他,删除这个探测的路径index.html
kubectl exec -it liveness-http -- /bin/bash
root@liveness-http:/# cd /usr/share/nginx/html
root@liveness-http:/usr/share/nginx/html# ls
50x.html index.html
root@liveness-http:/usr/share/nginx/html# rm index.html
探针发现错误后,kubelet按重启策略重启容器(pod的RESTARTS计数加1)
kubectl get pods -l app=nginx -w
NAME READY STATUS RESTARTS AGE
liveness-http 1/1 Running 0 5m37s
nginx-test-6cf9d87fbf-26h6m 1/1 Running 0 127m
nginx-test-6cf9d87fbf-wn94b 1/1 Running 0 7d17h
liveness-http 0/1 Running 0 5m50s
liveness-http 0/1 Running 1 (2s ago) 5m52s
liveness-http 1/1 Running 1 (10s ago) 6m
随后用tcp做一个实验,写yaml文件如下
cat live-tcp.yaml
apiVersion: v1
kind: Pod
metadata:
name: liveness-tcp
spec:
containers:
- name: liveness
image: nginx
imagePullPolicy: IfNotPresent
ports:
- containerPort: 80
livenessProbe:
tcpSocket:
port: 80
initialDelaySeconds: 2
periodSeconds: 3
随后启动pod,并且停掉其中的nginx服务
kubectl exec -it liveness-tcp -- /bin/bash
root@liveness-tcp:/# nginx -s stop
2024/06/24 05:44:54 [notice] 45#45: signal process started
发现pod重启
kubectl get pods -w
>
NAME READY STATUS RESTARTS AGE
first 1/1 Running 0 158m
liveness-http 1/1 Running 1 (25m ago) 31m
liveness-tcp 1/1 Running 0 57s
nginx-test-6cf9d87fbf-26h6m 1/1 Running 0 153m
nginx-test-6cf9d87fbf-wn94b 1/1 Running 0 7d17h
liveness-tcp 0/1 Completed 0 2m
liveness-tcp 1/1 Running 1 (1s ago) 2m1s
就绪探测
readiness probe探测容器是否可以正常接受请求,如果探测失败,k8s立即停止将新的流量转发到该容器,即把该Pod从Service的Endpoints列表中移除。
先建立一个service,再建立一个nginx的pod,通过service来转发流量
[root@master yam_files]# cat readiness-svc.yaml
apiVersion: v1
kind: Service
metadata:
name: readiness
namespace: default
spec:
selector:
app: my-pod
ports:
- port: 80
targetPort: 80
[root@master yam_files]# cat ready-http.yaml
apiVersion: v1
kind: Pod
metadata:
name: my-pod
labels:
app: my-pod
spec:
containers:
- name: nginx-container
image: nginx
imagePullPolicy: IfNotPresent
ports:
- containerPort: 80
readinessProbe:
httpGet:
path: /index.html
port: 80
initialDelaySeconds: 30
periodSeconds: 10
failureThreshold: 2
successThreshold: 1
可以看到就绪探测被设置为容器启动30s之后才开始,且每10s探测一次
一开始的service并没有连接到pod(endpoints为空)
kubectl describe svc readiness
>
Name: readiness
Namespace: default
Labels: <none>
Annotations: <none>
Selector: app=my-pod
Type: ClusterIP
IP Family Policy: SingleStack
IP Families: IPv4
IP: 10.107.242.111
IPs: 10.107.242.111
Port: <unset> 80/TCP
TargetPort: 80/TCP
Endpoints:
Session Affinity: None
Events: <none>
随后就绪探测成功,service也成功连上
kubectl get pods -w
>
NAME READY STATUS RESTARTS AGE
nginx-test-6cf9d87fbf-26h6m 1/1 Running 0 3h28m
nginx-test-6cf9d87fbf-wn94b 1/1 Running 0 7d18h
my-pod 0/1 Pending 0 0s
my-pod 0/1 Pending 0 0s
my-pod 0/1 ContainerCreating 0 0s
my-pod 0/1 ContainerCreating 0 2s
my-pod 0/1 Running 0 3s
my-pod 1/1 Running 0 40s
kubectl describe svc readiness
>
Name: readiness
Namespace: default
Labels: <none>
Annotations: <none>
Selector: app=my-pod
Type: ClusterIP
IP Family Policy: SingleStack
IP Families: IPv4
IP: 10.107.242.111
IPs: 10.107.242.111
Port: <unset> 80/TCP
TargetPort: 80/TCP
Endpoints: 10.244.166.145:80
Session Affinity: None
Events: <none>
查看my-pod的ip,10.244.166.145,和上面svc的endpoint相同。在生产环境中,一般会部署副本,那么所有pod的ip都会在endpoint中,而这些ip也会写在防火墙中
kubectl get pods -owide
NAME READY STATUS RESTARTS AGE IP NODE NOMINATED NODE READINESS GATES
my-pod 1/1 Running 0 14m 10.244.166.145 node1 <none> <none>
启动探测
startup probe用于监测容器内的应用是否已成功启动。它只在容器启动阶段周期性执行,探测成功后便不再执行;在启动探测成功之前,存活探测和就绪探测不会生效。
设置initialDelaySeconds:启动多久之后开始探测,避免容器初始化没完成就开始探测
periodSeconds:探测周期。默认10s
timeoutSeconds:默认1s
如果没有设置容器启动探测,则默认状态为成功success
apiVersion: v1
kind: Pod
metadata:
name: startupprobe
spec:
containers:
- name: startup
image: xianchao/tomcat-8.5-jre8:v1
imagePullPolicy: IfNotPresent
ports:
- containerPort: 8080
startupProbe:
exec:
command:
- "/bin/bash"
- "-c"
- " ps aux | grep tomcat"
initialDelaySeconds: 20 #容器启动后多久开始探测
periodSeconds: 20 #执行探测的时间间隔
timeoutSeconds: 10 #探针执行检测请求后,等待响应的超时时间
successThreshold: 1 #成功多少次才算成功
failureThreshold: 3 #失败多少次才算失败
用exec做探针,执行后发现40s之后运行成功,探测成功的时间=initialDelaySeconds + periodSeconds,第一次在initialDelaySeconds(20s)时的探测没有成功
kubectl get pods -w
>
NAME READY STATUS RESTARTS AGE
nginx-test-6cf9d87fbf-26h6m 1/1 Running 0 11h
nginx-test-6cf9d87fbf-wn94b 1/1 Running 0 8d
startupprobe 0/1 Running 0 23s
startupprobe 0/1 Running 0 40s
startupprobe 1/1 Running 0 40s
修改yaml文件,使得探测失败,可以看到开始重启
将探测命令改为 "aa ps aux | grep tomcat"(aa不是有效命令,探测必然失败)
第一次重启的时间 ≈ initialDelaySeconds + periodSeconds × failureThreshold = 20 + 20×3 = 80s,与下面的观察一致(探测命令立即失败退出,因此timeoutSeconds不会计入)
kubectl get pods -w
>
NAME READY STATUS RESTARTS AGE
startupprobe 0/1 ContainerCreating 0 0s
startupprobe 0/1 ContainerCreating 0 1s
startupprobe 0/1 Running 0 2s
startupprobe 0/1 Running 1 (0s ago) 80s
startupprobe 0/1 Running 2 (1s ago) 2m21s
利用tcpSocket进行探测
apiVersion: v1
kind: Pod
metadata:
name: startupprobe
spec:
containers:
- name: startup
image: xianchao/tomcat-8.5-jre8:v1
imagePullPolicy: IfNotPresent
ports:
- containerPort: 8080
startupProbe:
tcpSocket:
port: 8080
initialDelaySeconds: 20 #容器启动后多久开始探测
periodSeconds: 20 #执行探测的时间间隔
timeoutSeconds: 10 #探针执行检测请求后,等待响应的超时时间
successThreshold: 1 #成功多少次才算成功
failureThreshold: 3 #失败多少次才算失败
全生命周期健康监测
目前livenessProbe、readinessProbe和startupProbe都支持以下三种探针:
exec:如果执行成功,退出码为0则探测成功。
TCPSocketAction:通过容器的ip地址和端口号执行TCP检查,如果能够建立TCP连接,则表明容器健康。
HTTPGetAction:通过容器的ip地址、端口号以及调用HTTP Get方法,如果响应的状态码大于等于200且小于400,则认为容器健康。
apiVersion: v1
kind: Pod
metadata:
name: life-demo
spec:
containers:
- name: lifecycle-demo-container
image: docker.io/xianchao/nginx:v1
imagePullPolicy: IfNotPresent
lifecycle:
postStart:
exec:
command: ["/bin/bash","-c","echo 'lifecycle hookshandler' > /usr/share/nginx/html/test.html"]
preStop:
exec:
command:
- "/bin/sh"
- "-c"
- "nginx -s stop"
postStart和preStop都是容器生命周期管理的钩子,PostStart
钩子在容器启动后立即调用。这是在容器的主进程启动之后,且在容器被视为就绪之前发生的。如果 PostStart
钩子执行失败,容器将被视为启动失败,Kubernetes 会根据容器的重启策略处理这个失败的容器。PreStop
钩子是为了在容器终止前执行清理任务(如关闭连接、释放资源等)而设计的。
apiVersion: v1
kind: Pod
metadata:
name: check
namespace: default
labels:
app: check
spec:
containers:
- name: check
image: busybox:1.28
imagePullPolicy: IfNotPresent
command:
- /bin/sh
- -c
- sleep 10;exit
启动这个pod后,10s后容器就会退出,但是会一直重启
kubectl get pods -w
>
NAME READY STATUS RESTARTS AGE
nginx-test-6cf9d87fbf-26h6m 1/1 Running 0 11h
nginx-test-6cf9d87fbf-wn94b 1/1 Running 0 8d
check 0/1 Pending 0 0s
check 0/1 Pending 0 0s
check 0/1 ContainerCreating 0 0s
check 0/1 ContainerCreating 0 1s
check 1/1 Running 0 2s
check 0/1 Completed 0 12s
check 1/1 Running 1 (2s ago) 13s
check 0/1 Completed 1 (12s ago) 23s
check 0/1 CrashLoopBackOff 1 (14s ago) 36s
check 1/1 Running 2 (14s ago) 36s
check 0/1 Completed 2 (24s ago) 46s
check 0/1 CrashLoopBackOff 2 (12s ago) 58s
check 1/1 Running 3 (24s ago) 70s
check 0/1 Completed 3 (34s ago) 80s
check 0/1 CrashLoopBackOff 3 (16s ago) 95s
check 1/1 Running 4 (43s ago) 2m2s
一个包含三种探测的pod文件
apiVersion: v1
kind: Service
metadata:
name: springboot-live
labels:
app: springboot
spec:
type: NodePort
ports:
- name: server
port: 8080
targetPort: 8080
nodePort: 31180
- name: management
port: 8081
targetPort: 8081
nodePort: 31181
selector:
app: springboot
---
apiVersion: v1
kind: Pod
metadata:
name: springboot-live
labels:
app: springboot
spec:
containers:
- name: springboot
image: mydlqclub/springboot-helloworld:0.0.1
imagePullPolicy: IfNotPresent
ports:
- name: server
containerPort: 8080
- name: management
containerPort: 8081
readinessProbe:
initialDelaySeconds: 20
periodSeconds: 5
timeoutSeconds: 10
httpGet:
scheme: HTTP
port: 8081
path: /actuator/health
livenessProbe:
initialDelaySeconds: 20
periodSeconds: 5
timeoutSeconds: 10
httpGet:
scheme: HTTP
port: 8081
path: /actuator/health
startupProbe:
initialDelaySeconds: 20
periodSeconds: 5
timeoutSeconds: 10
httpGet:
scheme: HTTP
port: 8081
path: /actuator/health
启动后发现svc的服务是正常的
kubectl describe svc springboot-live
Name: springboot-live
Namespace: default
Labels: app=springboot
Annotations: <none>
Selector: app=springboot
Type: NodePort
IP Family Policy: SingleStack
IP Families: IPv4
IP: 10.98.119.36
IPs: 10.98.119.36
Port: server 8080/TCP
TargetPort: 8080/TCP
NodePort: server 31180/TCP
Endpoints: 10.244.166.156:8080
Port: management 8081/TCP
TargetPort: 8081/TCP
NodePort: management 31181/TCP
Endpoints: 10.244.166.156:8081
Session Affinity: None
External Traffic Policy: Cluster
Events: <none>
随后进入pod,删掉运行的程序
kubectl exec -it springboot-live -- /bin/sh
/ # ls
app.jar dev home media opt root sbin sys usr
bin etc lib mnt proc run srv tmp var
/ # ps -ef | grep springboot
63 root 0:00 grep springboot
/ # ps -ef | grep hello
65 root 0:00 grep hello
/ # kill 1
/ # command terminated with exit code 137
查看pod状态,发现20s后重启
kubectl get pods -w
NAME READY STATUS RESTARTS AGE
springboot-live 1/1 Running 0 2m36s
springboot-live 0/1 Error 0 17m
springboot-live 0/1 Running 1 (1s ago) 17m
springboot-live 0/1 Running 1 (22s ago) 17m
springboot-live 1/1 Running 1 (22s ago) 17m