1. Environment
- Kubernetes source version: remotes/origin/release-1.25
- Compiled kubelet version: Kubernetes v1.24.0-beta.0.2463+ee7799bab469d7
- Kubernetes cluster: a single-node cluster built from the v1.25.4 binaries; for the single-node setup, see: Kubernetes v1.25 搭建单节点集群用于Debug K8S源码
- Golang version: go1.19.3 linux/amd64
- IDEA version: 2022.2.3
- Delve version: 1.9.1
[root@k8s-master1 kubernetes]#
[root@k8s-master1 kubernetes]# dlv version
Delve Debugger
Version: 1.9.1
Build: $Id: d81b9fd12bfa603f3cf7a4bc842398bd61c42940 $
[root@k8s-master1 kubernetes]#
[root@k8s-master1 kubernetes]# go version
go version go1.19.3 linux/amd64
[root@k8s-master1 kubernetes]#
[root@k8s-master1 kubernetes]# kubectl version
WARNING: This version information is deprecated and will be replaced with the output from kubectl version --short. Use --output=yaml|json to get the full version.
Client Version: version.Info{Major:"1", Minor:"25", GitVersion:"v1.25.4", GitCommit:"872a965c6c6526caa949f0c6ac028ef7aff3fb78", GitTreeState:"clean", BuildDate:"2022-11-09T13:36:36Z", GoVersion:"go1.19.3", Compiler:"gc", Platform:"linux/amd64"}
Kustomize Version: v4.5.7
Server Version: version.Info{Major:"1", Minor:"25", GitVersion:"v1.25.4", GitCommit:"872a965c6c6526caa949f0c6ac028ef7aff3fb78", GitTreeState:"clean", BuildDate:"2022-11-09T13:29:58Z", GoVersion:"go1.19.3", Compiler:"gc", Platform:"linux/amd64"}
[root@k8s-master1 kubernetes]#
[root@k8s-master1 kubernetes]#
[root@k8s-master1 kubernetes]# kubectl get nodes -owide
NAME          STATUS   ROLES    AGE   VERSION   INTERNAL-IP     EXTERNAL-IP   OS-IMAGE                KERNEL-VERSION                CONTAINER-RUNTIME
k8s-master1   Ready    <none>   31h   v1.25.4   192.168.11.71   <none>        CentOS Linux 7 (Core)   3.10.0-1160.80.1.el7.x86_64   containerd://1.6.10
[root@k8s-master1 kubernetes]#
[root@k8s-master1 kubernetes]#
[root@k8s-master1 kubernetes]# kubectl get componentstatus
Warning: v1 ComponentStatus is deprecated in v1.19+
NAME                 STATUS    MESSAGE                         ERROR
etcd-0               Healthy   {"health":"true","reason":""}
controller-manager   Healthy   ok
scheduler            Healthy   ok
[root@k8s-master1 kubernetes]#
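With the toolchain confirmed, attaching Delve to the running kubelet for source-level debugging typically looks like the following. This is a sketch, not a captured transcript: the listen port 2345 is an arbitrary choice, and the IDE then connects to it as a remote Go debug target.

dlv attach $(pidof kubelet) --headless --listen=:2345 --api-version=2 --accept-multiclient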
The kubelet startup flags are configured as follows:
[root@k8s-master1 kubernetes]# ps -ef|grep "/usr/local/bin/kubelet"
root 7972 1 6 07:06 ? 00:00:06 /usr/local/bin/kubelet --bootstrap-kubeconfig=/etc/kubernetes/bootstrap-kubelet.kubeconfig --kubeconfig=/etc/kubernetes/kubelet.kubeconfig --config=/etc/kubernetes/kubelet-conf.yml --container-runtime-endpoint=unix:///run/containerd/containerd.sock --node-labels=node.kubernetes.io/node= --v=8
root 9549 6424 0 07:07 pts/0 00:00:00 grep --color=auto /usr/local/bin/kubelet
[root@k8s-master1 kubernetes]#
The kubelet configuration file (/etc/kubernetes/kubelet-conf.yml) is as follows:
apiVersion: kubelet.config.k8s.io/v1beta1
kind: KubeletConfiguration
address: 0.0.0.0
port: 10250
readOnlyPort: 10255
authentication:
  anonymous:
    enabled: false
  webhook:
    cacheTTL: 2m0s
    enabled: true
  x509:
    clientCAFile: /etc/kubernetes/pki/ca.pem
authorization:
  mode: Webhook
  webhook:
    cacheAuthorizedTTL: 5m0s
    cacheUnauthorizedTTL: 30s
cgroupDriver: systemd
cgroupsPerQOS: true
clusterDNS:
  - 10.96.0.10
clusterDomain: cluster.local
containerLogMaxFiles: 5
containerLogMaxSize: 10Mi
contentType: application/vnd.kubernetes.protobuf
cpuCFSQuota: true
cpuManagerPolicy: none
cpuManagerReconcilePeriod: 10s
enableControllerAttachDetach: true
enableDebuggingHandlers: true
enforceNodeAllocatable:
  - pods
eventBurst: 10
eventRecordQPS: 5
evictionHard:
  imagefs.available: 15%
  memory.available: 100Mi
  nodefs.available: 10%
  nodefs.inodesFree: 5%
evictionPressureTransitionPeriod: 5m0s
failSwapOn: true
fileCheckFrequency: 20s
hairpinMode: promiscuous-bridge
healthzBindAddress: 127.0.0.1
healthzPort: 10248
httpCheckFrequency: 20s
imageGCHighThresholdPercent: 85
imageGCLowThresholdPercent: 80
imageMinimumGCAge: 2m0s
iptablesDropBit: 15
iptablesMasqueradeBit: 14
kubeAPIBurst: 10
kubeAPIQPS: 5
makeIPTablesUtilChains: true
maxOpenFiles: 1000000
maxPods: 110
nodeStatusUpdateFrequency: 10s
oomScoreAdj: -999
podPidsLimit: -1
registryBurst: 10
registryPullQPS: 5
resolvConf: /etc/resolv.conf
rotateCertificates: true
runtimeRequestTimeout: 2m0s
serializeImagePulls: true
staticPodPath: /etc/kubernetes/manifests
streamingConnectionIdleTimeout: 4h0m0s
syncFrequency: 1m0s
volumeStatsAggPeriod: 1m0s
2. Component Overview
ProberManager manages the probes of Pods. Anyone familiar with Kubernetes will know these well: there are currently three probe types, ReadinessProbe, StartupProbe, and LivenessProbe. ProberManager creates one worker for each probe of each container; every worker periodically probes its assigned container and caches the result. The manager then uses the cached probe results to set the appropriate state on the PodStatus.
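As a concrete reference, here is a minimal sketch of a container spec that carries all three probe types. The image, paths, port, and thresholds are made-up values; for such a container, ProberManager would create three workers.

package main

import (
	"fmt"

	v1 "k8s.io/api/core/v1"
	"k8s.io/apimachinery/pkg/util/intstr"
)

func main() {
	// Hypothetical container with all three probe types configured.
	c := v1.Container{
		Name:  "app",
		Image: "example/app:v1",
		// StartupProbe gates the other probes until the app has come up.
		StartupProbe: &v1.Probe{
			ProbeHandler:     v1.ProbeHandler{HTTPGet: &v1.HTTPGetAction{Path: "/healthz", Port: intstr.FromInt(8080)}},
			PeriodSeconds:    2,
			FailureThreshold: 30, // allow up to 2s * 30 = 60s for startup
		},
		// ReadinessProbe controls whether the pod receives traffic.
		ReadinessProbe: &v1.Probe{
			ProbeHandler:  v1.ProbeHandler{HTTPGet: &v1.HTTPGetAction{Path: "/ready", Port: intstr.FromInt(8080)}},
			PeriodSeconds: 10,
		},
		// LivenessProbe: a failure here restarts the container.
		LivenessProbe: &v1.Probe{
			ProbeHandler:     v1.ProbeHandler{HTTPGet: &v1.HTTPGetAction{Path: "/healthz", Port: intstr.FromInt(8080)}},
			PeriodSeconds:    10,
			FailureThreshold: 3,
		},
	}
	fmt.Printf("container %q defines 3 probes -> 3 prober workers\n", c.Name)
}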
3. Source Code Analysis
3.1. Worker
Clearly, as the component overview suggests, the most important piece of ProberManager is the Worker, since all the dirty work is done by workers; ProberManager merely presents the fruits of their labor as its own. So let's first visit the hard-working Worker at the bottom of the hierarchy and see how it spends its day.
3.1.1. worker
Since a worker is created at container granularity, it naturally records the Pod and Container it belongs to. The probe configuration comes from spec, and probe results are saved through resultsManager.
type worker struct {
	// Channel for stopping the probe.
	stopCh chan struct{}

	// Channel for triggering the probe manually.
	manualTriggerCh chan struct{}

	// The pod containing this probe (read-only)
	pod *v1.Pod

	// The container to probe (read-only)
	container v1.Container

	// Describes the probe configuration (read-only)
	spec *v1.Probe

	// The type of the worker.
	probeType probeType

	// The probe value during the initial delay.
	initialValue results.Result

	// Where to store this workers results.
	resultsManager results.Manager
	probeManager   *manager

	// The last known container ID for this worker.
	containerID kubecontainer.ContainerID

	// The last probe result for this worker.
	lastResult results.Result

	// How many times in a row the probe has returned the same result.
	resultRun int

	// If set, skip probing.
	onHold bool

	// Labels for the ProberResults metric, one set per result kind.
	proberResultsSuccessfulMetricLabels metrics.Labels
	proberResultsFailedMetricLabels     metrics.Labels
	proberResultsUnknownMetricLabels    metrics.Labels

	// Labels for the ProberDuration metric.
	proberDurationSuccessfulMetricLabels metrics.Labels
	proberDurationUnknownMetricLabels    metrics.Labels
}
3.1.2. newWorker
The newWorker constructor confirms that workers are created at Container granularity, with one worker per Probe. Also note the probe-type-specific initial values: readiness starts out as Failure (a container is not ready until proven ready), liveness as Success, and startup as Unknown.
func newWorker(
	m *manager,
	probeType probeType,
	pod *v1.Pod,
	container v1.Container) *worker {

	w := &worker{
		stopCh:          make(chan struct{}, 1), // Buffer so stop() can be non-blocking.
		manualTriggerCh: make(chan struct{}, 1), // Buffer so prober_manager can do non-blocking calls to doProbe.
		pod:             pod,
		container:       container,
		probeType:       probeType,
		probeManager:    m,
	}

	switch probeType {
	case readiness:
		w.spec = container.ReadinessProbe
		w.resultsManager = m.readinessManager
		w.initialValue = results.Failure
	case liveness:
		w.spec = container.LivenessProbe
		w.resultsManager = m.livenessManager
		w.initialValue = results.Success
	case startup:
		w.spec = container.StartupProbe
		w.resultsManager = m.startupManager
		w.initialValue = results.Unknown
	}

	podName := getPodLabelName(w.pod)

	basicMetricLabels := metrics.Labels{
		"probe_type": w.probeType.String(),
		"container":  w.container.Name,
		"pod":        podName,
		"namespace":  w.pod.Namespace,
		"pod_uid":    string(w.pod.UID),
	}

	proberDurationLabels := metrics.Labels{
		"probe_type": w.probeType.String(),
		"container":  w.container.Name,
		"pod":        podName,
		"namespace":  w.pod.Namespace,
	}

	w.proberResultsSuccessfulMetricLabels = deepCopyPrometheusLabels(basicMetricLabels)
	w.proberResultsSuccessfulMetricLabels["result"] = probeResultSuccessful

	w.proberResultsFailedMetricLabels = deepCopyPrometheusLabels(basicMetricLabels)
	w.proberResultsFailedMetricLabels["result"] = probeResultFailed

	w.proberResultsUnknownMetricLabels = deepCopyPrometheusLabels(basicMetricLabels)
	w.proberResultsUnknownMetricLabels["result"] = probeResultUnknown

	w.proberDurationSuccessfulMetricLabels = deepCopyPrometheusLabels(proberDurationLabels)
	w.proberDurationUnknownMetricLabels = deepCopyPrometheusLabels(proberDurationLabels)

	return w
}
3.1.3. run
A worker starts running via its run method. On entry it creates a ticker, and every tick triggers the worker's doProbe method; doProbe is obviously where the real probing happens. Before the loop starts, the worker first sleeps for a random fraction of the probe period, so that when the kubelet (re)starts, the probes of many containers don't all fire in lockstep.
Note that besides the ticker, doProbe can also be triggered manually through manualTriggerCh.
func (w *worker) run() {
	probeTickerPeriod := time.Duration(w.spec.PeriodSeconds) * time.Second

	// If kubelet restarted the probes could be started in rapid succession.
	// Let the worker wait for a random portion of tickerPeriod before probing.
	if probeTickerPeriod > time.Since(w.probeManager.start) {
		time.Sleep(time.Duration(rand.Float64() * float64(probeTickerPeriod)))
	}

	probeTicker := time.NewTicker(probeTickerPeriod)

	defer func() {
		// Clean up.
		probeTicker.Stop()
		if !w.containerID.IsEmpty() {
			w.resultsManager.Remove(w.containerID)
		}

		w.probeManager.removeWorker(w.pod.UID, w.container.Name, w.probeType)
		ProberResults.Delete(w.proberResultsSuccessfulMetricLabels)
		ProberResults.Delete(w.proberResultsFailedMetricLabels)
		ProberResults.Delete(w.proberResultsUnknownMetricLabels)
		ProberDuration.Delete(w.proberDurationSuccessfulMetricLabels)
		ProberDuration.Delete(w.proberDurationUnknownMetricLabels)
	}()

probeLoop:
	for w.doProbe() {
		// Wait for next probe tick.
		select {
		case <-w.stopCh:
			break probeLoop
		case <-probeTicker.C:
		case <-w.manualTriggerCh:
			// continue
		}
	}
}
3.1.4. doProbe
Let's look at how a worker actually probes its container. As the code below shows, doProbe proceeds roughly as follows:
- 1. Fetch the Pod's status; if that fails (the pod is not created yet, or already deleted), skip this round.
- 2. If the Pod has already terminated (PodFailed or PodSucceeded), there is no point in running its probes anymore, so the worker exits.
- 3. Look up the current container's status in the Pod status; if it cannot be found, skip this round and wait for more information.
- 4. If the container has been restarted (its container ID changed), remove the old container's result from the resultsManager, store the initial value for the new container, and resume probing.
- 5. If the worker is on hold, skip probing until a new container appears.
- 6. If the container is not running, record Failure for it; the worker keeps going only if the container may still be restarted.
- 7. If the Pod is being deleted (DeletionTimestamp is set) and the probe is a LivenessProbe or StartupProbe, record Success to ensure a quiet shutdown, then stop probing.
- 8. If the container has not yet been running for InitialDelaySeconds, skip this round.
- 9. Once the container is marked as started, startup probes are skipped; until then, all other probes are skipped.
- 10. Run the probe and record the result in the resultsManager, but only after it has repeated FailureThreshold (or SuccessThreshold) times in a row; after a liveness/startup failure the worker goes on hold until the container is restarted.
func (w *worker) doProbe() (keepGoing bool) {
	defer func() { recover() }() // Actually eat panics (HandleCrash takes care of logging)
	defer runtime.HandleCrash(func(_ interface{}) { keepGoing = true })

	startTime := time.Now()
	status, ok := w.probeManager.statusManager.GetPodStatus(w.pod.UID)
	if !ok {
		// Either the pod has not been created yet, or it was already deleted.
		klog.V(3).InfoS("No status for pod", "pod", klog.KObj(w.pod))
		return true
	}

	// Worker should terminate if pod is terminated.
	if status.Phase == v1.PodFailed || status.Phase == v1.PodSucceeded {
		klog.V(3).InfoS("Pod is terminated, exiting probe worker",
			"pod", klog.KObj(w.pod), "phase", status.Phase)
		return false
	}

	c, ok := podutil.GetContainerStatus(status.ContainerStatuses, w.container.Name)
	if !ok || len(c.ContainerID) == 0 {
		// Either the container has not been created yet, or it was deleted.
		klog.V(3).InfoS("Probe target container not found",
			"pod", klog.KObj(w.pod), "containerName", w.container.Name)
		return true // Wait for more information.
	}

	if w.containerID.String() != c.ContainerID {
		if !w.containerID.IsEmpty() {
			w.resultsManager.Remove(w.containerID)
		}
		w.containerID = kubecontainer.ParseContainerID(c.ContainerID)
		w.resultsManager.Set(w.containerID, w.initialValue, w.pod)
		// We've got a new container; resume probing.
		w.onHold = false
	}

	if w.onHold {
		// Worker is on hold until there is a new container.
		return true
	}

	if c.State.Running == nil {
		klog.V(3).InfoS("Non-running container probed",
			"pod", klog.KObj(w.pod), "containerName", w.container.Name)
		if !w.containerID.IsEmpty() {
			w.resultsManager.Set(w.containerID, results.Failure, w.pod)
		}
		// Abort if the container will not be restarted.
		return c.State.Terminated == nil ||
			w.pod.Spec.RestartPolicy != v1.RestartPolicyNever
	}

	// Graceful shutdown of the pod.
	if w.pod.ObjectMeta.DeletionTimestamp != nil && (w.probeType == liveness || w.probeType == startup) {
		klog.V(3).InfoS("Pod deletion requested, setting probe result to success",
			"probeType", w.probeType, "pod", klog.KObj(w.pod), "containerName", w.container.Name)
		if w.probeType == startup {
			klog.InfoS("Pod deletion requested before container has fully started",
				"pod", klog.KObj(w.pod), "containerName", w.container.Name)
		}

		// Set a last result to ensure quiet shutdown.
		w.resultsManager.Set(w.containerID, results.Success, w.pod)

		// Stop probing at this point.
		return false
	}

	// Probe disabled for InitialDelaySeconds.
	if int32(time.Since(c.State.Running.StartedAt.Time).Seconds()) < w.spec.InitialDelaySeconds {
		return true
	}

	if c.Started != nil && *c.Started {
		// Stop probing for startup once container has started.
		if w.probeType == startup {
			return true
		}
	} else {
		// Disable other probes until container has started.
		if w.probeType != startup {
			return true
		}
	}

	result, err := w.probeManager.prober.probe(w.probeType, w.pod, status, w.container, w.containerID)
	if err != nil {
		// Prober error, throw away the result.
		return true
	}

	switch result {
	case results.Success:
		ProberResults.With(w.proberResultsSuccessfulMetricLabels).Inc()
		ProberDuration.With(w.proberDurationSuccessfulMetricLabels).Observe(time.Since(startTime).Seconds())
	case results.Failure:
		ProberResults.With(w.proberResultsFailedMetricLabels).Inc()
	default:
		ProberResults.With(w.proberResultsUnknownMetricLabels).Inc()
		ProberDuration.With(w.proberDurationUnknownMetricLabels).Observe(time.Since(startTime).Seconds())
	}

	if w.lastResult == result {
		w.resultRun++
	} else {
		w.lastResult = result
		w.resultRun = 1
	}

	if (result == results.Failure && w.resultRun < int(w.spec.FailureThreshold)) ||
		(result == results.Success && w.resultRun < int(w.spec.SuccessThreshold)) {
		// Success or failure is below threshold - leave the probe state unchanged.
		return true
	}

	w.resultsManager.Set(w.containerID, result, w.pod)

	if (w.probeType == liveness || w.probeType == startup) && result == results.Failure {
		// The container fails a liveness/startup check, it will need to be restarted.
		// Stop probing until we see a new container ID.
		w.onHold = true
		w.resultRun = 0
	}

	return true
}
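The threshold logic at the end of doProbe deserves a closer look: a new result is only committed to the resultsManager after it has repeated FailureThreshold (or SuccessThreshold) times in a row. Here is a minimal standalone sketch of just that counting logic, using made-up outcomes and the API defaults FailureThreshold=3 and SuccessThreshold=1:

package main

import "fmt"

func main() {
	const failureThreshold, successThreshold = 3, 1 // e.g. the API defaults

	var lastResult string
	resultRun := 0

	// A made-up sequence of raw probe outcomes.
	for _, result := range []string{"Failure", "Failure", "Success", "Failure", "Failure", "Failure"} {
		if result == lastResult {
			resultRun++
		} else {
			lastResult, resultRun = result, 1
		}

		// Mirror of the check in doProbe: below threshold -> leave the state unchanged.
		if (result == "Failure" && resultRun < failureThreshold) ||
			(result == "Success" && resultRun < successThreshold) {
			continue
		}
		fmt.Printf("commit %s (after %d consecutive run(s))\n", result, resultRun)
	}
	// Output:
	// commit Success (after 1 consecutive run(s))
	// commit Failure (after 3 consecutive run(s))
}

With PeriodSeconds: 10 and FailureThreshold: 3, this means a container must fail three consecutive probes (roughly 30 seconds) before the Failure ever reaches the resultsManager.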
3.2. ResultManager
The results that workers produce are saved through the ResultManager.
As you can see, its implementation is very simple: it is essentially a cache keyed by container ID with the probe result as the value, and the rest is plain CRUD. On top of that, Set publishes an Update on a channel whenever a container's cached result actually changes.
// Manager provides a probe results cache and channel of updates.
type Manager interface {
	// Get returns the cached result for the container with the given ID.
	Get(kubecontainer.ContainerID) (Result, bool)
	// Set sets the cached result for the container with the given ID.
	// The pod is only included to be sent with the update.
	Set(kubecontainer.ContainerID, Result, *v1.Pod)
	// Remove clears the cached result for the container with the given ID.
	Remove(kubecontainer.ContainerID)
	// Updates creates a channel that receives an Update whenever its result changes.
	Updates() <-chan Update
}

// Result is the type for probe results.
type Result int

const (
	// Unknown is encoded as -1 (type Result)
	Unknown Result = iota - 1

	// Success is encoded as 0 (type Result)
	Success

	// Failure is encoded as 1 (type Result)
	Failure
)

func (r Result) String() string {
	switch r {
	case Success:
		return "Success"
	case Failure:
		return "Failure"
	default:
		return "UNKNOWN"
	}
}

// ToPrometheusType translates a Result to a form which is better understood by prometheus.
func (r Result) ToPrometheusType() float64 {
	switch r {
	case Success:
		return 0
	case Failure:
		return 1
	default:
		return -1
	}
}

// Update is an enum of the types of updates sent over the Updates channel.
type Update struct {
	ContainerID kubecontainer.ContainerID
	Result      Result
	PodUID      types.UID
}

// Manager implementation.
type manager struct {
	// guards the cache
	sync.RWMutex
	// map of container ID -> probe Result
	cache map[kubecontainer.ContainerID]Result
	// channel of updates
	updates chan Update
}

var _ Manager = &manager{}

// NewManager creates and returns an empty results manager.
func NewManager() Manager {
	return &manager{
		cache:   make(map[kubecontainer.ContainerID]Result),
		updates: make(chan Update, 20),
	}
}

func (m *manager) Get(id kubecontainer.ContainerID) (Result, bool) {
	m.RLock()
	defer m.RUnlock()
	result, found := m.cache[id]
	return result, found
}

func (m *manager) Set(id kubecontainer.ContainerID, result Result, pod *v1.Pod) {
	if m.setInternal(id, result) {
		m.updates <- Update{id, result, pod.UID}
	}
}

// Internal helper for locked portion of set. Returns whether an update should be sent.
func (m *manager) setInternal(id kubecontainer.ContainerID, result Result) bool {
	m.Lock()
	defer m.Unlock()
	prev, exists := m.cache[id]
	if !exists || prev != result {
		m.cache[id] = result
		return true
	}
	return false
}

func (m *manager) Remove(id kubecontainer.ContainerID) {
	m.Lock()
	defer m.Unlock()
	delete(m.cache, id)
}

func (m *manager) Updates() <-chan Update {
	return m.updates
}
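A minimal usage sketch of this manager (the container ID is made up), showing that Set only publishes on the Updates channel when the cached result actually changes:

package main

import (
	"fmt"

	v1 "k8s.io/api/core/v1"
	kubecontainer "k8s.io/kubernetes/pkg/kubelet/container"
	"k8s.io/kubernetes/pkg/kubelet/prober/results"
)

func main() {
	m := results.NewManager()
	id := kubecontainer.ContainerID{Type: "containerd", ID: "abc123"} // hypothetical ID
	pod := &v1.Pod{}

	m.Set(id, results.Success, pod) // new entry  -> one Update queued
	m.Set(id, results.Success, pod) // unchanged  -> no Update
	m.Set(id, results.Failure, pod) // changed    -> second Update queued

	for i := 0; i < 2; i++ {
		u := <-m.Updates()
		fmt.Println(u.ContainerID.ID, u.Result) // abc123 Success, then abc123 Failure
	}
}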
3.3. Manager
Finally, the Manager interface itself. This is the surface the rest of the kubelet programs against; all the machinery above is driven through these five methods.
type Manager interface {
	// AddPod creates new probe workers for every container probe.
	// This should be called for every pod created.
	AddPod(pod *v1.Pod)

	// StopLivenessAndStartup handles stopping liveness and startup probes during termination.
	StopLivenessAndStartup(pod *v1.Pod)

	// RemovePod handles cleaning up the removed pod state, including terminating probe workers
	// and deleting cached results.
	RemovePod(pod *v1.Pod)

	// CleanupPods handles cleaning up pods which should no longer be running.
	// It takes a map of "desired pods" which should not be cleaned up.
	CleanupPods(desiredPods map[types.UID]sets.Empty)

	// UpdatePodStatus modifies the given PodStatus with the appropriate Ready state for each
	// container based on container running status, cached probe results and worker states.
	UpdatePodStatus(types.UID, *v1.PodStatus)
}
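Before diving into each method, here is a hedged sketch of the order in which the kubelet drives this interface for a single pod. The helper function is hypothetical and exists only for illustration; the real call sites are spread across the kubelet's pod workers and status code paths.

package sketch

import (
	v1 "k8s.io/api/core/v1"
	"k8s.io/kubernetes/pkg/kubelet/prober"
)

// podProbeLifecycle sketches one pod's journey through the prober Manager.
func podProbeLifecycle(pm prober.Manager, pod *v1.Pod, status *v1.PodStatus) {
	pm.AddPod(pod)                      // pod admitted: one worker per configured probe is spawned
	pm.UpdatePodStatus(pod.UID, status) // every status sync: Started/Ready filled from cached results
	pm.StopLivenessAndStartup(pod)      // pod terminating: liveness/startup workers stop first
	pm.RemovePod(pod)                   // pod removed: all remaining workers stop
}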
3.3.1. AddPod
Let's see how ProberManager handles a newly added Pod.
The logic is straightforward: iterate over all of the Pod's Containers, create one worker for each Probe of each Container, and start each worker so it periodically probes the container.
func (m *manager) AddPod(pod *v1.Pod) {
	m.workerLock.Lock()
	defer m.workerLock.Unlock()

	key := probeKey{podUID: pod.UID}
	for _, c := range pod.Spec.Containers {
		key.containerName = c.Name

		if c.StartupProbe != nil {
			key.probeType = startup
			if _, ok := m.workers[key]; ok {
				klog.V(8).ErrorS(nil, "Startup probe already exists for container",
					"pod", klog.KObj(pod), "containerName", c.Name)
				return
			}
			w := newWorker(m, startup, pod, c)
			m.workers[key] = w
			go w.run()
		}

		if c.ReadinessProbe != nil {
			key.probeType = readiness
			if _, ok := m.workers[key]; ok {
				klog.V(8).ErrorS(nil, "Readiness probe already exists for container",
					"pod", klog.KObj(pod), "containerName", c.Name)
				return
			}
			w := newWorker(m, readiness, pod, c)
			m.workers[key] = w
			go w.run()
		}

		if c.LivenessProbe != nil {
			key.probeType = liveness
			if _, ok := m.workers[key]; ok {
				klog.V(8).ErrorS(nil, "Liveness probe already exists for container",
					"pod", klog.KObj(pod), "containerName", c.Name)
				return
			}
			w := newWorker(m, liveness, pod, c)
			m.workers[key] = w
			go w.run()
		}
	}
}
3.3.2. StopLivenessAndStartup
Stops the LivenessProbe and StartupProbe workers of all of the pod's containers.
func (m *manager) StopLivenessAndStartup(pod *v1.Pod) {
	m.workerLock.RLock()
	defer m.workerLock.RUnlock()

	key := probeKey{podUID: pod.UID}
	for _, c := range pod.Spec.Containers {
		key.containerName = c.Name
		for _, probeType := range [...]probeType{liveness, startup} {
			key.probeType = probeType
			if worker, ok := m.workers[key]; ok {
				worker.stop()
			}
		}
	}
}
3.3.3. RemovePod
Stops the LivenessProbe, StartupProbe, and ReadinessProbe workers of all of the pod's containers.
func (m *manager) RemovePod(pod *v1.Pod) {
	m.workerLock.RLock()
	defer m.workerLock.RUnlock()

	key := probeKey{podUID: pod.UID}
	for _, c := range pod.Spec.Containers {
		key.containerName = c.Name
		for _, probeType := range [...]probeType{readiness, liveness, startup} {
			key.probeType = probeType
			if worker, ok := m.workers[key]; ok {
				worker.stop()
			}
		}
	}
}
3.3.4. CleanupPods
CleanupPods stops the workers of every pod that is no longer in the desired set.
func (m *manager) CleanupPods(desiredPods map[types.UID]sets.Empty) {
	m.workerLock.RLock()
	defer m.workerLock.RUnlock()

	for key, worker := range m.workers {
		if _, ok := desiredPods[key.podUID]; !ok {
			worker.stop()
		}
	}
}
3.3.5. UpdatePodStatus
Updates the given PodStatus based on the cached results in the ResultManager: each container's Started field is derived from the startup probe results, and Ready from the readiness probe results. If a readiness worker exists but has not produced a result yet, a manual probe run is triggered through manualTriggerCh.
func (m *manager) UpdatePodStatus(podUID types.UID, podStatus *v1.PodStatus) {
	for i, c := range podStatus.ContainerStatuses {
		var started bool
		if c.State.Running == nil {
			started = false
		} else if result, ok := m.startupManager.Get(kubecontainer.ParseContainerID(c.ContainerID)); ok {
			started = result == results.Success
		} else {
			// The check whether there is a probe which hasn't run yet.
			_, exists := m.getWorker(podUID, c.Name, startup)
			started = !exists
		}
		podStatus.ContainerStatuses[i].Started = &started

		if started {
			var ready bool
			if c.State.Running == nil {
				ready = false
			} else if result, ok := m.readinessManager.Get(kubecontainer.ParseContainerID(c.ContainerID)); ok && result == results.Success {
				ready = true
			} else {
				// The check whether there is a probe which hasn't run yet.
				w, exists := m.getWorker(podUID, c.Name, readiness)
				ready = !exists // no readinessProbe -> always ready
				if exists {
					// Trigger an immediate run of the readinessProbe to update ready state
					select {
					case w.manualTriggerCh <- struct{}{}:
					default: // Non-blocking.
						klog.InfoS("Failed to trigger a manual run", "probe", w.probeType.String())
					}
				}
			}
			podStatus.ContainerStatuses[i].Ready = ready
		}
	}

	// init containers are ready if they have exited with success or if a readiness probe has
	// succeeded.
	for i, c := range podStatus.InitContainerStatuses {
		var ready bool
		if c.State.Terminated != nil && c.State.Terminated.ExitCode == 0 {
			ready = true
		}
		podStatus.InitContainerStatuses[i].Ready = ready
	}
}