The main job of the kubelet is to keep a pod's actual state consistent with its pod spec, where "state" covers the state and the number of containers in the pod. To do this, the kubelet watches for pod spec changes from multiple sources and periodically fetches the latest container states from the container runtime. For example, if a pod with three containers is created and one of those containers exits unexpectedly, the kubelet learns of the event promptly through PLEG and recreates the container.
PLEG (Pod Lifecycle Event Generator) is a submodule of the kubelet. It periodically fetches the latest container states from the container runtime, generates pod lifecycle events from them,
and sends those events to a channel; the kubelet reads events from this channel and handles them accordingly.
The PLEG workflow is shown in the following diagram:
PodLifecycleEvent is defined as follows:
// PodLifeCycleEventType define the event type of pod life cycle events.
type PodLifeCycleEventType string
const (
// ContainerStarted - event type when the new state of container is running.
ContainerStarted PodLifeCycleEventType = "ContainerStarted"
// ContainerDied - event type when the new state of container is exited.
ContainerDied PodLifeCycleEventType = "ContainerDied"
// ContainerRemoved - event type when the old state of container is exited.
ContainerRemoved PodLifeCycleEventType = "ContainerRemoved"
// PodSync is used to trigger syncing of a pod when the observed change of
// the state of the pod cannot be captured by any single event above.
PodSync PodLifeCycleEventType = "PodSync"
// ContainerChanged - event type when the new state of container is unknown.
ContainerChanged PodLifeCycleEventType = "ContainerChanged"
)
// PodLifecycleEvent is an event that reflects the change of the pod state.
type PodLifecycleEvent struct {
// The pod ID.
ID types.UID
// The type of the event.
Type PodLifeCycleEventType
// The accompanied data which varies based on the event type.
// - ContainerStarted/ContainerStopped: the container name (string).
// - All other event types: unused.
Data interface{}
}
PLEG exposes the following interface:
// PodLifecycleEventGenerator contains functions for generating pod life cycle events.
type PodLifecycleEventGenerator interface {
	// Start kicks off the periodic relist loop.
Start()
	// Watch returns the channel from which PLEG events can be received.
Watch() chan *PodLifecycleEvent
	// Healthy reports whether the time since the last relist is within relistThreshold (three minutes); exceeding the threshold means the container runtime is responding too slowly, there are too many pods on the node, or the runtime is genuinely broken.
Healthy() (bool, error)
}
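For illustration, a hypothetical consumer of this interface starts the PLEG and then drains its channel in a loop. The sketch below is not kubelet code (consumeEvents and stopCh are made up for this example); the kubelet's real consumption happens in syncLoop/syncLoopIteration, covered in section 3.
// consumeEvents is a hypothetical consumer of a PodLifecycleEventGenerator.
func consumeEvents(g PodLifecycleEventGenerator, stopCh <-chan struct{}) {
	g.Start() // spawn the periodic relist goroutine
	ch := g.Watch()
	for {
		select {
		case e := <-ch:
			klog.InfoS("PLEG event", "pod", e.ID, "type", e.Type, "data", e.Data)
		case <-stopCh:
			return
		}
	}
}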
PLEG is implemented by the GenericPLEG struct:
type GenericPLEG struct {
	// The relist period (plegRelistPeriod in kubelet.go, 1s by default).
	relistPeriod time.Duration
	// The container runtime, used to fetch the latest container states.
	runtime kubecontainer.Runtime
	// The channel on which PLEG events are sent out.
	eventChannel chan *PodLifecycleEvent
	// Internal bookkeeping: all pod information obtained from the container runtime.
	podRecords podRecords
	// Timestamp of the last relist, used by Healthy() and for the relist interval metric.
	relistTime atomic.Value
	// Kubelet-internal cache holding the PodStatus fetched from the container runtime.
	cache kubecontainer.Cache
	// Clock used for timestamps (and for testability).
	clock clock.Clock
	// Pods whose status could not be retrieved from the container runtime during this
	// relist; they are re-inspected during the next relist.
	podsToReinspect map[types.UID]*kubecontainer.Pod
}
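For reference, podRecords (used heavily in relist below) looks roughly like the following in pkg/kubelet/pleg/generic.go; the exact code may differ slightly between versions. Each entry keeps the pod state seen in the previous relist (old) and in the current relist (current); setCurrent/getOld/getCurrent are straightforward accessors, and update promotes current to old once events have been generated.
type podRecord struct {
	old     *kubecontainer.Pod // pod state observed during the previous relist
	current *kubecontainer.Pod // pod state observed during the current relist
}

type podRecords map[types.UID]*podRecord

// update promotes current to old (simplified); if the pod no longer exists
// (current == nil), the entry is removed entirely.
func (pr podRecords) update(id types.UID) {
	r, ok := pr[id]
	if !ok {
		return
	}
	if r.current == nil {
		delete(pr, id)
		return
	}
	r.old = r.current
	r.current = nil
}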
1. Creating the GenericPLEG
// NewGenericPLEG instantiates a new GenericPLEG object and return it.
func NewGenericPLEG(runtime kubecontainer.Runtime, channelCapacity int,
relistPeriod time.Duration, cache kubecontainer.Cache, clock clock.Clock) PodLifecycleEventGenerator {
return &GenericPLEG{
relistPeriod: relistPeriod,
runtime: runtime,
eventChannel: make(chan *PodLifecycleEvent, channelCapacity),
podRecords: make(podRecords),
cache: cache,
clock: clock,
}
}
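In NewMainKubelet the generic PLEG is created roughly as follows (plegChannelCapacity is 1000 and plegRelistPeriod is 1s in kubelet.go; the exact call may vary between versions):
// In NewMainKubelet (pkg/kubelet/kubelet.go), roughly:
klet.pleg = pleg.NewGenericPLEG(klet.containerRuntime, plegChannelCapacity,
	plegRelistPeriod, klet.podCache, clock.RealClock{})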
2. Starting the GenericPLEG
Start spawns a goroutine that runs relist periodically; the period relistPeriod is 1s by default (plegRelistPeriod in kubelet.go):
// Start spawns a goroutine to relist periodically.
func (g *GenericPLEG) Start() {
go wait.Until(g.relist, g.relistPeriod, wait.NeverStop)
}
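wait.Until (from k8s.io/apimachinery/pkg/util/wait) runs the given function, sleeps for the period, and repeats until the stop channel is closed; with wait.NeverStop the loop runs for the lifetime of the process. A minimal standalone illustration of its behavior:
package main

import (
	"fmt"
	"time"

	"k8s.io/apimachinery/pkg/util/wait"
)

func main() {
	stop := make(chan struct{})
	// Run the closure once per second until stop is closed.
	go wait.Until(func() { fmt.Println("tick") }, time.Second, stop)
	time.Sleep(3 * time.Second)
	close(stop)
}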
As the comment on relist says, relist queries the container runtime for the list of pods/containers, compares it with the internally maintained pod/container list, and generates the corresponding events:
// relist queries the container runtime for list of pods/containers, compare
// with the internal pods/containers, and generates events accordingly.
func (g *GenericPLEG) relist() {
klog.V(5).InfoS("GenericPLEG: Relisting")
if lastRelistTime := g.getRelistTime(); !lastRelistTime.IsZero() {
metrics.PLEGRelistInterval.Observe(metrics.SinceInSeconds(lastRelistTime))
}
timestamp := g.clock.Now()
defer func() {
metrics.PLEGRelistDuration.Observe(metrics.SinceInSeconds(timestamp))
}()
	// Fetch all pods/containers on this node from the container runtime; the argument true means everything is returned, including containers that have already exited or are dead.
// Get all the pods.
podList, err := g.runtime.GetPods(true)
if err != nil {
klog.ErrorS(err, "GenericPLEG: Unable to retrieve pods")
return
}
	// Record the timestamp of this relist.
g.updateRelistTime(timestamp)
pods := kubecontainer.Pods(podList)
// update running pod and container count
updateRunningPodAndContainerMetrics(pods)
	// Save the pods obtained in this round into podRecords[pid].current.
g.podRecords.setCurrent(pods)
	// The core part:
	// podRecords[pid].current holds the latest pod state,
	// podRecords[pid].old holds the pod state from the previous relist.
	// Comparing the old and new states yields the events, which are collected in eventsByPodID.
// Compare the old and the current pods, and generate events.
eventsByPodID := map[types.UID][]*PodLifecycleEvent{}
for pid := range g.podRecords {
oldPod := g.podRecords.getOld(pid)
pod := g.podRecords.getCurrent(pid)
// Get all containers in the old and the new pod.
allContainers := getContainersFromPods(oldPod, pod)
for _, container := range allContainers {
events := computeEvents(oldPod, pod, &container.ID)
for _, e := range events {
updateEvents(eventsByPodID, e)
}
}
}
var needsReinspection map[types.UID]*kubecontainer.Pod
if g.cacheEnabled() {
needsReinspection = make(map[types.UID]*kubecontainer.Pod)
}
	// Iterate over eventsByPodID and send the events to the channel.
// If there are events associated with a pod, we should update the
// podCache.
for pid, events := range eventsByPodID {
pod := g.podRecords.getCurrent(pid)
if g.cacheEnabled() {
			// Update the pod's status in the cache (see the sketch of updateCache after this function); on failure, add the pod to needsReinspection so it is inspected again in the next relist.
// updateCache() will inspect the pod and update the cache. If an
// error occurs during the inspection, we want PLEG to retry again
// in the next relist. To achieve this, we do not update the
// associated podRecord of the pod, so that the change will be
// detect again in the next relist.
// TODO: If many pods changed during the same relist period,
// inspecting the pod and getting the PodStatus to update the cache
// serially may take a while. We should be aware of this and
// parallelize if needed.
if err := g.updateCache(pod, pid); err != nil {
// Rely on updateCache calling GetPodStatus to log the actual error.
klog.V(4).ErrorS(err, "PLEG: Ignoring events for pod", "pod", klog.KRef(pod.Namespace, pod.Name))
// make sure we try to reinspect the pod during the next relisting
needsReinspection[pid] = pod
continue
} else {
// this pod was in the list to reinspect and we did so because it had events, so remove it
// from the list (we don't want the reinspection code below to inspect it a second time in
// this relist execution)
delete(g.podsToReinspect, pid)
}
}
		// Promote the pod state obtained in this round: podRecords[pid].current becomes podRecords[pid].old.
// Update the internal storage and send out the events.
g.podRecords.update(pid)
// Map from containerId to exit code; used as a temporary cache for lookup
containerExitCode := make(map[string]int)
for i := range events {
			// ContainerChanged events are dropped: the container state is still unknown/unsettled, so sending the event has little value; another event is likely to follow soon anyway, and no other component consumes this event type.
// Filter out events that are not reliable and no other components use yet.
if events[i].Type == ContainerChanged {
continue
}
select {
			// Send the event to the channel; the kubelet reads events from this channel and processes them.
case g.eventChannel <- events[i]:
default:
metrics.PLEGDiscardEvents.Inc()
klog.ErrorS(nil, "Event channel is full, discard this relist() cycle event")
}
...
}
}
if g.cacheEnabled() {
// reinspect any pods that failed inspection during the previous relist
if len(g.podsToReinspect) > 0 {
klog.V(5).InfoS("GenericPLEG: Reinspecting pods that previously failed inspection")
for pid, pod := range g.podsToReinspect {
if err := g.updateCache(pod, pid); err != nil {
// Rely on updateCache calling GetPodStatus to log the actual error.
klog.V(5).ErrorS(err, "PLEG: pod failed reinspection", "pod", klog.KRef(pod.Namespace, pod.Name))
needsReinspection[pid] = pod
}
}
}
// Update the cache timestamp. This needs to happen *after*
// all pods have been properly updated in the cache.
g.cache.UpdateTime(timestamp)
}
// make sure we retain the list of pods that need reinspecting the next time relist is called
g.podsToReinspect = needsReinspection
}
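updateCache, called above for every pod that produced events, is what actually fills the kubelet-internal cache: it queries the runtime for the pod's full PodStatus and stores the result (or the error) together with a timestamp. Below is a simplified sketch of its logic; the real function in pkg/kubelet/pleg/generic.go also preserves the pod IPs across updates.
// Simplified sketch of GenericPLEG.updateCache.
func (g *GenericPLEG) updateCache(pod *kubecontainer.Pod, pid types.UID) error {
	if pod == nil {
		// The pod disappeared from the current relist: it has no visible
		// containers left, so drop its entry from the cache.
		g.cache.Delete(pid)
		return nil
	}
	timestamp := g.clock.Now()
	// Ask the runtime for the pod's full status (containers, sandboxes, IPs, ...).
	status, err := g.runtime.GetPodStatus(pod.ID, pod.Name, pod.Namespace)
	// Store the status (or the error) in the cache; pod workers read it from
	// here instead of querying the runtime directly.
	g.cache.Set(pod.ID, status, err, timestamp)
	return err
}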
computeEvents compares the old and new states of a container in the pod and generates the corresponding events:
func computeEvents(oldPod, newPod *kubecontainer.Pod, cid *kubecontainer.ContainerID) []*PodLifecycleEvent {
var pid types.UID
if oldPod != nil {
pid = oldPod.ID
} else if newPod != nil {
pid = newPod.ID
}
	// Get the container's old state.
oldState := getContainerState(oldPod, cid)
	// Get the container's new state.
newState := getContainerState(newPod, cid)
	// Generate events based on the old and new states.
return generateEvents(pid, cid.ID, oldState, newState)
}
func generateEvents(podID types.UID, cid string, oldState, newState plegContainerState) []*PodLifecycleEvent {
	// If the new state equals the old state, the container has not changed and no event is generated.
if newState == oldState {
return nil
}
klog.V(4).InfoS("GenericPLEG", "podUID", podID, "containerID", cid, "oldState", oldState, "newState", newState)
switch newState {
case plegContainerRunning:
return []*PodLifecycleEvent{{ID: podID, Type: ContainerStarted, Data: cid}}
case plegContainerExited:
return []*PodLifecycleEvent{{ID: podID, Type: ContainerDied, Data: cid}}
case plegContainerUnknown:
return []*PodLifecycleEvent{{ID: podID, Type: ContainerChanged, Data: cid}}
case plegContainerNonExistent:
switch oldState {
case plegContainerExited:
// We already reported that the container died before.
return []*PodLifecycleEvent{{ID: podID, Type: ContainerRemoved, Data: cid}}
default:
return []*PodLifecycleEvent{{ID: podID, Type: ContainerDied, Data: cid}, {ID: podID, Type: ContainerRemoved, Data: cid}}
}
default:
panic(fmt.Sprintf("unrecognized container state: %v", newState))
}
}
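As a concrete illustration (the pod UID and container ID below are made up), a container that was running during the previous relist and is gone from the current one yields two events:
// Hypothetical example: container "abc" of pod "pod-1" was running in the
// previous relist and no longer exists in the current one.
events := generateEvents(types.UID("pod-1"), "abc", plegContainerRunning, plegContainerNonExistent)
// events now contains two entries:
//   {ID: "pod-1", Type: ContainerDied,    Data: "abc"}
//   {ID: "pod-1", Type: ContainerRemoved, Data: "abc"}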
3. kubelet watches the PLEG channel for pod events
syncLoop is the kubelet's main loop:
func (kl *Kubelet) syncLoop(updates <-chan kubetypes.PodUpdate, handler SyncHandler) {
	// Get the PLEG event channel.
plegCh := kl.pleg.Watch()
const (
base = 100 * time.Millisecond
max = 5 * time.Second
factor = 2
)
duration := base
...
for {
		// 3.1 If the runtime health check reports an error, sleep for a while and skip pod synchronization.
		// The sleep starts at 100ms, doubles on each failure, and is capped at 5s.
if err := kl.runtimeState.runtimeErrors(); err != nil {
klog.ErrorS(err, "Skipping pod synchronization")
// exponential backoff
time.Sleep(duration)
duration = time.Duration(math.Min(float64(max), factor*float64(duration)))
continue
}
// reset backoff if we have a success
duration = base
kl.syncLoopMonitor.Store(kl.clock.Now())
		// 3.2 Fetch events from the various channels.
if !kl.syncLoopIteration(updates, handler, syncTicker.C, housekeepingTicker.C, plegCh) {
break
}
kl.syncLoopMonitor.Store(kl.clock.Now())
}
3.1 runtimeErrors calls the registered health checks to determine whether the runtime is working properly
NewMainKubelet(...)
klet.runtimeState = newRuntimeState(maxWaitForContainerRuntime)
	// Register PLEG's health check function Healthy.
klet.runtimeState.addHealthCheck("PLEG", klet.pleg.Healthy)
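addHealthCheck just records the name and the callback in a slice that runtimeErrors iterates over later; it looks roughly like this in pkg/kubelet/runtime.go (details may differ between versions):
type healthCheckFnType func() (bool, error)

type healthCheck struct {
	name string
	fn   healthCheckFnType
}

func (s *runtimeState) addHealthCheck(name string, f healthCheckFnType) {
	s.Lock()
	defer s.Unlock()
	s.healthChecks = append(s.healthChecks, &healthCheck{name: name, fn: f})
}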
// Healthy check if PLEG work properly.
// relistThreshold is the maximum interval between two relist.
func (g *GenericPLEG) Healthy() (bool, error) {
	// Get the timestamp of the last relist.
relistTime := g.getRelistTime()
	// A zero value means relist has not run yet; return false.
if relistTime.IsZero() {
return false, fmt.Errorf("pleg has yet to be successful")
}
// Expose as metric so you can alert on `time()-pleg_last_seen_seconds > nn`
metrics.PLEGLastSeen.Set(float64(relistTime.Unix()))
	// Compute how long it has been since the last relist.
elapsed := g.clock.Since(relistTime)
	// If more than three minutes have elapsed, return false.
if elapsed > relistThreshold {
return false, fmt.Errorf("pleg was last seen active %v ago; threshold is %v", elapsed, relistThreshold)
}
return true, nil
}
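The threshold used above is a package-level constant in pkg/kubelet/pleg/generic.go; it is deliberately much larger than the relist period so that a single slow relist does not flip the kubelet between healthy and unhealthy:
// The threshold needs to be greater than the relisting period plus the
// time a relist can take, which can vary significantly.
const relistThreshold = 3 * time.Minute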
func (s *runtimeState) runtimeErrors() error {
s.RLock()
defer s.RUnlock()
errs := []error{}
if s.lastBaseRuntimeSync.IsZero() {
errs = append(errs, errors.New("container runtime status check may not have completed yet"))
} else if !s.lastBaseRuntimeSync.Add(s.baseRuntimeSyncThreshold).After(time.Now()) {
errs = append(errs, errors.New("container runtime is down"))
}
	// Iterate over healthChecks; currently only PLEG's Healthy is registered.
for _, hc := range s.healthChecks {
		// Run the check; if it returns false, the runtime is not ready yet or is unhealthy.
if ok, err := hc.fn(); !ok {
errs = append(errs, fmt.Errorf("%s is not healthy: %v", hc.name, err))
}
}
if s.runtimeError != nil {
errs = append(errs, s.runtimeError)
}
return utilerrors.NewAggregate(errs)
}
3.2 syncLoopIteration reads events from the channels and handles them
syncLoopIteration listens on several channels; here we only look at the PLEG channel:
func (kl *Kubelet) syncLoopIteration(configCh <-chan kubetypes.PodUpdate, handler SyncHandler,
syncCh <-chan time.Time, housekeepingCh <-chan time.Time, plegCh <-chan *pleg.PodLifecycleEvent) bool {
select {
case u, open := <-configCh:
...
case e := <-plegCh:
if e.Type == pleg.ContainerStarted {
// record the most recent time we observed a container start for this pod.
// this lets us selectively invalidate the runtimeCache when processing a delete for this pod
// to make sure we don't miss handling graceful termination for containers we reported as having started.
kl.lastContainerStartedTime.Add(e.ID, time.Now())
}
		// isSyncPodWorthy (sketched after this snippet): as long as the event type is not ContainerRemoved, call HandlePodSyncs to sync the pod.
if isSyncPodWorthy(e) {
// PLEG event for a pod; sync it.
if pod, ok := kl.podManager.GetPodByUID(e.ID); ok {
klog.V(2).InfoS("SyncLoop (PLEG): event for pod", "pod", klog.KObj(pod), "event", e)
handler.HandlePodSyncs([]*v1.Pod{pod})
} else {
// If the pod no longer exists, ignore the event.
klog.V(4).InfoS("SyncLoop (PLEG): pod does not exist, ignore irrelevant event", "event", e)
}
}
		// If the event type is ContainerDied, call the runtime to clean up the dead container.
if e.Type == pleg.ContainerDied {
if containerID, ok := e.Data.(string); ok {
kl.cleanUpContainersInPod(e.ID, containerID)
}
}
case <-syncCh:
...
}
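isSyncPodWorthy, referenced above, is a one-line filter in kubelet.go: every event type except ContainerRemoved triggers a pod sync, because a removed container no longer affects the pod's state:
// isSyncPodWorthy filters out events that are not worthy of pod syncing.
func isSyncPodWorthy(event *pleg.PodLifecycleEvent) bool {
	// ContainerRemoved doesn't affect pod state
	return event.Type != pleg.ContainerRemoved
}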
The subsequent call chain is shown below (see also the flow chart above); the code is not reproduced here:
HandlePodSyncs -> dispatchWork -> podWorkers.UpdatePod -> managePodLoop -> syncPod
Reference: https://github.com/kubernetes/design-proposals-archive/blob/main/node/pod-lifecycle-event-generator.md