The main job of the kubelet is to keep a pod's actual state consistent with its pod spec, where "state" covers the state and the number of containers in the pod. To do this, the kubelet watches for pod spec changes from multiple sources and periodically fetches the latest container states from the container runtime. For example, if a pod with three containers is created and one of those containers exits unexpectedly, the kubelet learns of the event promptly through PLEG and recreates the container.
PLEG (Pod Lifecycle Event Generator) is a submodule of the kubelet. It periodically fetches the latest container states from the container runtime, generates pod lifecycle events from them,
and sends those events to a channel; the kubelet reads events from this channel and handles them accordingly.
The PLEG workflow is shown in the following diagram:
PodLifecycleEvent is defined as follows:
// PodLifeCycleEventType define the event type of pod life cycle events.
type PodLifeCycleEventType string
const (
// ContainerStarted - event type when the new state of container is running.
ContainerStarted PodLifeCycleEventType = "ContainerStarted"
// ContainerDied - event type when the new state of container is exited.
ContainerDied PodLifeCycleEventType = "ContainerDied"
// ContainerRemoved - event type when the old state of container is exited.
ContainerRemoved PodLifeCycleEventType = "ContainerRemoved"
// PodSync is used to trigger syncing of a pod when the observed change of
// the state of the pod cannot be captured by any single event above.
PodSync PodLifeCycleEventType = "PodSync"
// ContainerChanged - event type when the new state of container is unknown.
ContainerChanged PodLifeCycleEventType = "ContainerChanged"
)
// PodLifecycleEvent is an event that reflects the change of the pod state.
type PodLifecycleEvent struct {
// The pod ID.
ID types.UID
// The type of the event.
Type PodLifeCycleEventType
// The accompanied data which varies based on the event type.
// - ContainerStarted/ContainerStopped: the container name (string).
// - All other event types: unused.
Data interface{}
}
PLEG exposes the following interface:
// PodLifecycleEventGenerator contains functions for generating pod life cycle events.
type PodLifecycleEventGenerator interface {
	// Start kicks off the periodic relist loop.
Start()
	// Watch returns the channel from which PLEG events can be received.
Watch() chan *PodLifecycleEvent
	// Healthy reports whether the time since the last relist is within relistThreshold (three minutes); exceeding the threshold means the container runtime is responding too slowly, there are too many pods on the node, or the runtime is genuinely broken.
Healthy() (bool, error)
}
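For illustration, a hypothetical consumer of this interface starts the PLEG and then drains its channel in a loop. The sketch below is not kubelet code (consumeEvents and stopCh are made up for this example); the kubelet's real consumption happens in syncLoop/syncLoopIteration, covered in section 3.
// consumeEvents is a hypothetical consumer of a PodLifecycleEventGenerator.
func consumeEvents(g PodLifecycleEventGenerator, stopCh <-chan struct{}) {
	g.Start() // spawn the periodic relist goroutine
	ch := g.Watch()
	for {
		select {
		case e := <-ch:
			klog.InfoS("PLEG event", "pod", e.ID, "type", e.Type, "data", e.Data)
		case <-stopCh:
			return
		}
	}
}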
PLEG is implemented by the GenericPLEG struct:
type GenericPLEG struct {
	// The relist period (plegRelistPeriod in kubelet.go, 1s by default).
	relistPeriod time.Duration
	// The container runtime, used to fetch the latest container states.
	runtime kubecontainer.Runtime
	// The channel on which PLEG events are sent out.
	eventChannel chan *PodLifecycleEvent
	// Internal bookkeeping: all pod information obtained from the container runtime.
	podRecords podRecords
	// Timestamp of the last relist, used by Healthy() and for the relist interval metric.
	relistTime atomic.Value
	// Kubelet-internal cache holding the PodStatus fetched from the container runtime.
	cache kubecontainer.Cache
	// Clock used for timestamps (and for testability).
	clock clock.Clock
	// Pods whose status could not be retrieved from the container runtime during this
	// relist; they are re-inspected during the next relist.
	podsToReinspect map[types.UID]*kubecontainer.Pod
}
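For reference, podRecords (used heavily in relist below) looks roughly like the following in pkg/kubelet/pleg/generic.go; the exact code may differ slightly between versions. Each entry keeps the pod state seen in the previous relist (old) and in the current relist (current); setCurrent/getOld/getCurrent are straightforward accessors, and update promotes current to old once events have been generated.
type podRecord struct {
	old     *kubecontainer.Pod // pod state observed during the previous relist
	current *kubecontainer.Pod // pod state observed during the current relist
}

type podRecords map[types.UID]*podRecord

// update promotes current to old (simplified); if the pod no longer exists
// (current == nil), the entry is removed entirely.
func (pr podRecords) update(id types.UID) {
	r, ok := pr[id]
	if !ok {
		return
	}
	if r.current == nil {
		delete(pr, id)
		return
	}
	r.old = r.current
	r.current = nil
}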
1. Creating the GenericPLEG
// NewGenericPLEG instantiates a new GenericPLEG object and return it.
func NewGenericPLEG(runtime kubecontainer.Runtime, channelCapacity int,
relistPeriod time.Duration, cache kubecontainer.Cache, clock clock.Clock) PodLifecycleEventGenerator {
return &GenericPLEG{
relistPeriod: relistPeriod,
runtime: runtime,
eventChannel: make(chan *PodLifecycleEvent, channelCapacity),
podRecords: make(podRecords),
cache: cache,
clock: clock,
}
}
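In NewMainKubelet the generic PLEG is created roughly as follows (plegChannelCapacity is 1000 and plegRelistPeriod is 1s in kubelet.go; the exact call may vary between versions):
// In NewMainKubelet (pkg/kubelet/kubelet.go), roughly:
klet.pleg = pleg.NewGenericPLEG(klet.containerRuntime, plegChannelCapacity,
	plegRelistPeriod, klet.podCache, clock.RealClock{})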
2. Starting the GenericPLEG
Start spawns a goroutine that runs relist periodically; the period relistPeriod is 1s by default (plegRelistPeriod in kubelet.go):
// Start spawns a goroutine to relist periodically.
func (g *GenericPLEG) Start() {
go wait.Until(g.relist, g.relistPeriod, wait.NeverStop)
}
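wait.Until (from k8s.io/apimachinery/pkg/util/wait) runs the given function, sleeps for the period, and repeats until the stop channel is closed; with wait.NeverStop the loop runs for the lifetime of the process. A minimal standalone illustration of its behavior:
package main

import (
	"fmt"
	"time"

	"k8s.io/apimachinery/pkg/util/wait"
)

func main() {
	stop := make(chan struct{})
	// Run the closure once per second until stop is closed.
	go wait.Until(func() { fmt.Println("tick") }, time.Second, stop)
	time.Sleep(3 * time.Second)
	close(stop)
}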
As the comment on relist says, relist queries the container runtime for the list of pods/containers, compares it with the internally maintained pod/container list, and generates the corresponding events:
// relist queries the container runtime for list of pods/containers, compare
// with the internal pods/containers, and generates events accordingly.
func (g *GenericPLEG) relist() {
klog.V(5).InfoS("GenericPLEG: Relisting")
if lastRelistTime := g.getRelistTime(); !lastRelistTime.IsZero() {
metrics.PLEGRelistInterval.Observe(metrics.SinceInSeconds(lastRelistTime))
}
timestamp := g.clock.Now()
defer func() {
metrics.PLEGRelistDuration.Observe(metrics.SinceInSeconds(timestamp))
}()
	// Fetch all pods/containers on this node from the container runtime; the argument true means everything is returned, including containers that have already exited or are dead.
// Get all the pods.
podList, err := g.runtime.GetPods(true)
if err != nil {
klog.ErrorS(err, "GenericPLEG: Unable to retrieve pods")
return
}
	// Record the timestamp of this relist.
g.updateRelistTime(timestamp)
pods := kubecontainer.Pods(podList)
// update running pod and container count
updateRunningPodAndContainerMetrics(pods)
	// Save the pods obtained in this round into podRecords[pid].current.
g.podRecords.setCurrent(pods)
	// The core part:
	// podRecords[pid].current holds the latest pod state,
	// podRecords[pid].old holds the pod state from the previous relist.
	// Comparing the old and new states yields the events, which are collected in eventsByPodID.
// Compare the old and the current pods, and generate events.
eventsByPodID := map[types.UID][]*PodLifecycleEvent{}
for pid := range g.podRecords {
oldPod := g.podRecords.getOld(pid)
pod := g.podRecords.getCurrent(pid)
// Get all containers in the old and the new pod.
allContainers := getContainersFromPods(oldPod, pod)
for _, container := range allContainers {
events := computeEvents(oldPod, pod, &container.ID)
for _, e := range events {
updateEvents(eventsByPodID, e)
}
}
}
var needsReinspection map[types.UID]*kubecontainer.Pod
if g.cacheEnabled() {
needsReinspection = make(map[types.UID]*kubecontainer.Pod)
}
	// Iterate over eventsByPodID and send the events to the channel.
// If there are events associated with a pod, we should update the
// podCache.
for pid, events := range eventsByPodID {
pod := g.podRecords.getCurrent(pid)
if g.cacheEnabled() {
			// Update the pod's status in the cache (see the sketch of updateCache after this function); on failure, add the pod to needsReinspection so it is inspected again in the next relist.
// updateCache() will inspect the pod and update the cache. If an
// error occurs during the inspection, we want PLEG to retry again
// in the next relist. To achieve this, we do not update the
// associated podRecord of the pod, so that the change will be
// detect again in the next relist.
// TODO: If many pods changed during the same relist period,
// inspecting the pod and getting the PodStatus to update the cache
// serially may take a while. We should be aware of this and
// parallelize if needed.
if err := g.updateCache(pod, pid); err != nil {
// Rely on updateCache calling GetPodStatus to log the actual error.
klog.V(4).ErrorS(err, "PLEG: Ignoring events for pod", "pod", klog.KRef(pod.Namespace, pod.Name))
// make sure we try to reinspect the pod during the next relisting
needsReinspection[pid] = pod
continue
} else {
// this pod was in the list to reinspect and we did so because it had events, so remove it
// from the list (we don't want the reinspection code below to inspect it a second time in
// this relist execution)
delete(g.podsToReinspect, pid)
}
}
		// Promote the pod state obtained in this round: podRecords[pid].current becomes podRecords[pid].old.
// Update the internal storage and send out the events.
g.podRecords.update(pid)
// Map from containerId to exit code; used as a temporary cache for lookup
containerExitCode := make(map[string]int)
for i := range events {
			// ContainerChanged events are dropped: the container state is still unknown/unsettled, so sending the event has little value; another event is likely to follow soon anyway, and no other component consumes this event type.
// Filter out events that are not reliable and no other components use yet.
if events[i].Type == ContainerChanged {
continue
}
select {
			// Send the event to the channel; the kubelet reads events from this channel and processes them.
case g.eventChannel <- events[i]:
default:
metrics.PLEGDiscardEvents.Inc()
klog.ErrorS(nil, "Event channel is full, discard this relist() cycle event")
}
...
}
}
if g.cacheEnabled() {
// reinspect any pods that failed inspection during the previous relist
if len(g.podsToReinspect) > 0 {
klog.V(5).InfoS("GenericPLEG: Reinspecting pods that previously failed inspection")
for pid, pod := range g.podsToReinspect {
if err := g.updateCache(pod, pid); err != nil {
// Rely on updateCache calling GetPodStatus to log the actual error.
klog.V(5).ErrorS(err, "PLEG: pod failed reinspection", "pod", klog.KRef(pod.Namespace, pod.Name))
needsReinspection[pid] = pod
}
}
}
// Update the cache timestamp. This needs to happen *after*
// all pods have been properly updated in the cache.
g.cache.UpdateTime(timestamp)
}
// make sure we retain the list of pods that need reinspecting the next time relist is called
g.podsToReinspect = needsReinspection
}
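updateCache, called above for every pod that produced events, is what actually fills the kubelet-internal cache: it queries the runtime for the pod's full PodStatus and stores the result (or the error) together with a timestamp. Below is a simplified sketch of its logic; the real function in pkg/kubelet/pleg/generic.go also preserves the pod IPs across updates.
// Simplified sketch of GenericPLEG.updateCache.
func (g *GenericPLEG) updateCache(pod *kubecontainer.Pod, pid types.UID) error {
	if pod == nil {
		// The pod disappeared from the current relist: it has no visible
		// containers left, so drop its entry from the cache.
		g.cache.Delete(pid)
		return nil
	}
	timestamp := g.clock.Now()
	// Ask the runtime for the pod's full status (containers, sandboxes, IPs, ...).
	status, err := g.runtime.GetPodStatus(pod.ID, pod.Name, pod.Namespace)
	// Store the status (or the error) in the cache; pod workers read it from
	// here instead of querying the runtime directly.
	g.cache.Set(pod.ID, status, err, timestamp)
	return err
}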
computeEvents compares the old and new states of a container in the pod and generates the corresponding events:
func computeEvents(oldPod, newPod *kubecontainer.Pod, cid *kubecontainer.ContainerID) []*PodLifecycleEvent {
var pid types.UID
if oldPod != nil {
pid = oldPod.ID
} else if newPod != nil {
pid = newPod.ID
}
	// Get the container's old state.
oldState := getContainerState(oldPod, cid)
	// Get the container's new state.
newState := getContainerState(newPod, cid)
	// Generate events based on the old and new states.
return generateEvents(pid, cid.ID, oldState, newState)
}
func generateEvents(podID types.UID, cid string, oldState, newState plegContainerState) []*PodLifecycleEvent {
	// If the new state equals the old state, the container has not changed and no event is generated.
if newState == oldState {
return nil
}
klog.V(4).InfoS("GenericPLEG", "podUID", podID, "containerID", cid, "oldState", oldState, "newState", newState)
switch newState {
case plegContainerRunning:
return []*PodLifecycleEvent{{ID: podID, Type: ContainerStarted, Data: cid}}
case plegContainerExited:
return []*PodLifecycleEvent{{ID: podID, Type: ContainerDied, Data: cid}}
case plegContainerUnknown:
return []*PodLifecycleEvent{{ID: podID, Type: ContainerChanged, Data: cid}}
case plegContainerNonExistent:
switch oldState {
case plegContainerExited:
// We already reported that the container died before.
return []*PodLifecycleEvent{{ID: podID, Type: ContainerRemoved, Data: cid}}
default:
return []*PodLifecycleEvent{{ID: podID, Type: ContainerDied, Data: cid}, {ID: podID, Type: ContainerRemoved, Data: cid}}
}
default:
panic(fmt.Sprintf("unrecognized container state: %v", newState))
}
}
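As a concrete illustration (the pod UID and container ID below are made up), a container that was running during the previous relist and is gone from the current one yields two events:
// Hypothetical example: container "abc" of pod "pod-1" was running in the
// previous relist and no longer exists in the current one.
events := generateEvents(types.UID("pod-1"), "abc", plegContainerRunning, plegContainerNonExistent)
// events now contains two entries:
//   {ID: "pod-1", Type: ContainerDied,    Data: "abc"}
//   {ID: "pod-1", Type: ContainerRemoved, Data: "abc"}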
3. kubelet watches the PLEG channel for pod events
syncLoop is the kubelet's main loop:
func (kl *Kubelet) syncLoop(updates <-chan kubetypes.PodUpdate, handler SyncHandler) {
	// Get the PLEG event channel.
plegCh := kl.pleg.Watch()
const (
base = 100 * time.Millisecond
max = 5 * time.Second
factor = 2
)
duration := base
...
for {
		// 3.1 If the runtime health check reports an error, sleep for a while and skip pod synchronization.
		// The sleep starts at 100ms, doubles on each failure, and is capped at 5s.
if err := kl.runtimeState.runtimeErrors(); err != nil {
klog.ErrorS(err, "Skipping pod synchronization")
// exponential backoff
time.Sleep(duration)
duration = time.Duration(math.Min(float64(max), factor*float64(duration)))
continue
}
// reset backoff if we have a success
duration = base
kl.syncLoopMonitor.Store(kl.clock.Now())
		// 3.2 Fetch events from the various channels.
if !kl.syncLoopIteration(updates, handler, syncTicker.C, housekeepingTicker.C, plegCh) {
break
}
kl.syncLoopMonitor.Store(kl.clock.Now())
}
3.1 runtimeErrors calls the registered health checks to determine whether the runtime is working properly
NewMainKubelet(...)
klet.runtimeState = newRuntimeState(maxWaitForContainerRuntime)
	// Register PLEG's health check function Healthy.
klet.runtimeState.addHealthCheck("PLEG", klet.pleg.Healthy)
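addHealthCheck just records the name and the callback in a slice that runtimeErrors iterates over later; it looks roughly like this in pkg/kubelet/runtime.go (details may differ between versions):
type healthCheckFnType func() (bool, error)

type healthCheck struct {
	name string
	fn   healthCheckFnType
}

func (s *runtimeState) addHealthCheck(name string, f healthCheckFnType) {
	s.Lock()
	defer s.Unlock()
	s.healthChecks = append(s.healthChecks, &healthCheck{name: name, fn: f})
}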
// Healthy check if PLEG work properly.
// relistThreshold is the maximum interval between two relist.
func (g *GenericPLEG) Healthy() (bool, error) {
	// Get the timestamp of the last relist.
relistTime := g.getRelistTime()
	// A zero value means relist has not run yet; return false.
if relistTime.IsZero() {
return false, fmt.Errorf("pleg has yet to be successful")
}
// Expose as metric so you can alert on `time()-pleg_last_seen_seconds > nn`
metrics.PLEGLastSeen.Set(float64(relistTime.Unix()))
	// Compute how long it has been since the last relist.
elapsed := g.clock.Since(relistTime)
	// If more than three minutes have elapsed, return false.
if elapsed > relistThreshold {
return false, fmt.Errorf("pleg was last seen active %v ago; threshold is %v", elapsed, relistThreshold)
}
return true, nil
}
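The threshold used above is a package-level constant in pkg/kubelet/pleg/generic.go; it is deliberately much larger than the relist period so that a single slow relist does not flip the kubelet between healthy and unhealthy:
// The threshold needs to be greater than the relisting period plus the
// time a relist can take, which can vary significantly.
const relistThreshold = 3 * time.Minute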
func (s *runtimeState) runtimeErrors() error {
s.RLock()
defer s.RUnlock()
errs := []error{}
if s.lastBaseRuntimeSync.IsZero() {
errs = append(errs, errors.New("container runtime status check may not have completed yet"))
} else if !s.lastBaseRuntimeSync.Add(s.baseRuntimeSyncThreshold).After(time.Now()) {
errs = append(errs, errors.New("container runtime is down"))
}
	// Iterate over healthChecks; currently only PLEG's Healthy is registered.
for _, hc := range s.healthChecks {
		// Run the check; if it returns false, the runtime is not ready yet or is unhealthy.
if ok, err := hc.fn(); !ok {
errs = append(errs, fmt.Errorf("%s is not healthy: %v", hc.name, err))
}
}
if s.runtimeError != nil {
errs = append(errs, s.runtimeError)
}
return utilerrors.NewAggregate(errs)
}
3.2 syncLoopIteration reads events from the channels and handles them
syncLoopIteration listens on several channels; here we only look at the PLEG channel:
func (kl *Kubelet) syncLoopIteration(configCh <-chan kubetypes.PodUpdate, handler SyncHandler,
syncCh <-chan time.Time, housekeepingCh <-chan time.Time, plegCh <-chan *pleg.PodLifecycleEvent) bool {
select {
case u, open := <-configCh:
...
case e := <-plegCh:
if e.Type == pleg.ContainerStarted {
// record the most recent time we observed a container start for this pod.
// this lets us selectively invalidate the runtimeCache when processing a delete for this pod
// to make sure we don't miss handling graceful termination for containers we reported as having started.
kl.lastContainerStartedTime.Add(e.ID, time.Now())
}
		// isSyncPodWorthy (sketched after this snippet): as long as the event type is not ContainerRemoved, call HandlePodSyncs to sync the pod.
if isSyncPodWorthy(e) {
// PLEG event for a pod; sync it.
if pod, ok := kl.podManager.GetPodByUID(e.ID); ok {
klog.V(2).InfoS("SyncLoop (PLEG): event for pod", "pod", klog.KObj(pod), "event", e)
handler.HandlePodSyncs([]*v1.Pod{pod})
} else {
// If the pod no longer exists, ignore the event.
klog.V(4).InfoS("SyncLoop (PLEG): pod does not exist, ignore irrelevant event", "event", e)
}
}
		// If the event type is ContainerDied, call the runtime to clean up the dead container.
if e.Type == pleg.ContainerDied {
if containerID, ok := e.Data.(string); ok {
kl.cleanUpContainersInPod(e.ID, containerID)
}
}
case <-syncCh:
...
}
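isSyncPodWorthy, referenced above, is a one-line filter in kubelet.go: every event type except ContainerRemoved triggers a pod sync, because a removed container no longer affects the pod's state:
// isSyncPodWorthy filters out events that are not worthy of pod syncing.
func isSyncPodWorthy(event *pleg.PodLifecycleEvent) bool {
	// ContainerRemoved doesn't affect pod state
	return event.Type != pleg.ContainerRemoved
}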
The subsequent call chain is shown below (see also the flow chart above); the code is not reproduced here:
HandlePodSyncs -> dispatchWork -> podWorkers.UpdatePod -> managePodLoop -> syncPod
Reference: https://github.com/kubernetes/design-proposals-archive/blob/main/node/pod-lifecycle-event-generator.md