整体结构
kubelet 调用下层容器运行时的执行过程,并不会直接调用Docker 的 API,而是通过一组叫作 CRI(Container Runtime Interface,容器运行时接口)的 gRPC 接口来间接执行的。Kubernetes 项目之所以要在 kubelet 中引入这样一层单独的抽象,当然是为了对 Kubernetes 屏蔽下层容器运行时的差异。
CRI工作流程:
[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-DdNBVuR5-1570677301993)(./images/cri_shim.png)]
在node节点创建pod的整个流程如下图:
[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-YWc85HY2-1570677301994)(/Users/baron/Desktop/md文档/images/kubelet_object.png)]
详细流程
[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-mdKGOm4S-1570677301994)(/Users/baron/Desktop/md文档/images/kubelet_create_pod_sequence.png)]
kubelet启动
-
kubelet启动入口,咔咔咔一顿初始化操作,最后的逻辑监听pod动态变化
// Run starts the kubelet reacting to config updates func (kl *Kubelet) Run(updates <-chan kubetypes.PodUpdate) { if kl.logServer == nil { kl.logServer = http.StripPrefix("/logs/", http.FileServer(http.Dir("/var/log/"))) } if kl.kubeClient == nil { klog.Warning("No api server defined - no node status update will be sent.") } // Start the cloud provider sync manager if kl.cloudResourceSyncManager != nil { go kl.cloudResourceSyncManager.Run(wait.NeverStop) } if err := kl.initializeModules(); err != nil { kl.recorder.Eventf(kl.nodeRef, v1.EventTypeWarning, events.KubeletSetupFailed, err.Error()) klog.Fatal(err) } // Start volume manager go kl.volumeManager.Run(kl.sourcesReady, wait.NeverStop) if kl.kubeClient != nil { // Start syncing node status immediately, this may set up things the runtime needs to run. go wait.Until(kl.syncNodeStatus, kl.nodeStatusUpdateFrequency, wait.NeverStop) go kl.fastStatusUpdateOnce() // start syncing lease if utilfeature.DefaultFeatureGate.Enabled(features.NodeLease) { go kl.nodeLeaseController.Run(wait.NeverStop) } } go wait.Until(kl.updateRuntimeUp, 5*time.Second, wait.NeverStop) // Start loop to sync iptables util rules if kl.makeIPTablesUtilChains { go wait.Until(kl.syncNetworkUtil, 1*time.Minute, wait.NeverStop) } // Start a goroutine responsible for killing pods (that are not properly // handled by pod workers). go wait.Until(kl.podKiller, 1*time.Second, wait.NeverStop) // Start component sync loops. kl.statusManager.Start() kl.probeManager.Start() // Start syncing RuntimeClasses if enabled. if kl.runtimeClassManager != nil { kl.runtimeClassManager.Start(wait.NeverStop) } // Start the pod lifecycle event generator. kl.pleg.Start() kl.syncLoop(updates, kl) // 创建pod代码入口 }
-
定时同步node状态
func (kl *Kubelet) syncLoop(updates <-chan kubetypes.PodUpdate, handler SyncHandler) { klog.Info("Starting kubelet main sync loop.") // The syncTicker wakes up kubelet to checks if there are any pod workers // that need to be sync'd. A one-second period is sufficient because the // sync interval is defaulted to 10s. syncTicker := time.NewTicker(time.Second) defer syncTicker.Stop() housekeepingTicker := time.NewTicker(housekeepingPeriod) defer housekeepingTicker.Stop() plegCh := kl.pleg.Watch() const ( base = 100 * time.Millisecond max = 5 * time.Second factor = 2 ) duration := base for { if err := kl.runtimeState.runtimeErrors(); err != nil { klog.Infof("skipping pod synchronization - %v", err) // exponential backoff time.Sleep(duration) duration = time.Duration(math.Min(float64(max), factor*float64(duration))) continue } // reset backoff if we have a success duration = base kl.syncLoopMonitor.Store(kl.clock.Now()) // 处理各种动态更新的程序入口 if !kl.syncLoopIteration(updates, handler, syncTicker.C, housekeepingTicker.C, plegCh) { break } kl.syncLoopMonitor.Store(kl.clock.Now()) } }
-
syncLoopIteration分别处理各种情况
func (kl *Kubelet) syncLoopIteration(configCh <-chan kubetypes.PodUpdate, handler SyncHandler, syncCh <-chan time.Time, housekeepingCh <-chan time.Time, plegCh <-chan *pleg.PodLifecycleEvent) bool { select { case u, open := <-configCh: // Update from a config source; dispatch it to the right handler // callback. if !open { klog.Errorf("Update channel is closed. Exiting the sync loop.") return false } switch u.Op { // 创建pod的分支情况 case kubetypes.ADD: klog.V(2).Infof("SyncLoop (ADD, %q): %q", u.Source, format.Pods(u.Pods)) // After restarting, kubelet will get all existing pods through // ADD as if they are new pods. These pods will then go through the // admission process and *may* be rejected. This can be resolved // once we have checkpointing. handler.HandlePodAdditions(u.Pods) case kubetypes.UPDATE: klog.V(2).Infof("SyncLoop (UPDATE, %q): %q", u.Source, format.PodsWithDeletionTimestamps(u.Pods)) handler.HandlePodUpdates(u.Pods) case kubetypes.REMOVE: klog.V(2).Infof("SyncLoop (REMOVE, %q): %q", u.Source, format.Pods(u.Pods)) handler.HandlePodRemoves(u.Pods) case kubetypes.RECONCILE: klog.V(4).Infof("SyncLoop (RECONCILE, %q): %q", u.Source, format.Pods(u.Pods)) handler.HandlePodReconcile(u.Pods) case kubetypes.DELETE: klog.V(2).Infof("SyncLoop (DELETE, %q): %q", u.Source, format.Pods(u.Pods)) // DELETE is treated as a UPDATE because of graceful deletion. handler.HandlePodUpdates(u.Pods) case kubetypes.RESTORE: klog.V(2).Infof("SyncLoop (RESTORE, %q): %q", u.Source, format.Pods(u.Pods)) // These are pods restored from the checkpoint. Treat them as new // pods. handler.HandlePodAdditions(u.Pods) case kubetypes.SET: // TODO: Do we want to support this? klog.Errorf("Kubelet does not support snapshot update") } if u.Op != kubetypes.RESTORE { // If the update type is RESTORE, it means that the update is from // the pod checkpoints and may be incomplete. Do not mark the // source as ready. // Mark the source ready after receiving at least one update from the // source. Once all the sources are marked ready, various cleanup // routines will start reclaiming resources. It is important that this // takes place only after kubelet calls the update handler to process // the update to ensure the internal pod cache is up-to-date. kl.sourcesReady.AddSource(u.Source) } case e := <-plegCh: if isSyncPodWorthy(e) { // PLEG event for a pod; sync it. if pod, ok := kl.podManager.GetPodByUID(e.ID); ok { klog.V(2).Infof("SyncLoop (PLEG): %q, event: %#v", format.Pod(pod), e) handler.HandlePodSyncs([]*v1.Pod{pod}) } else { // If the pod no longer exists, ignore the event. klog.V(4).Infof("SyncLoop (PLEG): ignore irrelevant event: %#v", e) } } if e.Type == pleg.ContainerDied { if containerID, ok := e.Data.(string); ok { kl.cleanUpContainersInPod(e.ID, containerID) } } case <-syncCh: // Sync pods waiting for sync podsToSync := kl.getPodsToSync() if len(podsToSync) == 0 { break } klog.V(4).Infof("SyncLoop (SYNC): %d pods; %s", len(podsToSync), format.Pods(podsToSync)) handler.HandlePodSyncs(podsToSync) case update := <-kl.livenessManager.Updates(): if update.Result == proberesults.Failure { // The liveness manager detected a failure; sync the pod. // We should not use the pod from livenessManager, because it is never updated after // initialization. pod, ok := kl.podManager.GetPodByUID(update.PodUID) if !ok { // If the pod no longer exists, ignore the update. klog.V(4).Infof("SyncLoop (container unhealthy): ignore irrelevant update: %#v", update) break } klog.V(1).Infof("SyncLoop (container unhealthy): %q", format.Pod(pod)) handler.HandlePodSyncs([]*v1.Pod{pod}) } case <-housekeepingCh: if !kl.sourcesReady.AllReady() { // If the sources aren't ready or volume manager has not yet synced the states, // skip housekeeping, as we may accidentally delete pods from unready sources. klog.V(4).Infof("SyncLoop (housekeeping, skipped): sources aren't ready yet.") } else { klog.V(4).Infof("SyncLoop (housekeeping)") if err := handler.HandlePodCleanups(); err != nil { klog.Errorf("Failed cleaning pods: %v", err) } } } return true }
创建pod
-
HandlePodAdditions处理新建pod事件
func (kl *Kubelet) HandlePodAdditions(pods []*v1.Pod) { start := kl.clock.Now() sort.Sort(sliceutils.PodsByCreationTime(pods)) // Responsible for checking limits in resolv.conf // The limits do not have anything to do with individual pods if kl.dnsConfigurer != nil && kl.dnsConfigurer.ResolverConfig != "" { kl.dnsConfigurer.CheckLimitsForResolvConf() } for _, pod := range pods { existingPods := kl.podManager.GetPods() // Always add the pod to the pod manager. Kubelet relies on the pod // manager as the source of truth for the desired state. If a pod does // not exist in the pod manager, it means that it has been deleted in // the apiserver and no action (other than cleanup) is required. kl.podManager.AddPod(pod) if kubepod.IsMirrorPod(pod) { kl.handleMirrorPod(pod, start) continue } if !kl.podIsTerminated(pod) { // Only go through the admission process if the pod is not // terminated. // We failed pods that we rejected, so activePods include all admitted // pods that are alive. activePods := kl.filterOutTerminatedPods(existingPods) // Check if we can admit the pod; if not, reject it. if ok, reason, message := kl.canAdmitPod(activePods, pod); !ok { kl.rejectPod(pod, reason, message) continue } } mirrorPod, _ := kl.podManager.GetMirrorPodByPod(pod) // 具体的创建pod逻辑 kl.dispatchWork(pod, kubetypes.SyncPodCreate, mirrorPod, start) kl.probeManager.AddPod(pod) } }
-
dispatchwork
func (kl *Kubelet) dispatchWork(pod *v1.Pod, syncType kubetypes.SyncPodType, mirrorPod *v1.Pod, start time.Time) { if kl.podIsTerminated(pod) { if pod.DeletionTimestamp != nil { // If the pod is in a terminated state, there is no pod worker to // handle the work item. Check if the DeletionTimestamp has been // set, and force a status update to trigger a pod deletion request // to the apiserver. kl.statusManager.TerminatePod(pod) } return } // Run the sync in an async worker. // 逻辑跳转处 kl.podWorkers.UpdatePod(&UpdatePodOptions{ Pod: pod, MirrorPod: mirrorPod, UpdateType: syncType, OnCompleteFunc: func(err error) { if err != nil { metrics.PodWorkerDuration.WithLabelValues(syncType.String()).Observe(metrics.SinceInSeconds(start)) metrics.DeprecatedPodWorkerLatency.WithLabelValues(syncType.String()).Observe(metrics.SinceInMicroseconds(start)) } }, }) // Note the number of containers for new pods. if syncType == kubetypes.SyncPodCreate { metrics.ContainersPerPodCount.Observe(float64(len(pod.Spec.Containers))) } }
-
UpdatePod
func (p *podWorkers) UpdatePod(options *UpdatePodOptions) { pod := options.Pod uid := pod.UID var podUpdates chan UpdatePodOptions var exists bool p.podLock.Lock() defer p.podLock.Unlock() if podUpdates, exists = p.podUpdates[uid]; !exists { // We need to have a buffer here, because checkForUpdates() method that // puts an update into channel is called from the same goroutine where // the channel is consumed. However, it is guaranteed that in such case // the channel is empty, so buffer of size 1 is enough. podUpdates = make(chan UpdatePodOptions, 1) p.podUpdates[uid] = podUpdates // Creating a new pod worker either means this is a new pod, or that the // kubelet just restarted. In either case the kubelet is willing to believe // the status of the pod for the first pod worker sync. See corresponding // comment in syncPod. go func() { defer runtime.HandleCrash() // 处理pod更新逻辑 p.managePodLoop(podUpdates) }() } if !p.isWorking[pod.UID] { p.isWorking[pod.UID] = true podUpdates <- *options } else { // if a request to kill a pod is pending, we do not let anything overwrite that request. update, found := p.lastUndeliveredWorkUpdate[pod.UID] if !found || update.UpdateType != kubetypes.SyncPodKill { p.lastUndeliveredWorkUpdate[pod.UID] = *options } } }
-
managePodLoop
func (p *podWorkers) managePodLoop(podUpdates <-chan UpdatePodOptions) { var lastSyncTime time.Time for update := range podUpdates { err := func() error { podUID := update.Pod.UID // This is a blocking call that would return only if the cache // has an entry for the pod that is newer than minRuntimeCache // Time. This ensures the worker doesn't start syncing until // after the cache is at least newer than the finished time of // the previous sync. status, err := p.podCache.GetNewerThan(podUID, lastSyncTime) if err != nil { // This is the legacy event thrown by manage pod loop // all other events are now dispatched from syncPodFn p.recorder.Eventf(update.Pod, v1.EventTypeWarning, events.FailedSync, "error determining status: %v", err) return err } // syncPodFn函数负责具体的处理逻辑 err = p.syncPodFn(syncPodOptions{ mirrorPod: update.MirrorPod, pod: update.Pod, podStatus: status, killPodOptions: update.KillPodOptions, updateType: update.UpdateType, }) lastSyncTime = time.Now() return err }() // notify the call-back function if the operation succeeded or not if update.OnCompleteFunc != nil { update.OnCompleteFunc(err) } if err != nil { // IMPORTANT: we do not log errors here, the syncPodFn is responsible for logging errors klog.Errorf("Error syncing pod %s (%q), skipping: %v", update.Pod.UID, format.Pod(update.Pod), err) } p.wrapUp(update.Pod.UID, err) } }
与cri等交互
-
kubelet.syncPod
func (kl *Kubelet) syncPod(o syncPodOptions) error { // pull out the required options pod := o.pod mirrorPod := o.mirrorPod podStatus := o.podStatus updateType := o.updateType // if we want to kill a pod, do it now! if updateType == kubetypes.SyncPodKill { killPodOptions := o.killPodOptions if killPodOptions == nil || killPodOptions.PodStatusFunc == nil { return fmt.Errorf("kill pod options are required if update type is kill") } apiPodStatus := killPodOptions.PodStatusFunc(pod, podStatus) kl.statusManager.SetPodStatus(pod, apiPodStatus) // we kill the pod with the specified grace period since this is a termination if err := kl.killPod(pod, nil, podStatus, killPodOptions.PodTerminationGracePeriodSecondsOverride); err != nil { kl.recorder.Eventf(pod, v1.EventTypeWarning, events.FailedToKillPod, "error killing pod: %v", err) // there was an error killing the pod, so we return that error directly utilruntime.HandleError(err) return err } return nil } // Latency measurements for the main workflow are relative to the // first time the pod was seen by the API server. var firstSeenTime time.Time if firstSeenTimeStr, ok := pod.Annotations[kubetypes.ConfigFirstSeenAnnotationKey]; ok { firstSeenTime = kubetypes.ConvertToTimestamp(firstSeenTimeStr).Get() } // Record pod worker start latency if being created // TODO: make pod workers record their own latencies if updateType == kubetypes.SyncPodCreate { if !firstSeenTime.IsZero() { // This is the first time we are syncing the pod. Record the latency // since kubelet first saw the pod if firstSeenTime is set. metrics.PodWorkerStartDuration.Observe(metrics.SinceInSeconds(firstSeenTime)) metrics.DeprecatedPodWorkerStartLatency.Observe(metrics.SinceInMicroseconds(firstSeenTime)) } else { klog.V(3).Infof("First seen time not recorded for pod %q", pod.UID) } } // Generate final API pod status with pod and status manager status apiPodStatus := kl.generateAPIPodStatus(pod, podStatus) // The pod IP may be changed in generateAPIPodStatus if the pod is using host network. (See #24576) // TODO(random-liu): After writing pod spec into container labels, check whether pod is using host network, and // set pod IP to hostIP directly in runtime.GetPodStatus podStatus.IPs = make([]string, 0, len(apiPodStatus.PodIPs)) for _, ipInfo := range apiPodStatus.PodIPs { podStatus.IPs = append(podStatus.IPs, ipInfo.IP) } if len(podStatus.IPs) == 0 && len(apiPodStatus.PodIP) > 0 { podStatus.IPs = []string{apiPodStatus.PodIP} } // Record the time it takes for the pod to become running. existingStatus, ok := kl.statusManager.GetPodStatus(pod.UID) if !ok || existingStatus.Phase == v1.PodPending && apiPodStatus.Phase == v1.PodRunning && !firstSeenTime.IsZero() { metrics.PodStartDuration.Observe(metrics.SinceInSeconds(firstSeenTime)) metrics.DeprecatedPodStartLatency.Observe(metrics.SinceInMicroseconds(firstSeenTime)) } runnable := kl.canRunPod(pod) if !runnable.Admit { // Pod is not runnable; update the Pod and Container statuses to why. apiPodStatus.Reason = runnable.Reason apiPodStatus.Message = runnable.Message // Waiting containers are not creating. const waitingReason = "Blocked" for _, cs := range apiPodStatus.InitContainerStatuses { if cs.State.Waiting != nil { cs.State.Waiting.Reason = waitingReason } } for _, cs := range apiPodStatus.ContainerStatuses { if cs.State.Waiting != nil { cs.State.Waiting.Reason = waitingReason } } } // Update status in the status manager kl.statusManager.SetPodStatus(pod, apiPodStatus) // Kill pod if it should not be running if !runnable.Admit || pod.DeletionTimestamp != nil || apiPodStatus.Phase == v1.PodFailed { var syncErr error if err := kl.killPod(pod, nil, podStatus, nil); err != nil { kl.recorder.Eventf(pod, v1.EventTypeWarning, events.FailedToKillPod, "error killing pod: %v", err) syncErr = fmt.Errorf("error killing pod: %v", err) utilruntime.HandleError(syncErr) } else { if !runnable.Admit { // There was no error killing the pod, but the pod cannot be run. // Return an error to signal that the sync loop should back off. syncErr = fmt.Errorf("pod cannot be run: %s", runnable.Message) } } return syncErr } // If the network plugin is not ready, only start the pod if it uses the host network if err := kl.runtimeState.networkErrors(); err != nil && !kubecontainer.IsHostNetworkPod(pod) { kl.recorder.Eventf(pod, v1.EventTypeWarning, events.NetworkNotReady, "%s: %v", NetworkNotReadyErrorMsg, err) return fmt.Errorf("%s: %v", NetworkNotReadyErrorMsg, err) } // Create Cgroups for the pod and apply resource parameters // to them if cgroups-per-qos flag is enabled. pcm := kl.containerManager.NewPodContainerManager() // If pod has already been terminated then we need not create // or update the pod's cgroup if !kl.podIsTerminated(pod) { // When the kubelet is restarted with the cgroups-per-qos // flag enabled, all the pod's running containers // should be killed intermittently and brought back up // under the qos cgroup hierarchy. // Check if this is the pod's first sync firstSync := true for _, containerStatus := range apiPodStatus.ContainerStatuses { if containerStatus.State.Running != nil { firstSync = false break } } // Don't kill containers in pod if pod's cgroups already // exists or the pod is running for the first time podKilled := false if !pcm.Exists(pod) && !firstSync { if err := kl.killPod(pod, nil, podStatus, nil); err == nil { podKilled = true } } // Create and Update pod's Cgroups // Don't create cgroups for run once pod if it was killed above // The current policy is not to restart the run once pods when // the kubelet is restarted with the new flag as run once pods are // expected to run only once and if the kubelet is restarted then // they are not expected to run again. // We don't create and apply updates to cgroup if its a run once pod and was killed above if !(podKilled && pod.Spec.RestartPolicy == v1.RestartPolicyNever) { if !pcm.Exists(pod) { if err := kl.containerManager.UpdateQOSCgroups(); err != nil { klog.V(2).Infof("Failed to update QoS cgroups while syncing pod: %v", err) } if err := pcm.EnsureExists(pod); err != nil { kl.recorder.Eventf(pod, v1.EventTypeWarning, events.FailedToCreatePodContainer, "unable to ensure pod container exists: %v", err) return fmt.Errorf("failed to ensure that the pod: %v cgroups exist and are correctly applied: %v", pod.UID, err) } } } } // Create Mirror Pod for Static Pod if it doesn't already exist if kubepod.IsStaticPod(pod) { podFullName := kubecontainer.GetPodFullName(pod) deleted := false if mirrorPod != nil { if mirrorPod.DeletionTimestamp != nil || !kl.podManager.IsMirrorPodOf(mirrorPod, pod) { // The mirror pod is semantically different from the static pod. Remove // it. The mirror pod will get recreated later. klog.Infof("Trying to delete pod %s %v", podFullName, mirrorPod.ObjectMeta.UID) var err error deleted, err = kl.podManager.DeleteMirrorPod(podFullName, &mirrorPod.ObjectMeta.UID) if deleted { klog.Warningf("Deleted mirror pod %q because it is outdated", format.Pod(mirrorPod)) } else if err != nil { klog.Errorf("Failed deleting mirror pod %q: %v", format.Pod(mirrorPod), err) } } } if mirrorPod == nil || deleted { node, err := kl.GetNode() if err != nil || node.DeletionTimestamp != nil { klog.V(4).Infof("No need to create a mirror pod, since node %q has been removed from the cluster", kl.nodeName) } else { klog.V(4).Infof("Creating a mirror pod for static pod %q", format.Pod(pod)) if err := kl.podManager.CreateMirrorPod(pod); err != nil { klog.Errorf("Failed creating a mirror pod for %q: %v", format.Pod(pod), err) } } } } // Make data directories for the pod if err := kl.makePodDataDirs(pod); err != nil { kl.recorder.Eventf(pod, v1.EventTypeWarning, events.FailedToMakePodDataDirectories, "error making pod data directories: %v", err) klog.Errorf("Unable to make pod data directories for pod %q: %v", format.Pod(pod), err) return err } // Volume manager will not mount volumes for terminated pods if !kl.podIsTerminated(pod) { // Wait for volumes to attach/mount if err := kl.volumeManager.WaitForAttachAndMount(pod); err != nil { kl.recorder.Eventf(pod, v1.EventTypeWarning, events.FailedMountVolume, "Unable to mount volumes for pod %q: %v", format.Pod(pod), err) klog.Errorf("Unable to mount volumes for pod %q: %v; skipping pod", format.Pod(pod), err) return err } } // Fetch the pull secrets for the pod pullSecrets := kl.getPullSecretsForPod(pod) // Call the container runtime's SyncPod callback // 和创建相关的逻辑在这里,依赖上了容器运行时 result := kl.containerRuntime.SyncPod(pod, podStatus, pullSecrets, kl.backOff) kl.reasonCache.Update(pod.UID, result) if err := result.Error(); err != nil { // Do not return error if the only failures were pods in backoff for _, r := range result.SyncResults { if r.Error != kubecontainer.ErrCrashLoopBackOff && r.Error != images.ErrImagePullBackOff { // Do not record an event here, as we keep all event logging for sync pod failures // local to container runtime so we get better errors return err } } return nil } return nil }
-
containerRuntime.SyncPod
func (m *kubeGenericRuntimeManager) SyncPod(pod *v1.Pod, podStatus *kubecontainer.PodStatus, pullSecrets []v1.Secret, backOff *flowcontrol.Backoff) (result kubecontainer.PodSyncResult) { // Step 1: Compute sandbox and container changes. podContainerChanges := m.computePodActions(pod, podStatus) klog.V(3).Infof("computePodActions got %+v for pod %q", podContainerChanges, format.Pod(pod)) if podContainerChanges.CreateSandbox { ref, err := ref.GetReference(legacyscheme.Scheme, pod) if err != nil { klog.Errorf("Couldn't make a ref to pod %q: '%v'", format.Pod(pod), err) } if podContainerChanges.SandboxID != "" { m.recorder.Eventf(ref, v1.EventTypeNormal, events.SandboxChanged, "Pod sandbox changed, it will be killed and re-created.") } else { klog.V(4).Infof("SyncPod received new pod %q, will create a sandbox for it", format.Pod(pod)) } } // Step 2: Kill the pod if the sandbox has changed. if podContainerChanges.KillPod { if podContainerChanges.CreateSandbox { klog.V(4).Infof("Stopping PodSandbox for %q, will start new one", format.Pod(pod)) } else { klog.V(4).Infof("Stopping PodSandbox for %q because all other containers are dead.", format.Pod(pod)) } killResult := m.killPodWithSyncResult(pod, kubecontainer.ConvertPodStatusToRunningPod(m.runtimeName, podStatus), nil) result.AddPodSyncResult(killResult) if killResult.Error() != nil { klog.Errorf("killPodWithSyncResult failed: %v", killResult.Error()) return } if podContainerChanges.CreateSandbox { m.purgeInitContainers(pod, podStatus) } } else { // Step 3: kill any running containers in this pod which are not to keep. for containerID, containerInfo := range podContainerChanges.ContainersToKill { klog.V(3).Infof("Killing unwanted container %q(id=%q) for pod %q", containerInfo.name, containerID, format.Pod(pod)) killContainerResult := kubecontainer.NewSyncResult(kubecontainer.KillContainer, containerInfo.name) result.AddSyncResult(killContainerResult) if err := m.killContainer(pod, containerID, containerInfo.name, containerInfo.message, nil); err != nil { killContainerResult.Fail(kubecontainer.ErrKillContainer, err.Error()) klog.Errorf("killContainer %q(id=%q) for pod %q failed: %v", containerInfo.name, containerID, format.Pod(pod), err) return } } } // Keep terminated init containers fairly aggressively controlled // This is an optimization because container removals are typically handled // by container garbage collector. m.pruneInitContainersBeforeStart(pod, podStatus) // We pass the value of the PRIMARY podIP down to generatePodSandboxConfig and // generateContainerConfig, which in turn passes it to various other // functions, in order to facilitate functionality that requires this // value (hosts file and downward API) and avoid races determining // the pod IP in cases where a container requires restart but the // podIP isn't in the status manager yet. // // We default to the IPs in the passed-in pod status, and overwrite them if the // sandbox needs to be (re)started. var podIPs []string if podStatus != nil { podIPs = podStatus.IPs } // Step 4: Create a sandbox for the pod if necessary. podSandboxID := podContainerChanges.SandboxID if podContainerChanges.CreateSandbox { var msg string var err error klog.V(4).Infof("Creating sandbox for pod %q", format.Pod(pod)) createSandboxResult := kubecontainer.NewSyncResult(kubecontainer.CreatePodSandbox, format.Pod(pod)) result.AddSyncResult(createSandboxResult) // 创建网络容器pause podSandboxID, msg, err = m.createPodSandbox(pod, podContainerChanges.Attempt) if err != nil { createSandboxResult.Fail(kubecontainer.ErrCreatePodSandbox, msg) klog.Errorf("createPodSandbox for pod %q failed: %v", format.Pod(pod), err) ref, referr := ref.GetReference(legacyscheme.Scheme, pod) if referr != nil { klog.Errorf("Couldn't make a ref to pod %q: '%v'", format.Pod(pod), referr) } m.recorder.Eventf(ref, v1.EventTypeWarning, events.FailedCreatePodSandBox, "Failed create pod sandbox: %v", err) return } klog.V(4).Infof("Created PodSandbox %q for pod %q", podSandboxID, format.Pod(pod)) podSandboxStatus, err := m.runtimeService.PodSandboxStatus(podSandboxID) if err != nil { ref, referr := ref.GetReference(legacyscheme.Scheme, pod) if referr != nil { klog.Errorf("Couldn't make a ref to pod %q: '%v'", format.Pod(pod), referr) } m.recorder.Eventf(ref, v1.EventTypeWarning, events.FailedStatusPodSandBox, "Unable to get pod sandbox status: %v", err) klog.Errorf("Failed to get pod sandbox status: %v; Skipping pod %q", err, format.Pod(pod)) result.Fail(err) return } // If we ever allow updating a pod from non-host-network to // host-network, we may use a stale IP. if !kubecontainer.IsHostNetworkPod(pod) { // Overwrite the podIPs passed in the pod status, since we just started the pod sandbox. podIPs = m.determinePodSandboxIPs(pod.Namespace, pod.Name, podSandboxStatus) klog.V(4).Infof("Determined the ip %v for pod %q after sandbox changed", podIPs, format.Pod(pod)) } } // the start containers routines depend on pod ip(as in primary pod ip) // instead of trying to figure out if we have 0 < len(podIPs) // everytime, we short circuit it here podIP := "" if len(podIPs) != 0 { podIP = podIPs[0] } // Get podSandboxConfig for containers to start. configPodSandboxResult := kubecontainer.NewSyncResult(kubecontainer.ConfigPodSandbox, podSandboxID) result.AddSyncResult(configPodSandboxResult) podSandboxConfig, err := m.generatePodSandboxConfig(pod, podContainerChanges.Attempt) if err != nil { message := fmt.Sprintf("GeneratePodSandboxConfig for pod %q failed: %v", format.Pod(pod), err) klog.Error(message) configPodSandboxResult.Fail(kubecontainer.ErrConfigPodSandbox, message) return } // Step 5: start the init container. if container := podContainerChanges.NextInitContainerToStart; container != nil { // Start the next init container. startContainerResult := kubecontainer.NewSyncResult(kubecontainer.StartContainer, container.Name) result.AddSyncResult(startContainerResult) isInBackOff, msg, err := m.doBackOff(pod, container, podStatus, backOff) if isInBackOff { startContainerResult.Fail(err, msg) klog.V(4).Infof("Backing Off restarting init container %+v in pod %v", container, format.Pod(pod)) return } klog.V(4).Infof("Creating init container %+v in pod %v", container, format.Pod(pod)) if msg, err := m.startContainer(podSandboxID, podSandboxConfig, container, pod, podStatus, pullSecrets, podIP); err != nil { startContainerResult.Fail(err, msg) utilruntime.HandleError(fmt.Errorf("init container start failed: %v: %s", err, msg)) return } // Successfully started the container; clear the entry in the failure klog.V(4).Infof("Completed init container %q for pod %q", container.Name, format.Pod(pod)) } // Step 6: start containers in podContainerChanges.ContainersToStart. for _, idx := range podContainerChanges.ContainersToStart { container := &pod.Spec.Containers[idx] startContainerResult := kubecontainer.NewSyncResult(kubecontainer.StartContainer, container.Name) result.AddSyncResult(startContainerResult) isInBackOff, msg, err := m.doBackOff(pod, container, podStatus, backOff) if isInBackOff { startContainerResult.Fail(err, msg) klog.V(4).Infof("Backing Off restarting container %+v in pod %v", container, format.Pod(pod)) continue } klog.V(4).Infof("Creating container %+v in pod %v", container, format.Pod(pod)) // 创建业务容器 if msg, err := m.startContainer(podSandboxID, podSandboxConfig, container, pod, podStatus, pullSecrets, podIP); err != nil { startContainerResult.Fail(err, msg) // known errors that are logged in other places are logged at higher levels here to avoid // repetitive log spam switch { case err == images.ErrImagePullBackOff: klog.V(3).Infof("container start failed: %v: %s", err, msg) default: utilruntime.HandleError(fmt.Errorf("container start failed: %v: %s", err, msg)) } continue } } return }
cri的实现之一dockershim
[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-IrpYphvT-1570677301995)(/Users/baron/Desktop/md文档/images/dockershim_sequence.png)]
-
接口
// Runtime service defines the public APIs for remote container runtimes service RuntimeService { // Version returns the runtime name, runtime version, and runtime API version. rpc Version(VersionRequest) returns (VersionResponse) {} // RunPodSandbox creates and starts a pod-level sandbox. Runtimes must ensure // the sandbox is in the ready state on success. rpc RunPodSandbox(RunPodSandboxRequest) returns (RunPodSandboxResponse) {} // StopPodSandbox stops any running process that is part of the sandbox and // reclaims network resources (e.g., IP addresses) allocated to the sandbox. // If there are any running containers in the sandbox, they must be forcibly // terminated. // This call is idempotent, and must not return an error if all relevant // resources have already been reclaimed. kubelet will call StopPodSandbox // at least once before calling RemovePodSandbox. It will also attempt to // reclaim resources eagerly, as soon as a sandbox is not needed. Hence, // multiple StopPodSandbox calls are expected. rpc StopPodSandbox(StopPodSandboxRequest) returns (StopPodSandboxResponse) {} // RemovePodSandbox removes the sandbox. If there are any running containers // in the sandbox, they must be forcibly terminated and removed. // This call is idempotent, and must not return an error if the sandbox has // already been removed. rpc RemovePodSandbox(RemovePodSandboxRequest) returns (RemovePodSandboxResponse) {} // PodSandboxStatus returns the status of the PodSandbox. If the PodSandbox is not // present, returns an error. rpc PodSandboxStatus(PodSandboxStatusRequest) returns (PodSandboxStatusResponse) {} // ListPodSandbox returns a list of PodSandboxes. rpc ListPodSandbox(ListPodSandboxRequest) returns (ListPodSandboxResponse) {} // CreateContainer creates a new container in specified PodSandbox rpc CreateContainer(CreateContainerRequest) returns (CreateContainerResponse) {} // StartContainer starts the container. rpc StartContainer(StartContainerRequest) returns (StartContainerResponse) {} // StopContainer stops a running container with a grace period (i.e., timeout). // This call is idempotent, and must not return an error if the container has // already been stopped. // TODO: what must the runtime do after the grace period is reached? rpc StopContainer(StopContainerRequest) returns (StopContainerResponse) {} // RemoveContainer removes the container. If the container is running, the // container must be forcibly removed. // This call is idempotent, and must not return an error if the container has // already been removed. rpc RemoveContainer(RemoveContainerRequest) returns (RemoveContainerResponse) {} // ListContainers lists all containers by filters. rpc ListContainers(ListContainersRequest) returns (ListContainersResponse) {} // ContainerStatus returns status of the container. If the container is not // present, returns an error. rpc ContainerStatus(ContainerStatusRequest) returns (ContainerStatusResponse) {} // UpdateContainerResources updates ContainerConfig of the container. rpc UpdateContainerResources(UpdateContainerResourcesRequest) returns (UpdateContainerResourcesResponse) {} // ReopenContainerLog asks runtime to reopen the stdout/stderr log file // for the container. This is often called after the log file has been // rotated. If the container is not running, container runtime can choose // to either create a new log file and return nil, or return an error. // Once it returns error, new container log file MUST NOT be created. rpc ReopenContainerLog(ReopenContainerLogRequest) returns (ReopenContainerLogResponse) {} // ExecSync runs a command in a container synchronously. rpc ExecSync(ExecSyncRequest) returns (ExecSyncResponse) {} // Exec prepares a streaming endpoint to execute a command in the container. rpc Exec(ExecRequest) returns (ExecResponse) {} // Attach prepares a streaming endpoint to attach to a running container. rpc Attach(AttachRequest) returns (AttachResponse) {} // PortForward prepares a streaming endpoint to forward ports from a PodSandbox. rpc PortForward(PortForwardRequest) returns (PortForwardResponse) {} // ContainerStats returns stats of the container. If the container does not // exist, the call returns an error. rpc ContainerStats(ContainerStatsRequest) returns (ContainerStatsResponse) {} // ListContainerStats returns stats of all running containers. rpc ListContainerStats(ListContainerStatsRequest) returns (ListContainerStatsResponse) {} // UpdateRuntimeConfig updates the runtime configuration based on the given request. rpc UpdateRuntimeConfig(UpdateRuntimeConfigRequest) returns (UpdateRuntimeConfigResponse) {} // Status returns the status of the runtime. rpc Status(StatusRequest) returns (StatusResponse) {} } // ImageService defines the public APIs for managing images. service ImageService { // ListImages lists existing images. rpc ListImages(ListImagesRequest) returns (ListImagesResponse) {} // ImageStatus returns the status of the image. If the image is not // present, returns a response with ImageStatusResponse.Image set to // nil. rpc ImageStatus(ImageStatusRequest) returns (ImageStatusResponse) {} // PullImage pulls an image with authentication config. rpc PullImage(PullImageRequest) returns (PullImageResponse) {} // RemoveImage removes the image. // This call is idempotent, and must not return an error if the image has // already been removed. rpc RemoveImage(RemoveImageRequest) returns (RemoveImageResponse) {} // ImageFSInfo returns information of the filesystem that is used to store images. rpc ImageFsInfo(ImageFsInfoRequest) returns (ImageFsInfoResponse) {} }
-
服务实现
// DockerServer is the grpc server of dockershim. type DockerServer struct { // endpoint is the endpoint to serve on. endpoint string // service is the docker service which implements runtime and image services. service dockershim.CRIService // server is the grpc server. server *grpc.Server } // NewDockerServer creates the dockershim grpc server. func NewDockerServer(endpoint string, s dockershim.CRIService) *DockerServer { return &DockerServer{ endpoint: endpoint, service: s, } } // Start starts the dockershim grpc server. func (s *DockerServer) Start() error { // Start the internal service. if err := s.service.Start(); err != nil { klog.Errorf("Unable to start docker service") return err } klog.V(2).Infof("Start dockershim grpc server") l, err := util.CreateListener(s.endpoint) if err != nil { return fmt.Errorf("failed to listen on %q: %v", s.endpoint, err) } // Create the grpc server and register runtime and image services. s.server = grpc.NewServer( grpc.MaxRecvMsgSize(maxMsgSize), grpc.MaxSendMsgSize(maxMsgSize), ) runtimeapi.RegisterRuntimeServiceServer(s.server, s.service) runtimeapi.RegisterImageServiceServer(s.server, s.service) go func() { if err := s.server.Serve(l); err != nil { klog.Fatalf("Failed to serve connections: %v", err) } }() return nil }
-
客户端kubeDockerClient
type kubeDockerClient struct { // timeout is the timeout of short running docker operations. timeout time.Duration // If no pulling progress is made before imagePullProgressDeadline, the image pulling will be cancelled. // Docker reports image progress for every 512kB block, so normally there shouldn't be too long interval // between progress updates. imagePullProgressDeadline time.Duration client *dockerapi.Client }
创建容器过程使用cni插件
[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-XhIwXq3W-1570677301995)(/Users/baron/Desktop/md文档/images/dockershim_object.png)]
-
dockerService里的网络处理逻辑
type dockerService struct { client libdocker.Interface os kubecontainer.OSInterface podSandboxImage string streamingRuntime *streamingRuntime streamingServer streaming.Server // 网络处理逻辑入口 network *network.PluginManager // Map of podSandboxID :: network-is-ready networkReady map[string]bool networkReadyLock sync.Mutex containerManager cm.ContainerManager // cgroup driver used by Docker runtime. cgroupDriver string checkpointManager checkpointmanager.CheckpointManager // caches the version of the runtime. // To be compatible with multiple docker versions, we need to perform // version checking for some operations. Use this cache to avoid querying // the docker daemon every time we need to do such checks. versionCache *cache.ObjectCache // startLocalStreamingServer indicates whether dockershim should start a // streaming server on localhost. startLocalStreamingServer bool // containerCleanupInfos maps container IDs to the `containerCleanupInfo` structs // needed to clean up after containers have been started or removed. // (see `applyPlatformSpecificDockerConfig` and `performPlatformSpecificContainerCleanup` // methods for more info). containerCleanupInfos map[string]*containerCleanupInfo }
-
network.PluginManager
type NetworkPlugin interface { // Init initializes the plugin. This will be called exactly once // before any other methods are called. Init(host Host, hairpinMode kubeletconfig.HairpinMode, nonMasqueradeCIDR string, mtu int) error // Called on various events like: // NET_PLUGIN_EVENT_POD_CIDR_CHANGE Event(name string, details map[string]interface{}) // Name returns the plugin's name. This will be used when searching // for a plugin by name, e.g. Name() string // Returns a set of NET_PLUGIN_CAPABILITY_* Capabilities() utilsets.Int // SetUpPod is the method called after the infra container of // the pod has been created but before the other containers of the // pod are launched. SetUpPod(namespace string, name string, podSandboxID kubecontainer.ContainerID, annotations, options map[string]string) error // TearDownPod is the method called before a pod's infra container will be deleted TearDownPod(namespace string, name string, podSandboxID kubecontainer.ContainerID) error // GetPodNetworkStatus is the method called to obtain the ipv4 or ipv6 addresses of the container GetPodNetworkStatus(namespace string, name string, podSandboxID kubecontainer.ContainerID) (*PodNetworkStatus, error) // Status returns error if the network plugin is in error state Status() error }
-
cniNetworkPlugin实现了上述接口
type cniNetworkPlugin struct { network.NoopNetworkPlugin loNetwork *cniNetwork sync.RWMutex defaultNetwork *cniNetwork host network.Host execer utilexec.Interface nsenterPath string confDir string binDirs []string cacheDir string podCidr string }
-
cni接口
type CNI interface { AddNetworkList(ctx context.Context, net *NetworkConfigList, rt *RuntimeConf) (types.Result, error) CheckNetworkList(ctx context.Context, net *NetworkConfigList, rt *RuntimeConf) error DelNetworkList(ctx context.Context, net *NetworkConfigList, rt *RuntimeConf) error GetNetworkListCachedResult(net *NetworkConfigList, rt *RuntimeConf) (types.Result, error) AddNetwork(ctx context.Context, net *NetworkConfig, rt *RuntimeConf) (types.Result, error) CheckNetwork(ctx context.Context, net *NetworkConfig, rt *RuntimeConf) error DelNetwork(ctx context.Context, net *NetworkConfig, rt *RuntimeConf) error GetNetworkCachedResult(net *NetworkConfig, rt *RuntimeConf) (types.Result, error) ValidateNetworkList(ctx context.Context, net *NetworkConfigList) ([]string, error) ValidateNetwork(ctx context.Context, net *NetworkConfig) ([]string, error) }
-
实现了cni接口的插件,如flannel、calico等