【博客497】k8s cgroup原理完整剖析

最新推荐文章于 2024-09-13 16:50:19 发布

lulu的云原生笔记

最新推荐文章于 2024-09-13 16:50:19 发布

阅读量1.9k

点赞数 3

分类专栏： k8s 文章标签： kubernetes docker 容器

本文链接：https://blog.csdn.net/qq_43684922/article/details/126902667

版权

k8s 专栏收录该内容

148 篇文章 19 订阅

订阅专栏

k8s cgroup原理

k8s cgroup设计层级：

在这里插入图片描述

k8s完整的cgroup hierarchy

root
 | 
 +- kube-reserved
 |   |
 |   +- kubelet (kubelet process)
 |   | 
 |   +- runtime (docker-engine, containerd...)
 |
 +- system-reserved (systemd process: logind...)
 |
 +- kubepods
 |    |
 |    +- Pod1
 |    |   |
 |    |   +- Container11 (limit: cpu: 10m, memory: 1Gi)
 |    |   |     |
 |    |   |     +- cpu.quota: 10m
 |    |   |     +- cpu.share: 10m
 |    |   |     +- mem.limit: 1Gi
 |    |   |
 |    |   +- Container12 (limit: cpu: 100m, memory: 2Gi)
 |    |   |     |
 |    |   |     +- cpu.quota: 100m
 |    |   |     +- cpu.share: 100m
 |    |   |     +- mem.limit: 2Gi
 |    |   |
 |    |   +- cpu.quota: 110m  
 |    |   +- cpu.share: 110m
 |    |   +- mem.limit: 3Gi
 |    |
 |    +- Pod2
 |    |   +- Container21 (limit: cpu: 20m, memory: 2Gi)
 |    |   |     |
 |    |   |     +- cpu.quota: 20m
 |    |   |     +- cpu.share: 20m
 |    |   |     +- mem.limit: 2Gi
 |    |   |
 |    |   +- cpu.quota: 20m  
 |    |   +- cpu.share: 20m
 |    |   +- mem.limit: 2Gi
 |    |
 |    +- burstable
 |    |   |
 |    |   +- Pod3
 |    |   |   |
 |    |   |   +- Container31 (limit: cpu: 50m, memory: 2Gi; request: cpu: 20m, memory: 1Gi )
 |    |   |   |     |
 |    |   |   |     +- cpu.quota: 50m
 |    |   |   |     +- cpu.share: 20m
 |    |   |   |     +- mem.limit: 2Gi
 |    |   |   |
 |    |   |   +- Container32 (limit: cpu: 100m, memory: 1Gi)
 |    |   |   |     |
 |    |   |   |     +- cpu.quota: 100m
 |    |   |   |     +- cpu.share: 100m
 |    |   |   |     +- mem.limit: 1Gi
 |    |   |   |
 |    |   |   +- cpu.quota: 150m  
 |    |   |   +- cpu.share: 120m
 |    |   |   +- mem.limit: 3Gi
 |    |   |
 |    |   +- Pod4
 |    |   |   +- Container41 (limit: cpu: 20m, memory: 2Gi; request: cpu: 10m, memory: 1Gi )
 |    |   |   |     |
 |    |   |   |     +- cpu.quota: 20m
 |    |   |   |     +- cpu.share: 10m
 |    |   |   |     +- mem.limit: 2Gi
 |    |   |   |
 |    |   |   +- cpu.quota: 20m  
 |    |   |   +- cpu.share: 10m
 |    |   |   +- mem.limit: 2Gi
 |    |   |
 |    |   +- cpu.share: 130m
 |    |   +- mem.limit: $(Allocatable - 5Gi)
 |    |
 |    +- besteffort
 |    |   |
 |    |   +- Pod5
 |    |   |   |
 |    |   |   +- Container6 
 |    |   |   +- Container7
 |    |   |
 |    |   +- cpu.share: 2
 |    |   +- mem.limit: $(Allocatable - 7Gi)

在这里插入图片描述

k8s kubelet如何为pod设置CgroupParent

// generatePodSandboxConfig assembles the CRI PodSandboxConfig for a v1.Pod.
//
// The config is filled in this order: metadata/labels/annotations, DNS, hostname (skipped for
// host-network pods), log directory, port mappings of all containers, the Linux section (which
// carries the pod's cgroup parent), the Windows section (only when running on Windows), and
// finally the sandbox-level resources. The first error encountered is returned with a nil config.
func (m *kubeGenericRuntimeManager) generatePodSandboxConfig(pod *v1.Pod, attempt uint32) (*runtimeapi.PodSandboxConfig, error) {
    // TODO: deprecating podsandbox resource requirements in favor of the pod level cgroup
    // Refer https://github.com/kubernetes/kubernetes/issues/29871
    cfg := &runtimeapi.PodSandboxConfig{
        Metadata: &runtimeapi.PodSandboxMetadata{
            Name:      pod.Name,
            Namespace: pod.Namespace,
            Uid:       string(pod.UID),
            Attempt:   attempt,
        },
        Labels:      newPodLabels(pod),
        Annotations: newPodAnnotations(pod),
    }

    dns, err := m.runtimeHelper.GetPodDNS(pod)
    if err != nil {
        return nil, err
    }
    cfg.DnsConfig = dns

    // Host-network pods keep the node's hostname, so only compute one for the others.
    if !kubecontainer.IsHostNetworkPod(pod) {
        // TODO: Add domain support in new runtime interface
        hostname, domain, err := m.runtimeHelper.GeneratePodHostNameAndDomain(pod)
        if err != nil {
            return nil, err
        }
        if cfg.Hostname, err = util.GetNodenameForKernel(hostname, domain, pod.Spec.SetHostnameAsFQDN); err != nil {
            return nil, err
        }
    }

    cfg.LogDirectory = BuildPodLogsDirectory(pod.Namespace, pod.Name, pod.UID)

    // Collect the port mappings of every container; the field stays unset when there are none.
    var mappings []*runtimeapi.PortMapping
    for _, c := range pod.Spec.Containers {
        for _, pm := range kubecontainer.MakePortMappings(&c) {
            mappings = append(mappings, &runtimeapi.PortMapping{
                HostIp:        pm.HostIP,
                HostPort:      int32(pm.HostPort),
                ContainerPort: int32(pm.ContainerPort),
                Protocol:      toRuntimeProtocol(pm.Protocol),
            })
        }
    }
    if len(mappings) > 0 {
        cfg.PortMappings = mappings
    }

    // The Linux section is generated on every OS; it is where the cgroup parent is set.
    if cfg.Linux, err = m.generatePodSandboxLinuxConfig(pod); err != nil {
        return nil, err
    }

    if runtime.GOOS == "windows" {
        if cfg.Windows, err = m.generatePodSandboxWindowsConfig(pod); err != nil {
            return nil, err
        }
    }

    // Update config to include overhead, sandbox level resources
    if err = m.applySandboxResources(pod, cfg); err != nil {
        return nil, err
    }
    return cfg, nil
}

// generatePodSandboxLinuxConfig builds the LinuxPodSandboxConfig for a v1.Pod.
//
// It has to be called irrespective of the underlying OS because the security context is
// currently part of LinuxPodSandboxConfig rather than of PodSandboxConfig. The sandbox's
// CgroupParent comes from runtimeHelper.GetPodCgroupParent, i.e. the cgroup chosen on the
// Kubernetes side. Returns an error only when the pod's namespace options cannot be determined.
func (m *kubeGenericRuntimeManager) generatePodSandboxLinuxConfig(pod *v1.Pod) (*runtimeapi.LinuxPodSandboxConfig, error) {
    // Fetch the cgroup parent designed at the Kubernetes layer for this pod.
    parent := m.runtimeHelper.GetPodCgroupParent(pod)
    secCtx := &runtimeapi.LinuxSandboxSecurityContext{
        Privileged: kubecontainer.HasPrivilegedContainer(pod),

        // TODO: Deprecated, remove after we switch to Seccomp field
        // Forcing sandbox to run as `runtime/default` allow users to
        // use least privileged seccomp profiles at pod level. Issue #84623
        SeccompProfilePath: v1.SeccompProfileRuntimeDefault,

        Seccomp: &runtimeapi.SecurityProfile{
            ProfileType: runtimeapi.SecurityProfile_RuntimeDefault,
        },
    }
    linuxCfg := &runtimeapi.LinuxPodSandboxConfig{
        CgroupParent:    parent,
        SecurityContext: secCtx,
        Sysctls:         make(map[string]string),
    }

    // Everything below is derived from the pod-level security context.
    podSC := pod.Spec.SecurityContext
    if podSC == nil {
        return linuxCfg, nil
    }

    for _, sysctl := range podSC.Sysctls {
        linuxCfg.Sysctls[sysctl.Name] = sysctl.Value
    }

    notWindows := runtime.GOOS != "windows"
    if notWindows && podSC.RunAsUser != nil {
        secCtx.RunAsUser = &runtimeapi.Int64Value{Value: int64(*podSC.RunAsUser)}
    }
    if notWindows && podSC.RunAsGroup != nil {
        secCtx.RunAsGroup = &runtimeapi.Int64Value{Value: int64(*podSC.RunAsGroup)}
    }

    nsOpts, err := runtimeutil.NamespacesForPod(pod, m.runtimeHelper)
    if err != nil {
        return nil, err
    }
    secCtx.NamespaceOptions = nsOpts

    // Supplemental groups: FSGroup first, then kubelet-provided extras, then the pod's own list.
    groups := secCtx.SupplementalGroups
    if notWindows && podSC.FSGroup != nil {
        groups = append(groups, int64(*podSC.FSGroup))
    }
    if extra := m.runtimeHelper.GetExtraSupplementalGroupsForPod(pod); len(extra) > 0 {
        groups = append(groups, extra...)
    }
    for _, sg := range podSC.SupplementalGroups {
        groups = append(groups, int64(sg))
    }
    secCtx.SupplementalGroups = groups

    if selinux := podSC.SELinuxOptions; notWindows && selinux != nil {
        secCtx.SelinuxOptions = &runtimeapi.SELinuxOption{
            User:  selinux.User,
            Role:  selinux.Role,
            Type:  selinux.Type,
            Level: selinux.Level,
        }
    }

    return linuxCfg, nil
}

// GetPodCgroupParent asks a pod container manager obtained from the container manager for the
// pod's cgroup and returns the second value of GetPodContainerName (the literal cgroupfs name),
// discarding the CgroupName identifier.
func (kl *Kubelet) GetPodCgroupParent(pod *v1.Pod) string {
    _, cgroupfsName := kl.containerManager.NewPodContainerManager().GetPodContainerName(pod)
    return cgroupfsName
}

// GetPodContainerName returns the CgroupName identifier, and its literal cgroupfs form on the host.
//
// The pod's cgroup lives under the QoS-level cgroup matching the pod's QoS class (Guaranteed,
// Burstable or BestEffort); the resulting path is what the pod's containers use as their cgroup
// parent. For any other QoS class value the parent is left as the zero CgroupName.
func (m *podContainerManagerImpl) GetPodContainerName(pod *v1.Pod) (CgroupName, string) {
    // Pick the parent QoS cgroup for this pod.
    var qosParent CgroupName
    switch v1qos.GetPodQOS(pod) {
    case v1.PodQOSGuaranteed:
        qosParent = m.qosContainersInfo.Guaranteed
    case v1.PodQOSBurstable:
        qosParent = m.qosContainersInfo.Burstable
    case v1.PodQOSBestEffort:
        qosParent = m.qosContainersInfo.BestEffort
    }

    // Absolute cgroup name: <QoS parent>/<pod suffix derived from the pod UID>.
    cgroupName := NewCgroupName(qosParent, GetPodCgroupNameSuffix(pod.UID))

    // Second result is the literal cgroupfs name as produced by the cgroup manager.
    return cgroupName, m.cgroupManager.Name(cgroupName)
}

pod对应的docker中的CgroupParent参数：

[root@10-231-2-19 ~]# docker ps | grep 1975312c7f88
1975312c7f88   d06f046c0907                                        "kube-apiserver --ad…"   42 hours ago   Up 42 hours             k8s_kube-apiserver_kube-apiserver-10-231-2-19_kube-system_0f47b891f85873ffd8e01cd9203c2700_2

[root@10-231-2-19 ~]# docker inspect 1975312c7f88 | grep Parent
            "CgroupParent": "/kubepods/burstable/pod0f47b891f85873ffd8e01cd9203c2700",

kubelet如何实现为pod分配cpu和memory

kubelet会将pod申请的cpu和memory作为容器的启动参数，以使得container runtime能够为容器设置相应的cgroup策略，并且使用kubelet传过来的cgroup parent
作为容器的cgroup的parent前缀，实现容器归属到其pod的cgroup的管理

kubelet将pod申请的cpu和memory作为容器的启动参数

// startContainer starts a container and returns a message indicates why it is failed on error.
// It starts the container through the following steps:
// * pull the image
// * create the container
// * start the container
// * run the post start lifecycle hooks (if applicable)
//
// On success it returns an empty message and a nil error. On failure a container event is
// recorded and the returned error identifies the failing step (e.g. ErrCreateContainerConfig,
// ErrPreCreateHook, ErrCreateContainer, ErrPreStartHook, kubecontainer.ErrRunContainer,
// ErrPostStartHook); an image pull failure returns the puller's own message and error.
func (m *kubeGenericRuntimeManager) startContainer(podSandboxID string, podSandboxConfig *runtimeapi.PodSandboxConfig, spec *startSpec, pod *v1.Pod, podStatus *kubecontainer.PodStatus, pullSecrets []v1.Secret, podIP string, podIPs []string) (string, error) {
    container := spec.container

    // Step 1: pull the image.
    imageRef, msg, err := m.imagePuller.EnsureImageExists(pod, container, pullSecrets, podSandboxConfig)
    if err != nil {
        s, _ := grpcstatus.FromError(err)
        m.recordContainerEvent(pod, container, "", v1.EventTypeWarning, events.FailedToCreateContainer, "Error: %v", s.Message())
        return msg, err
    }

    // Step 2: create the container.
    // For a new container, the RestartCount should be 0
    restartCount := 0
    containerStatus := podStatus.FindContainerStatusByName(container.Name)
    if containerStatus != nil {
        restartCount = containerStatus.RestartCount + 1
    } else {
        // The container runtime keeps state on container statuses and
        // what the container restart count is. When nodes are rebooted
        // some container runtimes clear their state which causes the
        // restartCount to be reset to 0. This causes the logfile to
        // start at 0.log, which either overwrites or appends to the
        // already existing log.
        //
        // We are checking to see if the log directory exists, and find
        // the latest restartCount by checking the log name -
        // {restartCount}.log - and adding 1 to it.
        logDir := BuildContainerLogsDirectory(pod.Namespace, pod.Name, pod.UID, container.Name)
        restartCount, err = calcRestartCountByLogDir(logDir)
        if err != nil {
            // Best effort: the failure is only logged and startup continues with the returned count.
            klog.InfoS("Log directory exists but could not calculate restartCount", "logDir", logDir, "err", err)
        }
    }

    target, err := spec.getTargetID(podStatus)
    if err != nil {
        s, _ := grpcstatus.FromError(err)
        m.recordContainerEvent(pod, container, "", v1.EventTypeWarning, events.FailedToCreateContainer, "Error: %v", s.Message())
        return s.Message(), ErrCreateContainerConfig
    }
    // Generate the container config; it includes the container's CPU and memory limit parameters.
    containerConfig, cleanupAction, err := m.generateContainerConfig(container, pod, restartCount, podIP, imageRef, podIPs, target)
    // The cleanup action may be non-nil even when err is set, so defer it before checking err.
    if cleanupAction != nil {
        defer cleanupAction()
    }
    if err != nil {
        s, _ := grpcstatus.FromError(err)
        m.recordContainerEvent(pod, container, "", v1.EventTypeWarning, events.FailedToCreateContainer, "Error: %v", s.Message())
        return s.Message(), ErrCreateContainerConfig
    }

    err = m.internalLifecycle.PreCreateContainer(pod, container, containerConfig)
    if err != nil {
        s, _ := grpcstatus.FromError(err)
        m.recordContainerEvent(pod, container, "", v1.EventTypeWarning, events.FailedToCreateContainer, "Internal PreCreateContainer hook failed: %v", s.Message())
        return s.Message(), ErrPreCreateHook
    }

    containerID, err := m.runtimeService.CreateContainer(podSandboxID, containerConfig, podSandboxConfig)
    if err != nil {
        s, _ := grpcstatus.FromError(err)
        m.recordContainerEvent(pod, container, containerID, v1.EventTypeWarning, events.FailedToCreateContainer, "Error: %v", s.Message())
        return s.Message(), ErrCreateContainer
    }
    err = m.internalLifecycle.PreStartContainer(pod, container, containerID)
    if err != nil {
        s, _ := grpcstatus.FromError(err)
        m.recordContainerEvent(pod, container, containerID, v1.EventTypeWarning, events.FailedToStartContainer, "Internal PreStartContainer hook failed: %v", s.Message())
        return s.Message(), ErrPreStartHook
    }
    m.recordContainerEvent(pod, container, containerID, v1.EventTypeNormal, events.CreatedContainer, fmt.Sprintf("Created container %s", container.Name))

    // Step 3: start the container.
    err = m.runtimeService.StartContainer(containerID)
    if err != nil {
        s, _ := grpcstatus.FromError(err)
        m.recordContainerEvent(pod, container, containerID, v1.EventTypeWarning, events.FailedToStartContainer, "Error: %v", s.Message())
        return s.Message(), kubecontainer.ErrRunContainer
    }
    m.recordContainerEvent(pod, container, containerID, v1.EventTypeNormal, events.StartedContainer, fmt.Sprintf("Started container %s", container.Name))

    // Symlink container logs to the legacy container log location for cluster logging
    // support.
    // TODO(random-liu): Remove this after cluster logging supports CRI container log path.
    containerMeta := containerConfig.GetMetadata()
    sandboxMeta := podSandboxConfig.GetMetadata()
    legacySymlink := legacyLogSymlink(containerID, containerMeta.Name, sandboxMeta.Name,
        sandboxMeta.Namespace)
    containerLog := filepath.Join(podSandboxConfig.LogDirectory, containerConfig.LogPath)
    // only create legacy symlink if containerLog path exists (or the error is not IsNotExist).
    // Because if containerLog path does not exist, only dangling legacySymlink is created.
    // This dangling legacySymlink is later removed by container gc, so it does not make sense
    // to create it in the first place. it happens when journald logging driver is used with docker.
    if _, err := m.osInterface.Stat(containerLog); !os.IsNotExist(err) {
        if err := m.osInterface.Symlink(containerLog, legacySymlink); err != nil {
            klog.ErrorS(err, "Failed to create legacy symbolic link", "path", legacySymlink,
                "containerID", containerID, "containerLogPath", containerLog)
        }
    }

    // Step 4: execute the post start hook.
    if container.Lifecycle != nil && container.Lifecycle.PostStart != nil {
        kubeContainerID := kubecontainer.ContainerID{
            Type: m.runtimeName,
            ID:   containerID,
        }
        msg, handlerErr := m.runner.Run(kubeContainerID, pod, container, container.Lifecycle.PostStart)
        if handlerErr != nil {
            klog.ErrorS(handlerErr, "Failed to execute PostStartHook", "pod", klog.KObj(pod),
                "podUID", pod.UID, "containerName", container.Name, "containerID", kubeContainerID.String())
            m.recordContainerEvent(pod, container, kubeContainerID.ID, v1.EventTypeWarning, events.FailedPostStartHook, msg)
            // A failed post-start hook kills the container that was just started.
            if err := m.killContainer(pod, kubeContainerID, container.Name, "FailedPostStartHook", reasonFailedPostStartHook, nil); err != nil {
                klog.ErrorS(err, "Failed to kill container", "pod", klog.KObj(pod),
                    "podUID", pod.UID, "containerName", container.Name, "containerID", kubeContainerID.String())
            }
            return msg, ErrPostStartHook
        }
    }

    return "", nil
}

// generateContainerConfig builds the CRI ContainerConfig (kubelet runtime v1) for one container
// of a pod.
//
// Besides the config it returns the cleanup function produced by GenerateRunContainerOptions.
// That function is returned alongside every error except a failure of
// GenerateRunContainerOptions itself, so callers should run it whenever it is non-nil.
func (m *kubeGenericRuntimeManager) generateContainerConfig(container *v1.Container, pod *v1.Pod, restartCount int, podIP, imageRef string, podIPs []string, nsTarget *kubecontainer.ContainerID) (*runtimeapi.ContainerConfig, func(), error) {
    runOpts, cleanup, err := m.runtimeHelper.GenerateRunContainerOptions(pod, container, podIP, podIPs)
    if err != nil {
        return nil, nil, err
    }

    imageUID, imageUsername, err := m.getImageUser(container.Image)
    if err != nil {
        return nil, cleanup, err
    }

    // Verify RunAsNonRoot. Non-root verification only supports numeric user.
    if err = verifyRunAsNonRoot(pod, container, imageUID, imageUsername); err != nil {
        return nil, cleanup, err
    }

    command, args := kubecontainer.ExpandContainerCommandAndArgs(container, runOpts.Envs)

    // Make sure the per-container log directory exists before the log path is handed out.
    logDir := BuildContainerLogsDirectory(pod.Namespace, pod.Name, pod.UID, container.Name)
    if err = m.osInterface.MkdirAll(logDir, 0755); err != nil {
        return nil, cleanup, fmt.Errorf("create container log directory for container %s failed: %v", container.Name, err)
    }
    logPath := buildContainerLogsPath(container.Name, restartCount)

    cfg := &runtimeapi.ContainerConfig{
        Metadata: &runtimeapi.ContainerMetadata{
            Name:    container.Name,
            Attempt: uint32(restartCount),
        },
        Image:       &runtimeapi.ImageSpec{Image: imageRef},
        Command:     command,
        Args:        args,
        WorkingDir:  container.WorkingDir,
        Labels:      newContainerLabels(container, pod),
        Annotations: newContainerAnnotations(container, pod, restartCount, runOpts),
        Devices:     makeDevices(runOpts),
        Mounts:      m.makeMounts(runOpts, container),
        LogPath:     logPath,
        Stdin:       container.Stdin,
        StdinOnce:   container.StdinOnce,
        Tty:         container.TTY,
    }

    // Apply platform specific settings; this is where the per-platform CPU and memory
    // resource allocation and limits are filled in.
    if err = m.applyPlatformSpecificContainerConfig(cfg, container, pod, imageUID, imageUsername, nsTarget); err != nil {
        return nil, cleanup, err
    }

    // Convert the environment variables into CRI key/value pairs.
    cfg.Envs = make([]*runtimeapi.KeyValue, len(runOpts.Envs))
    for i, env := range runOpts.Envs {
        cfg.Envs[i] = &runtimeapi.KeyValue{Key: env.Name, Value: env.Value}
    }

    return cfg, cleanup, nil
}

// applyPlatformSpecificContainerConfig fills config.Linux with the Linux container config
// generated for the given container. config is left untouched when generation fails, and the
// generation error is returned as-is.
func (m *kubeGenericRuntimeManager) applyPlatformSpecificContainerConfig(config *runtimeapi.ContainerConfig, container *v1.Container, pod *v1.Pod, uid *int64, username string, nsTarget *kubecontainer.ContainerID) error {
    // memory.min and memory.high are only set when the MemoryQoS feature gate is enabled
    // and the node runs cgroups v2 (unified mode).
    enforceMemoryQoS := utilfeature.DefaultFeatureGate.Enabled(kubefeatures.MemoryQoS) &&
        libcontainercgroups.IsCgroup2UnifiedMode()

    linuxCfg, err := m.generateLinuxContainerConfig(container, pod, uid, username, nsTarget, enforceMemoryQoS)
    if err != nil {
        return err
    }
    config.Linux = linuxCfg
    return nil
}


// generateLinuxContainerConfig generates linux container config for kubelet runtime v1.
// The part shown here resolves the effective security context, optionally retargets the PID
// namespace at nsTarget, and fills in the container's resources (CPU/memory via
// calculateLinuxResources, OOM score adjustment, hugepage limits). The rest of the function
// is elided in this excerpt.
func (m *kubeGenericRuntimeManager) generateLinuxContainerConfig(container *v1.Container, pod *v1.Pod, uid *int64, username string, nsTarget *kubecontainer.ContainerID, enforceMemoryQoS bool) (*runtimeapi.LinuxContainerConfig, error) {
    sc, err := m.determineEffectiveSecurityContext(pod, container, uid, username)
    if err != nil {
        return nil, err
    }
    lc := &runtimeapi.LinuxContainerConfig{
        Resources:       &runtimeapi.LinuxContainerResources{},
        SecurityContext: sc,
    }

    // When a target container is given and the PID namespace mode is CONTAINER,
    // switch to TARGET mode pointing at that container.
    if nsTarget != nil && lc.SecurityContext.NamespaceOptions.Pid == runtimeapi.NamespaceMode_CONTAINER {
        lc.SecurityContext.NamespaceOptions.Pid = runtimeapi.NamespaceMode_TARGET
        lc.SecurityContext.NamespaceOptions.TargetId = nsTarget.ID
    }

    // set linux container resources
    // cpuRequest stays nil when the container specifies no CPU request.
    var cpuRequest *resource.Quantity
    if _, cpuRequestExists := container.Resources.Requests[v1.ResourceCPU]; cpuRequestExists {
        cpuRequest = container.Resources.Requests.Cpu()
    }
    
    // Set CPU and memory. Only the CPU request, CPU limit and memory limit are used here; the
    // memory request is not. Article author's note: memory is an incompressible resource, so
    // exceeding the limit simply leads to an OOM kill, and the pod's memory request is for the scheduler.
    lc.Resources = m.calculateLinuxResources(cpuRequest, container.Resources.Limits.Cpu(), container.Resources.Limits.Memory())

    // Set the OOM score adjustment.
    lc.Resources.OomScoreAdj = int64(qos.GetContainerOOMScoreAdjust(pod, container,
        int64(m.machineInfo.MemoryCapacity)))

    // Set the hugepage limits.
    lc.Resources.HugepageLimits = GetHugepageLimitsFromResources(container.Resources)
    ...
    ...
}

// calculateLinuxResources will create the linuxContainerResources type based on the provided CPU and memory resource requests, limits
//
// It converts the CPU request, CPU limit and memory limit into CpuShares, MemoryLimitInBytes,
// CpuQuota and CpuPeriod. These values are passed to the container runtime (e.g. docker), which
// applies them to the cgroup the container's processes run in (cpu.shares, cpu.cfs_quota_us,
// cpu.cfs_period_us, memory limit).
//
// Any of the three quantities may be nil; a nil quantity is treated as zero instead of being
// dereferenced. CpuQuota and CpuPeriod are only set when CFS quota enforcement (m.cpuCFSQuota)
// is enabled, and MemoryLimitInBytes is only set for a non-zero memory limit.
func (m *kubeGenericRuntimeManager) calculateLinuxResources(cpuRequest, cpuLimit, memoryLimit *resource.Quantity) *runtimeapi.LinuxContainerResources {
    resources := runtimeapi.LinuxContainerResources{}

    // Read every quantity through a nil guard so that a missing value cannot cause a
    // nil-pointer dereference.
    var requestMilli, limitMilli, memLimit int64
    if cpuRequest != nil {
        requestMilli = cpuRequest.MilliValue()
    }
    if cpuLimit != nil {
        limitMilli = cpuLimit.MilliValue()
    }
    if memoryLimit != nil {
        memLimit = memoryLimit.Value()
    }

    // If request is not specified, but limit is, we want request to default to limit.
    // API server does this for new containers, but we repeat this logic in Kubelet
    // for containers running on existing Kubernetes clusters.
    sharesMilli := requestMilli
    if cpuRequest == nil && cpuLimit != nil {
        sharesMilli = limitMilli
    }
    // With a zero milli-CPU value MilliCPUToShares is expected to return the minimal
    // number of CPU shares (per the original comment on this code path).
    resources.CpuShares = int64(cm.MilliCPUToShares(sharesMilli))

    if memLimit != 0 {
        resources.MemoryLimitInBytes = memLimit
    }

    if m.cpuCFSQuota {
        // With a zero CPU limit milliCPUToQuota is expected to return the default value
        // that allows full usage of cpu resource (per the original comment).
        cpuPeriod := int64(quotaPeriod)
        if utilfeature.DefaultFeatureGate.Enabled(kubefeatures.CPUCFSQuotaPeriod) {
            cpuPeriod = int64(m.cpuCFSQuotaPeriod.Duration / time.Microsecond)
        }
        resources.CpuQuota = milliCPUToQuota(limitMilli, cpuPeriod)
        resources.CpuPeriod = cpuPeriod
    }

    return &resources
}

cgroup设置原理

k8s cgroup结构

我们采用CgroupPerQos的方式进行管理，以cpu子系统为例，层级类似如下所示

/sys/fs/cgroup/cpu
  kubepods
    guaranteed
      pod{uid}
        {containerid}
        {containerid}
      pod{uid}
        {containerid}
        {containerid}
    burstable
    besteffort

从创建者的角度分两种：kubelet创建的、docker创建的。其中container层由docker创建，container以上的pod层、qos层和root（kubepods）都是由kubelet创建的。

那docker又是怎么知道容器的cgroup parent目录是谁呢？

其实是kubelet在调用docker api时传给docker的一个参数，告诉了其cgroup parent路径，
可以通过执行docker inspect {containerid} | grep -i cgroup来查看每个container的cgroup parent路径。

pod属性变化后，kubelet如何感知并让容器原地重启

kubelet会为每个container都计算出一个hash值，其中用到了container的所有属性，在调用docker
api进行容器创建的时候会把这个值设置到容器的Label中，后续如果kubelet检测到新计算出的hash值与在运行的容器的hash值不同，
则会进行容器的原地重启操作，这也是为什么修改container的Image会触发容器原地重启的原因。
很明显，如果放开Request的修改，Request值变了之后也会导致新的hash值变化从而导致容器重建。

为什么新的Pod cgroup目录创建出来之后，原有的目录没有被删除呢？

这就需要搞清楚Pod Cgroup目录什么时候删除的，容器级别的cgroup目录是在容器被删除的时候删除的，这个很好理解，
Pod级别的Cgroup目录是否也是在Pod删除时删除的呢？经过看代码发现并不是，Pod资源清理是一个异步的过程，
定时监测Pod是否已经设置了deletionTimestamp属性和容器的运行状态，只有设置了此属性的Pod才有可能被清理，
清理的过程中包含挂载卷、Cgroup等资源，会一并清理。因为修改Request的请求是不会去给Pod设置deletionTimestamp属性的，
这就导致Pod级别的旧目录不会被删除，又因为新目录的创建，导致同时存在两个Pod级别的目录。

cgroup子系统：

在这里插入图片描述

注意：

cpu.cfs_quota_us/cpu.cfs_period_us决定cpu控制组中所有进程所能使用CPU资源的最大值，而cpu.shares决定了cpu控制组间可用CPU的相对比例，这个比例只有当主机上的CPU完全被打满时才会起作用。

cgroup驱动：

在这里插入图片描述

cgroup cpu限制分析：

resources:
  requests:
    memory: 50Mi
    cpu: 50m
  limits:
    memory: 100Mi
    cpu: 100m

单位后缀 m 表示千分之一核，也就是说 1 Core = 1000m。因此该资源对象指定容器进程需要 50/1000 核（5%）才能被调度，并且允许最多使用 100/1000 核（10%）。同样，2000m 表示两个完整的 CPU 核心，你也可以写成 2 或者 2.0。

为了了解 Docker 和 cgroup 如何使用这些值来控制容器，我们首先创建一个只配置了 CPU requests 的 Pod：

$ kubectl run limit-test --image=busybox --requests "cpu=50m" --command -- /bin/sh -c "while true; do sleep 2; done"
deployment.apps "limit-test" created

通过 kubectl 命令我们可以验证这个 Pod 配置了 50m 的 CPU requests：

$ kubectl get pods limit-test-5b4c495556-p2xkr -o=jsonpath='{.spec.containers[0].resources}'
map[requests:map[cpu:50m]]

我们还可以看到 Docker 为容器配置了相同的资源限制：

$ docker ps | grep busy | cut -d' ' -f1
f2321226620e

$ docker inspect f2321226620e --format '{{.HostConfig.CpuShares}}'
51

这里显示的为什么是 51，而不是 50？
这是因为 Linux cgroup 和 Docker 都将 CPU 核心数分成了 1024 个时间片（shares），而 Kubernetes 将它分成了 1000 个 shares。

shares 用来设置 CPU 的相对值，并且是针对所有的 CPU（内核），默认值是 1024，
假如系统中有两个 cgroup，分别是 A 和 B，A 的 shares 值是 1024，B 的 shares 值是 512，
那么 A 将获得 1024/(1024+512)=66% 的 CPU 资源，而 B 将获得 33% 的 CPU 资源。

shares 有两个特点：

* 如果 A 不忙，没有使用到 66% 的 CPU 时间，那么剩余的 CPU 时间将会被系统分配给 B，
  即 B 的 CPU 使用率可以超过 33%。

* 如果添加了一个新的 cgroup C，且它的 shares 值是 1024，那么 A 的限额变成了
  1024/(1024+512+1024)=40%，B 的变成了 20%。


从上面两个特点可以看出：

* 在闲的时候，shares 基本上不起作用，只有在 CPU 忙的时候起作用，这是一个优点。

* 由于 shares 是一个绝对值，需要和其它 cgroup 的值进行比较才能得到自己的相对限额，
  而在一个部署很多容器的机器上，cgroup 的数量是变化的，所以这个限额也是变化的，
  自己设置了一个高的值，但别人可能设置了一个更高的值，所以这个功能没法精确的控制 CPU 使用率。

与配置内存资源限制时 Docker 配置容器进程的内存 cgroup 的方式相同，设置 CPU 资源限制时 Docker 会配置容器进程的 cpu,cpuacct cgroup：

$ ps ax | grep /bin/sh
   60554 ?      Ss     0:00 /bin/sh -c while true; do sleep 2; done

$ sudo cat /proc/60554/cgroup
...
4:cpu,cpuacct:/kubepods/burstable/pode12b33b1-db07-11e8-b1e1-42010a800070/3be263e7a8372b12d2f8f8f9b4251f110b79c2a3bb9e6857b2f1473e640e8e75

$ ls -l /sys/fs/cgroup/cpu,cpuacct/kubepods/burstable/pode12b33b1-db07-11e8-b1e1-42010a800070/3be263e7a8372b12d2f8f8f9b4251f110b79c2a3bb9e6857b2f1473e640e8e75
total 0
drwxr-xr-x 2 root root 0 Oct 28 23:19 .
drwxr-xr-x 4 root root 0 Oct 28 23:19 ..
...
-rw-r--r-- 1 root root 0 Oct 28 23:19 cpu.shares

Docker 容器的 HostConfig.CpuShares 属性映射到 cgroup 的 cpu.shares 属性，可以验证一下：

$ sudo cat /sys/fs/cgroup/cpu,cpuacct/kubepods/burstable/podb5c03ddf-db10-11e8-b1e1-42010a800070/64b5f1b636dafe6635ddd321c5b36854a8add51931c7117025a694281fb11444/cpu.shares

51

你可能会很惊讶，设置了 CPU requests 竟然会把值传播到 cgroup，而在上一篇文章中我们设置内存 requests 时并没有将值传播到 cgroup。这是因为内存的 soft limit 内核特性对 Kubernetes 不起作用，而设置了 cpu.shares 却对 Kubernetes 很有用。后面我会详细讨论为什么会这样。现在让我们先看看设置 CPU limits 时会发生什么：

$ kubectl run limit-test --image=busybox --requests "cpu=50m" --limits "cpu=100m" --command -- /bin/sh -c "while true; do
sleep 2; done"
deployment.apps "limit-test" created

再一次使用 kubectl 验证我们的资源配置：

$ kubectl get pods limit-test-5b4fb64549-qpd4n -o=jsonpath='{.spec.containers[0].resources}'

map[limits:map[cpu:100m] requests:map[cpu:50m]]

查看对应的 Docker 容器的配置：

$ docker ps | grep busy | cut -d' ' -f1
472abbce32a5
$ docker inspect 472abbce32a5 --format '{{.HostConfig.CpuShares}} {{.HostConfig.CpuQuota}} {{.HostConfig.CpuPeriod}}'
51 10000 100000

可以明显看出，CPU requests 对应于 Docker 容器的 HostConfig.CpuShares 属性。而 CPU limits 就不太明显了，它由两个属性控制：HostConfig.CpuPeriod 和 HostConfig.CpuQuota。Docker 容器中的这两个属性又会映射到进程的 cpu,cpuacct cgroup 的另外两个属性：cpu.cfs_period_us 和 cpu.cfs_quota_us。我们来看一下：

$ sudo cat /sys/fs/cgroup/cpu,cpuacct/kubepods/burstable/pod2f1b50b6-db13-11e8-b1e1-42010a800070/f0845c65c3073e0b7b0b95ce0c1eb27f69d12b1fe2382b50096c4b59e78cdf71/cpu.cfs_period_us
100000

$ sudo cat /sys/fs/cgroup/cpu,cpuacct/kubepods/burstable/pod2f1b50b6-db13-11e8-b1e1-42010a800070/f0845c65c3073e0b7b0b95ce0c1eb27f69d12b1fe2382b50096c4b59e78cdf71/cpu.cfs_quota_us
10000

这些值与容器配置中指定的值相同。但是这两个属性的值是如何从我们在 Pod 中设置的 100m cpu limits 得出的呢，他们是如何实现该 limits 的呢？

这是因为 cpu requests 和 cpu limits 是使用两个独立的控制系统来实现的。Requests 使用的是 cpu shares 系统，cpu shares 将每个 CPU 核心划分为 1024 个时间片，并保证每个进程将获得固定比例份额的时间片。如果总共有 1024 个时间片，并且两个进程中的每一个都将 cpu.shares 设置为 512，那么它们将分别获得大约一半的 CPU 可用时间。但 cpu shares 系统无法精确控制 CPU 使用率的上限，如果一个进程没有设置 shares，则另一个进程可以自由使用 CPU 资源。
大约在 2010 年左右，谷歌团队和其他一部分人注意到了这个问题。为了解决这个问题，后来在 linux 内核中增加了第二个功能更强大的控制系统 : CPU 带宽控制组。带宽控制组定义了一个周期，通常为 1/10 秒（即 100000 微秒）。还定义了一个配额，表示允许进程在设置的周期长度内所能使用的 CPU 时间数，两个文件配合起来设置CPU的使用上限。两个文件的单位都是微秒（us），cfs_period_us 的取值范围为 1 毫秒（ms）到 1 秒（s），cfs_quota_us 的取值大于 1ms 即可，如果 cfs_quota_us 的值为 -1（默认值），表示不受 CPU 时间的限制。

下面是几个例子：

# 1.限制只能使用1个CPU（每250ms能使用250ms的CPU时间）
$ echo 250000 > cpu.cfs_quota_us /* quota = 250ms */
$ echo 250000 > cpu.cfs_period_us /* period = 250ms */

# 2.限制使用2个CPU（内核）（每500ms能使用1000ms的CPU时间，即使用两个内核）
$ echo 1000000 > cpu.cfs_quota_us /* quota = 1000ms */
$ echo 500000 > cpu.cfs_period_us /* period = 500ms */

# 3.限制使用1个CPU的20%（每50ms能使用10ms的CPU时间，即使用一个CPU核心的20%）
$ echo 10000 > cpu.cfs_quota_us /* quota = 10ms */
$ echo 50000 > cpu.cfs_period_us /* period = 50ms */

在本例中我们将 Pod 的 cpu limits 设置为 100m，这表示 100/1000 个 CPU 核心，即 100000 微秒的 CPU 时间周期中的 10000。所以该 limits 翻译到 cpu,cpuacct cgroup 中被设置为 cpu.cfs_period_us=100000 和 cpu.cfs_quota_us=10000。顺便说一下，其中的 cfs 代表 Completely Fair Scheduler（完全公平调度器），这是 Linux 系统中默认的 CPU 调度算法。还有一个实时调度算法，它也有自己相应的配额值。

现在让我们来总结一下：

在 Kubernetes 中设置的 cpu requests 最终会被 cgroup 设置为 cpu.shares 属性的值， cpu limits 会被带宽控制组设置为 cpu.cfs_period_us 和 cpu.cfs_quota_us 属性的值。与内存一样，cpu requests 主要用于在调度时通知调度器节点上至少需要多少个 cpu shares 才可以被调度。
与内存 requests 不同，设置了 cpu requests 会在 cgroup 中设置一个属性，以确保内核会将该数量的 shares 分配给进程。
cpu limits 与内存 limits 也有所不同。如果容器进程使用的内存资源超过了内存使用限制，那么该进程将会成为 oom-killing 的候选者。但是容器进程基本上永远不能超过设置的 CPU 配额，所以容器永远不会因为尝试使用比分配的更多的 CPU 时间而被驱逐。系统会在调度程序中强制进行 CPU 资源限制，以确保进程不会超过这个限制。

如果你没有在容器中设置这些属性，或将他们设置为不准确的值，会发生什么呢？与内存一样，如果只设置了 limits 而没有设置 requests，Kubernetes 会将 CPU 的 requests 设置为与 limits 的值一样。如果你对你的工作负载所需要的 CPU 时间了如指掌，那再好不过了。如果只设置了 CPU requests 却没有设置 CPU limits 会怎么样呢？这种情况下，Kubernetes 会确保该 Pod 被调度到合适的节点，并且该节点的内核会确保节点上的可用 cpu shares 大于 Pod 请求的 cpu shares，但是你的进程不会被阻止使用超过所请求的 CPU 数量。既不设置 requests 也不设置 limits 是最糟糕的情况：调度程序不知道容器需要什么，并且进程对 cpu shares 的使用是无限制的，这可能会对 node 产生一些负面影响。