【kubernetes/k8s源码分析】 kubelet pod cgroup 源码分析

最新推荐文章于 2023-06-23 21:44:28 发布

张忠琳

最新推荐文章于 2023-06-23 21:44:28 发布

阅读量3.1k

点赞数 2

分类专栏： # kubelet

本文链接：https://blog.csdn.net/zhonglinzhang/article/details/93031810

版权

kubelet 专栏收录该内容

15 篇文章 6 订阅

订阅专栏

创建 pod 时，为其设置资源限额 cgroup

syncPod

--> kl.containerManager.NewPodContainerManager

--> pcm.Exists(pod)

--> kl.containerManager.UpdateQOSCgroups()

1. NewPodContainerManager 函数

实例化 podContainerManagerImpl，实现了 PodContainerManager 接口

// NewPodContainerManager is the factory for PodContainerManager objects.
// With CgroupsPerQOS enabled in the node config it returns a
// podContainerManagerImpl populated from this container manager; otherwise it
// returns a podContainerManagerNoop that only carries the cgroup root.
func (cm *containerManagerImpl) NewPodContainerManager() PodContainerManager {
	// Per-QoS cgroups are disabled: hand back the no-op manager.
	if !cm.NodeConfig.CgroupsPerQOS {
		return &podContainerManagerNoop{cgroupRoot: cm.cgroupRoot}
	}

	// The CFS quota period is stored as a count of microseconds.
	quotaPeriodMicros := uint64(cm.CPUCFSQuotaPeriod / time.Microsecond)

	return &podContainerManagerImpl{
		qosContainersInfo: cm.GetQOSContainersInfo(),
		subsystems:        cm.subsystems,
		cgroupManager:     cm.cgroupManager,
		podPidsLimit:      cm.ExperimentalPodPidsLimit,
		enforceCPULimits:  cm.EnforceCPULimits,
		cpuCFSQuotaPeriod: quotaPeriodMicros,
	}
}

2. Exists

调用 GetPodContainerName 获得 pod 的资源限额路径

调用 cgroup manager 的 Exists，检验该 pod 的 cgroup 在各子系统（"cpu"、"cpuacct"、"cpuset"、"memory"、"systemd"）下是否都存在

// Exists reports whether the cgroup for the given pod is already present.
// It resolves the pod's CgroupName and forwards the question to the cgroup
// manager.
func (m *podContainerManagerImpl) Exists(pod *v1.Pod) bool {
	// Only the CgroupName is needed; the literal cgroupfs string is dropped.
	name, _ := m.GetPodContainerName(pod)
	found := m.cgroupManager.Exists(name)
	return found
}

2.1 GetPodContainerName

podContainer 的名字为 "pod" + ${pod UID}；再根据 pod 的 QoS 级别（Guaranteed / Burstable / BestEffort）选择对应的父 cgroup，pod 的 cgroup 建在该父目录之下，用于资源限额

// GetPodContainerName returns two forms of the pod's cgroup identity: the
// CgroupName, and the literal name the cgroup manager derives from it.
func (m *podContainerManagerImpl) GetPodContainerName(pod *v1.Pod) (CgroupName, string) {
	// Select the QoS-tier container that acts as the parent of the pod cgroup.
	// A QoS class not listed below leaves the parent at its zero value.
	var qosParent CgroupName
	switch v1qos.GetPodQOS(pod) {
	case v1.PodQOSGuaranteed:
		qosParent = m.qosContainersInfo.Guaranteed
	case v1.PodQOSBurstable:
		qosParent = m.qosContainersInfo.Burstable
	case v1.PodQOSBestEffort:
		qosParent = m.qosContainersInfo.BestEffort
	}

	// The pod-level component is derived from the pod UID and appended to the parent.
	name := NewCgroupName(qosParent, GetPodCgroupNameSuffix(pod.UID))

	// Ask the cgroup manager for the literal cgroupfs form of that name.
	literal := m.cgroupManager.Name(name)
	return name, literal
}

3. UpdateQOSCgroups

// UpdateQOSCgroups delegates to the QoS container manager's UpdateCgroups and
// returns its error unchanged.
func (cm *containerManagerImpl) UpdateQOSCgroups() error {
	qosManager := cm.qosContainerManager
	return qosManager.UpdateCgroups()
}

3.1 UpdateCgroups

// UpdateCgroups starts by taking the manager's lock and preparing one
// CgroupConfig per QoS container it manages.
// NOTE(review): only the opening of the function is shown in this excerpt; the
// statements that follow the qosConfigs literal are not visible here.
func (m *qosContainerManagerImpl) UpdateCgroups() error {
	// The lock is held until the function returns.
	m.Lock()
	defer m.Unlock()

	// Entries exist only for the Burstable and BestEffort classes, each
	// starting from an empty ResourceConfig.
	qosConfigs := map[v1.PodQOSClass]*CgroupConfig{
		v1.PodQOSBurstable: {
			Name:               m.qosContainersInfo.Burstable,
			ResourceParameters: &ResourceConfig{},
		},
		v1.PodQOSBestEffort: {
			Name:               m.qosContainersInfo.BestEffort,
			ResourceParameters: &ResourceConfig{},
		},
	}

3.1.1 setCPUCgroupConfig 函数

BestEffort 的 CPU shares 固定为 MinShares（即 2）；Burstable 的 CPU shares 由 MilliCPUToShares 函数计算，即所有 Burstable pod 的 CPU request（单位为 milli CPU）之和 * 1024 / 1000

// setCPUCgroupConfig writes CpuShares into the BestEffort and Burstable
// entries of configs. BestEffort is pinned to MinShares; Burstable is derived
// from the summed CPU requests of the active Burstable pods. It always
// returns nil.
func (m *qosContainerManagerImpl) setCPUCgroupConfig(configs map[v1.PodQOSClass]*CgroupConfig) error {
	// Accumulate the CPU requests (milli-units) of every active Burstable pod.
	var totalBurstableMilliCPU int64
	for _, pod := range m.activePods() {
		// Pods in any other QoS tier do not contribute.
		if v1qos.GetPodQOS(pod) != v1.PodQOSBurstable {
			continue
		}
		requests, _ := resource.PodRequestsAndLimits(pod)
		cpu, ok := requests[v1.ResourceCPU]
		if !ok {
			continue
		}
		totalBurstableMilliCPU += cpu.MilliValue()
	}

	// BestEffort always receives the minimum number of shares.
	minShares := uint64(MinShares)
	configs[v1.PodQOSBestEffort].ResourceParameters.CpuShares = &minShares

	// Burstable shares follow the currently observed total request.
	shares := MilliCPUToShares(totalBurstableMilliCPU)
	configs[v1.PodQOSBurstable].ResourceParameters.CpuShares = &shares

	return nil
}

3.1.2 设置 Hugepage

仅当 feature gate HugePages=true|false (BETA - default=true) 开启时才执行；会读取 /sys/kernel/mm/hugepages 目录下的大页尺寸，一般默认为 2MB

// update the qos level cgroup settings for huge pages (ensure they remain unbounded)
// This step runs only when the HugePages feature gate is enabled; an error
// from setHugePagesConfig is returned to the caller immediately.
if utilfeature.DefaultFeatureGate.Enabled(kubefeatures.HugePages) {
	if err := m.setHugePagesConfig(qosConfigs); err != nil {
		return err
	}
}

3.1.3 设置 QOS reserved

feature gate QOSReserved=true|false (ALPHA - default=false) 默认关闭，因此默认情况下不会执行该流程，暂且略过

	// Everything in this block runs only when the QOSReserved feature gate is enabled.
	if utilfeature.DefaultFeatureGate.Enabled(kubefeatures.QOSReserved) {
		// Apply the configured reserve percentage per resource; memory is the
		// only resource handled by this switch.
		for resource, percentReserve := range m.qosReserved {
			switch resource {
			case v1.ResourceMemory:
				m.setMemoryReserve(qosConfigs, percentReserve)
			}
		}

		// First attempt: push every config to the cgroup manager. A failure does
		// not stop the loop; it is only recorded in updateSuccess.
		updateSuccess := true
		for _, config := range qosConfigs {
			err := m.cgroupManager.Update(config)
			if err != nil {
				updateSuccess = false
			}
		}
		// If every update succeeded, log and return from the enclosing function.
		if updateSuccess {
			klog.V(4).Infof("[ContainerManager]: Updated QoS cgroup configuration")
			return nil
		}

		// If the resource can adjust the ResourceConfig to increase likelihood of
		// success, call the adjustment function here.  Otherwise, the Update() will
		// be called again with the same values.
		for resource, percentReserve := range m.qosReserved {
			switch resource {
			case v1.ResourceMemory:
				m.retrySetMemoryReserve(qosConfigs, percentReserve)
			}
		}
	}

3.1.4 对 BestEffort 和 Burstable 进行更新资源限额

// Push each QoS-level config to the cgroup manager. The first failure is
// logged and returned, so any configs not yet visited are left un-updated.
for _, config := range qosConfigs {
	err := m.cgroupManager.Update(config)
	if err != nil {
		klog.Errorf("[ContainerManager]: Failed to update QoS cgroup configuration")
		return err
	}
}

4. EnsureExists

确认 pod 的 cgroup 存在，不存在则创建；随后对该 cgroup 应用资源限额（applyLimits）

// EnsureExists makes sure the pod-level cgroup for the given pod is present,
// creating it when it is missing, and then applies the pod's resource limits
// to it. Failures from the create and apply steps are returned with the
// cgroup name included in the message.
func (m *podContainerManagerImpl) EnsureExists(pod *v1.Pod) error {
	cgroupName, _ := m.GetPodContainerName(pod)

	// Create the pod cgroup only when it is not there yet.
	if !m.Exists(pod) {
		cfg := &CgroupConfig{
			Name:               cgroupName,
			ResourceParameters: ResourceConfigForPod(pod, m.enforceCPULimits, m.cpuCFSQuotaPeriod),
		}
		// A pids limit is attached only when the feature gate is on and a
		// positive limit is configured.
		pidsLimitEnabled := utilfeature.DefaultFeatureGate.Enabled(kubefeatures.SupportPodPidsLimit)
		if pidsLimitEnabled && m.podPidsLimit > 0 {
			cfg.ResourceParameters.PodPidsLimit = &m.podPidsLimit
		}
		createErr := m.cgroupManager.Create(cfg)
		if createErr != nil {
			return fmt.Errorf("failed to create container for %v : %v", cgroupName, createErr)
		}
	}

	// Apply appropriate resource limits on the pod container.
	// Top level qos containers limits are not updated until we figure how to
	// maintain the desired state in the kubelet, because maintaining the
	// desired state is difficult without checkpointing.
	applyErr := m.applyLimits(pod)
	if applyErr != nil {
		return fmt.Errorf("failed to apply resource limits on container for %v : %v", cgroupName, applyErr)
	}
	return nil
}

总结

# systemd-cgls
Working Directory /sys/fs/cgroup/memory/kubepods:
└─burstable
  └─poda2edde4d-930b-11e9-9e6a-080027603363
    ├─48d1be201c4cc9853e7afeb2f30296043603a84f803870e336c429a0e0a437ae
    │ └─12457 mysqld
    └─33fb08924385c5ddf0aa23e3990beaca6aa2c221fa480ec85897d6465dc2c8cc
      └─12386 /pause

张忠琳

关注

2
点赞
踩
2

收藏

觉得还不错? 一键收藏
0
评论
【kubernetes/k8s源码分析】 kubelet pod cgroup 源码分析

创建 pod 时，为其设置资源限额 cgroupsyncPod -->kl.containerManager.NewPodContainerManager -->pcm.Exists(pod) --> kl.containerManager.UpdateQOSCgroups()1.NewPodConta...
复制链接

扫一扫