创建 pod 时,为其设置资源限额 cgroup
syncPod
--> kl.containerManager.NewPodContainerManager
--> pcm.Exists(pod)
--> kl.containerManager.UpdateQOSCgroups()
1. NewPodContainerManager 函数
实例化 podContainerManagerImpl,实现了 PodContainerManager 接口
// NewPodContainerManager is the factory for PodContainerManager objects.
// With per-QoS cgroups turned off in the node config it hands back a no-op
// manager that only records the cgroup root; otherwise it returns the general
// pod container manager populated from this container manager's settings.
func (cm *containerManagerImpl) NewPodContainerManager() PodContainerManager {
	if !cm.NodeConfig.CgroupsPerQOS {
		return &podContainerManagerNoop{cgroupRoot: cm.cgroupRoot}
	}

	return &podContainerManagerImpl{
		qosContainersInfo: cm.GetQOSContainersInfo(),
		subsystems:        cm.subsystems,
		cgroupManager:     cm.cgroupManager,
		podPidsLimit:      cm.ExperimentalPodPidsLimit,
		enforceCPULimits:  cm.EnforceCPULimits,
		// The configured CFS quota period is a duration; store it as a microsecond count.
		cpuCFSQuotaPeriod: uint64(cm.CPUCFSQuotaPeriod / time.Microsecond),
	}
}
2. Exists
调用 GetPodContainerName 获得 pod 的资源限额路径
调用 cgroup manager 的 Exists,检查该 pod 的 cgroup 在各子系统("cpu", "cpuacct", "cpuset", "memory", "systemd")下是否存在
// Exists reports whether the cgroup for the given pod is already present,
// as answered by the cgroup manager.
func (m *podContainerManagerImpl) Exists(pod *v1.Pod) bool {
	// Only the CgroupName is needed here; the literal cgroupfs form is discarded.
	name, _ := m.GetPodContainerName(pod)
	found := m.cgroupManager.Exists(name)
	return found
}
2.1 GetPodContainerName
podContainer 的名称为 "pod" + ${pod_uid};根据 pod 的 QoS 级别选择对应的父目录,在其下进行资源限额
// GetPodContainerName computes the pod's CgroupName identifier together with
// the literal cgroupfs name the cgroup manager derives from it.
func (m *podContainerManagerImpl) GetPodContainerName(pod *v1.Pod) (CgroupName, string) {
	// Select the QoS-level parent container. For a QoS class that matches
	// none of the cases below, the parent keeps its zero value.
	var parent CgroupName
	switch v1qos.GetPodQOS(pod) {
	case v1.PodQOSGuaranteed:
		parent = m.qosContainersInfo.Guaranteed
	case v1.PodQOSBurstable:
		parent = m.qosContainersInfo.Burstable
	case v1.PodQOSBestEffort:
		parent = m.qosContainersInfo.BestEffort
	}

	// The pod's own component is derived from its UID and appended to the parent.
	name := NewCgroupName(parent, GetPodCgroupNameSuffix(pod.UID))
	return name, m.cgroupManager.Name(name)
}
3. UpdateQOSCgroups
// UpdateQOSCgroups delegates to the QoS container manager's UpdateCgroups
// and returns whatever error it reports.
func (cm *containerManagerImpl) UpdateQOSCgroups() error {
	err := cm.qosContainerManager.UpdateCgroups()
	return err
}
3.1 UpdateCgroups
// UpdateCgroups prepares cgroup configurations for the Burstable and
// BestEffort QoS tiers. Only the opening of the function is shown here.
func (m *qosContainerManagerImpl) UpdateCgroups() error {
// Hold the manager's lock until the function returns.
m.Lock()
defer m.Unlock()
// One config per QoS tier handled here, each starting from an empty ResourceConfig.
qosConfigs := map[v1.PodQOSClass]*CgroupConfig{
v1.PodQOSBurstable: {
Name: m.qosContainersInfo.Burstable,
ResourceParameters: &ResourceConfig{},
},
v1.PodQOSBestEffort: {
Name: m.qosContainersInfo.BestEffort,
ResourceParameters: &ResourceConfig{},
},
}
3.1.1 setCPUCgroupConfig 函数
BestEffort 的 CPU shares 固定为 2(MinShares);Burstable 的 CPU shares 由 MilliCPUToShares 函数计算,即所有 Burstable pod 的 CPU request(毫核)总和 * 1024 / 1000
// setCPUCgroupConfig writes the CPU shares for the BestEffort and Burstable
// entries of configs. BestEffort always receives MinShares; Burstable receives
// the shares that MilliCPUToShares yields for the summed CPU requests of all
// active burstable pods. The returned error is always nil.
func (m *qosContainerManagerImpl) setCPUCgroupConfig(configs map[v1.PodQOSClass]*CgroupConfig) error {
	// Accumulate the CPU requests (milli-units) of the burstable tier only.
	var totalMilliCPU int64
	for _, pod := range m.activePods() {
		if v1qos.GetPodQOS(pod) != v1.PodQOSBurstable {
			continue
		}
		requests, _ := resource.PodRequestsAndLimits(pod)
		cpu, ok := requests[v1.ResourceCPU]
		if !ok {
			continue
		}
		totalMilliCPU += cpu.MilliValue()
	}

	// Best effort is pinned to the minimum share value.
	minShares := uint64(MinShares)
	configs[v1.PodQOSBestEffort].ResourceParameters.CpuShares = &minShares

	// Burstable shares follow the currently observed requests.
	shares := MilliCPUToShares(totalMilliCPU)
	configs[v1.PodQOSBurstable].ResourceParameters.CpuShares = &shares

	return nil
}
3.1.2 设置 Hugepage
由 feature gate HugePages=true|false (BETA - default=true) 控制;开启时读取 /sys/kernel/mm/hugepages 目录下的大页信息,页大小一般默认为 2MB
// Update the QoS-level cgroup settings for huge pages (ensure they remain unbounded).
// This step only runs when the HugePages feature gate is enabled; any error
// from setHugePagesConfig is returned to the caller immediately.
if utilfeature.DefaultFeatureGate.Enabled(kubefeatures.HugePages) {
if err := m.setHugePagesConfig(qosConfigs); err != nil {
return err
}
}
3.1.3 设置 QOS reserved
由 feature gate QOSReserved=true|false (ALPHA - default=false) 控制;默认关闭,因此默认不会执行该流程,此处暂且略过
// The reservation logic below only runs when the QOSReserved feature gate is enabled.
if utilfeature.DefaultFeatureGate.Enabled(kubefeatures.QOSReserved) {
// Apply the configured reserve percentage; memory is the only resource handled.
for resource, percentReserve := range m.qosReserved {
switch resource {
case v1.ResourceMemory:
m.setMemoryReserve(qosConfigs, percentReserve)
}
}
// First attempt: try every QoS config and remember whether any update failed,
// without stopping at the first error.
updateSuccess := true
for _, config := range qosConfigs {
err := m.cgroupManager.Update(config)
if err != nil {
updateSuccess = false
}
}
// Everything was applied on the first attempt, so the function is done.
if updateSuccess {
klog.V(4).Infof("[ContainerManager]: Updated QoS cgroup configuration")
return nil
}
// If the resource can adjust the ResourceConfig to increase likelihood of
// success, call the adjustment function here. Otherwise, the Update() will
// be called again with the same values.
for resource, percentReserve := range m.qosReserved {
switch resource {
case v1.ResourceMemory:
m.retrySetMemoryReserve(qosConfigs, percentReserve)
}
}
}
3.1.4 更新 BestEffort 和 Burstable 的资源限额
// Hand every QoS config to the cgroup manager; on the first failure,
// log it and return that error.
for _, config := range qosConfigs {
err := m.cgroupManager.Update(config)
if err != nil {
klog.Errorf("[ContainerManager]: Failed to update QoS cgroup configuration")
return err
}
}
4. EnsureExists
确认 pod 的 cgroup 存在,不存在则创建
// EnsureExists makes sure the pod-level cgroup for the given pod is present.
// When the cgroup is missing it is created from the pod's resource
// configuration; afterwards the pod's resource limits are applied to it.
// A non-nil error is returned if either creation or applying limits fails.
func (m *podContainerManagerImpl) EnsureExists(pod *v1.Pod) error {
	podContainerName, _ := m.GetPodContainerName(pod)

	if !m.Exists(pod) {
		// The pod container is missing: assemble its resource parameters first.
		resources := ResourceConfigForPod(pod, m.enforceCPULimits, m.cpuCFSQuotaPeriod)
		// The pids limit is only set when the feature gate is on and a positive limit is configured.
		if utilfeature.DefaultFeatureGate.Enabled(kubefeatures.SupportPodPidsLimit) && m.podPidsLimit > 0 {
			resources.PodPidsLimit = &m.podPidsLimit
		}

		cfg := &CgroupConfig{
			Name:               podContainerName,
			ResourceParameters: resources,
		}
		err := m.cgroupManager.Create(cfg)
		if err != nil {
			return fmt.Errorf("failed to create container for %v : %v", podContainerName, err)
		}
	}

	// Apply the resource limits to the pod container itself. The top-level
	// QoS containers' limits are left untouched here, since keeping their
	// desired state in the kubelet is difficult without checkpointing.
	err := m.applyLimits(pod)
	if err != nil {
		return fmt.Errorf("failed to apply resource limits on container for %v : %v", podContainerName, err)
	}
	return nil
}
总结
# systemd-cgls
Working Directory /sys/fs/cgroup/memory/kubepods:
└─burstable
└─poda2edde4d-930b-11e9-9e6a-080027603363
├─48d1be201c4cc9853e7afeb2f30296043603a84f803870e336c429a0e0a437ae
│ └─12457 mysqld
└─33fb08924385c5ddf0aa23e3990beaca6aa2c221fa480ec85897d6465dc2c8cc
└─12386 /pause