【kubernetes/k8s source code analysis】 kubelet QoS cgroup source code analysis

 

    In Kubernetes, every Pod carries a QoS class, and this class drives how the Pod's quality of service is managed.

 

QoS classes

  • Guaranteed: every container in the pod sets both requests and limits for CPU and memory, and for each container the two values are equal
  • Burstable: the pod is not Guaranteed, and at least one container sets a CPU or memory request (or limit)
  • BestEffort: none of the pod's containers set any CPU or memory requests or limits

 

From the container's point of view, CPU and memory limits are enforced through cgroups. Kubernetes QoS currently manages only CPU and memory, so Kubernetes implements QoS management by configuring these cgroups accordingly.
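As a quick illustration, the class can be computed with the in-tree helper v1qos.GetPodQOS (the same helper the qos container manager uses below); the pod spec values here are made up:

package main

import (
	"fmt"

	v1 "k8s.io/api/core/v1"
	"k8s.io/apimachinery/pkg/api/resource"
	v1qos "k8s.io/kubernetes/pkg/apis/core/v1/helper/qos"
)

func main() {
	// One container with requests == limits -> Guaranteed.
	pod := &v1.Pod{
		Spec: v1.PodSpec{
			Containers: []v1.Container{{
				Name: "app",
				Resources: v1.ResourceRequirements{
					Requests: v1.ResourceList{
						v1.ResourceCPU:    resource.MustParse("100m"),
						v1.ResourceMemory: resource.MustParse("128Mi"),
					},
					Limits: v1.ResourceList{
						v1.ResourceCPU:    resource.MustParse("100m"),
						v1.ResourceMemory: resource.MustParse("128Mi"),
					},
				},
			}},
		},
	}
	fmt.Println(v1qos.GetPodQOS(pod)) // prints "Guaranteed"
	// Drop the Limits block and it becomes Burstable; drop Resources
	// entirely and it becomes BestEffort.
}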
 

2. start qos container manager

     When the CgroupsPerQOS flag enables the new cgroup hierarchy, kubelet creates QoS- and pod-level cgroups under the root cgroup kubepods.

      createNodeAllocatableCgroups creates the new root cgroup kubepods.

// Setup top level qos containers only if CgroupsPerQOS flag is specified as true
if cm.NodeConfig.CgroupsPerQOS {
	if err := cm.createNodeAllocatableCgroups(); err != nil {
		return err
	}
	err = cm.qosContainerManager.Start(cm.getNodeAllocatableAbsolute, activePods)
	if err != nil {
		return fmt.Errorf("failed to initialize top level QOS containers: %v", err)
	}
}
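Assuming the cgroupfs driver on cgroup v1, the result is a hierarchy like the following under each controller (e.g. /sys/fs/cgroup/cpu); with the systemd driver the names become slices such as kubepods.slice instead:

kubepods
├── besteffort        // BestEffort pods' cgroups go here
├── burstable         // Burstable pods' cgroups go here
└── pod<UID>          // Guaranteed pods sit directly under kubepods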

     2.1 The root cgroup is kubepods; the QoS classes

  • BestEffort: none of the pod's containers specify CPU or memory requests or limits
  • Burstable: the pod does not meet the Guaranteed criteria, but at least one container sets a CPU or memory request or limit
  • Guaranteed: every container in the pod sets CPU and memory limits, all with requests equal to limits (unset requests default to the limits)
func (m *qosContainerManagerImpl) Start(getNodeAllocatable func() v1.ResourceList, activePods ActivePodsFunc) error {
	cm := m.cgroupManager
	rootContainer := m.cgroupRoot
	if !cm.Exists(rootContainer) {
		return fmt.Errorf("root container %v doesn't exist", rootContainer)
	}

	// Top level for Qos containers are created only for Burstable
	// and Best Effort classes
	qosClasses := map[v1.PodQOSClass]CgroupName{
		v1.PodQOSBurstable:  NewCgroupName(rootContainer, strings.ToLower(string(v1.PodQOSBurstable))),
		v1.PodQOSBestEffort: NewCgroupName(rootContainer, strings.ToLower(string(v1.PodQOSBestEffort))),
	}

     2.2 Create the BestEffort and Burstable QoS cgroups

      The BestEffort tier's cpu.shares is statically set to 2 (MinShares); 2 is the smallest value the kernel accepts for cpu.shares, so BestEffort containers only receive CPU time the other tiers leave unused. The cgroup is created if it does not exist, and updated if it already does.

// Create containers for both qos classes
for qosClass, containerName := range qosClasses {
	resourceParameters := &ResourceConfig{}
	// the BestEffort QoS class has a statically configured minShares value
	if qosClass == v1.PodQOSBestEffort {
		minShares := uint64(MinShares)
		resourceParameters.CpuShares = &minShares
	}

	// containerConfig object stores the cgroup specifications
	containerConfig := &CgroupConfig{
		Name:               containerName,
		ResourceParameters: resourceParameters,
	}

	// for each enumerated huge page size, the qos tiers are unbounded
	if utilfeature.DefaultFeatureGate.Enabled(kubefeatures.HugePages) {
		m.setHugePagesUnbounded(containerConfig)
	}

	// check if it exists
	if !cm.Exists(containerName) {
		if err := cm.Create(containerConfig); err != nil {
			return fmt.Errorf("failed to create top level %v QOS cgroup : %v", qosClass, err)
		}
	} else {
		// to ensure we actually have the right state, we update the config on startup
		if err := cm.Update(containerConfig); err != nil {
			return fmt.Errorf("failed to update top level %v QOS cgroup : %v", qosClass, err)
		}
	}
}
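Assuming the cgroupfs driver, the resulting static value can be verified on a node (the Burstable value is recomputed at runtime, see section 3):

/sys/fs/cgroup/cpu/kubepods/besteffort/cpu.shares -> 2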

    2.3 Cache the QoS container info; Guaranteed uses kubepods directly

// Store the top level qos container names
m.qosContainersInfo = QOSContainersInfo{
	Guaranteed: rootContainer,
	Burstable:  qosClasses[v1.PodQOSBurstable],
	BestEffort: qosClasses[v1.PodQOSBestEffort],
}
m.getNodeAllocatable = getNodeAllocatable
m.activePods = activePods

    2.4 Periodically call UpdateCgroups to keep the actual state in sync (the interval constant is shown after the snippet)

// update qos cgroup tiers on startup and in periodic intervals
// to ensure desired state is in sync with actual state.
go wait.Until(func() {
	err := m.UpdateCgroups()
	if err != nil {
		klog.Warningf("[ContainerManager] Failed to reserve QoS requests: %v", err)
	}
}, periodicQOSCgroupUpdateInterval, wait.NeverStop)
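The interval is the package-level constant periodicQOSCgroupUpdateInterval in pkg/kubelet/cm/qos_container_manager_linux.go; in recent releases it is one minute, not one second (older releases may differ):

const (
	// how often the qos cgroup manager will perform periodic update
	// of the qos level cgroup resource constraints
	periodicQOSCgroupUpdateInterval = 1 * time.Minute
)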

   

3. UpdateCgroups

func (m *qosContainerManagerImpl) UpdateCgroups() error {
	m.Lock()
	defer m.Unlock()

	qosConfigs := map[v1.PodQOSClass]*CgroupConfig{
		v1.PodQOSBurstable: {
			Name:               m.qosContainersInfo.Burstable,
			ResourceParameters: &ResourceConfig{},
		},
		v1.PodQOSBestEffort: {
			Name:               m.qosContainersInfo.BestEffort,
			ResourceParameters: &ResourceConfig{},
		},
	}

    3.1 The setCPUCgroupConfig function

     BestEffort is pinned at cpu.shares = 2. For Burstable, the value is computed by MilliCPUToShares from the summed CPU requests of all Burstable pods: total milli-CPU * 1024 / 1000. A sketch of the helper follows the snippet.

func (m *qosContainerManagerImpl) setCPUCgroupConfig(configs map[v1.PodQOSClass]*CgroupConfig) error {
	pods := m.activePods()
	burstablePodCPURequest := int64(0)
	for i := range pods {
		pod := pods[i]
		qosClass := v1qos.GetPodQOS(pod)
		if qosClass != v1.PodQOSBurstable {
			// we only care about the burstable qos tier
			continue
		}
		req, _ := resource.PodRequestsAndLimits(pod)
		if request, found := req[v1.ResourceCPU]; found {
			burstablePodCPURequest += request.MilliValue()
		}
	}

	// make sure best effort is always 2 shares
	bestEffortCPUShares := uint64(MinShares)
	configs[v1.PodQOSBestEffort].ResourceParameters.CpuShares = &bestEffortCPUShares

	// set burstable shares based on current observe state
	burstableCPUShares := MilliCPUToShares(burstablePodCPURequest)
	configs[v1.PodQOSBurstable].ResourceParameters.CpuShares = &burstableCPUShares
	return nil
}
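For reference, a sketch of the MilliCPUToShares helper from pkg/kubelet/cm/helpers_linux.go (constants MinShares = 2, SharesPerCPU = 1024, MilliCPUToCPU = 1000; the exact shape may vary by version):

// MilliCPUToShares converts a CPU request in millicores to cpu.shares.
func MilliCPUToShares(milliCPU int64) uint64 {
	if milliCPU == 0 {
		// Zero request: use the kernel minimum so the cgroup still
		// gets a little CPU time under contention.
		return uint64(MinShares)
	}
	// Conceptually (milliCPU / 1000) * 1024, ordered to avoid losing
	// precision to integer division.
	shares := (milliCPU * SharesPerCPU) / MilliCPUToCPU
	if shares < MinShares {
		return uint64(MinShares)
	}
	return uint64(shares)
}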

    3.2 Setting huge pages

     Gated by HugePages=true|false (BETA - default=true). The supported huge page sizes are discovered from the host under /sys/kernel/mm/hugepages (commonly 2 MB on x86_64), and the QoS tiers leave them unbounded.

// update the qos level cgroup settings for huge pages (ensure they remain unbounded)
if utilfeature.DefaultFeatureGate.Enabled(kubefeatures.HugePages) {
	if err := m.setHugePagesConfig(qosConfigs); err != nil {
		return err
	}
}
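"Unbounded" concretely means writing a practically infinite limit for every huge page size the host supports. A simplified sketch of what setHugePagesUnbounded does (helper name and plumbing simplified from upstream):

// For every huge page size on the host (in bytes), set the qos tier's
// hugetlb limit to an effectively infinite value so the tier never
// bounds huge page usage; per-pod limits still apply below it.
func setHugePagesUnboundedSketch(cfg *CgroupConfig, pageSizesBytes []int64) {
	hugePageLimit := map[int64]int64{}
	for _, pageSizeBytes := range pageSizesBytes {
		hugePageLimit[pageSizeBytes] = int64(1 << 62) // "unbounded"
	}
	cfg.ResourceParameters.HugePageLimit = hugePageLimit
}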

    3.3 Setting the QoS reserve

    Gated by QOSReserved=true|false (ALPHA - default=false); with the default setting the branch below is skipped, so it is only touched on briefly here.

	if utilfeature.DefaultFeatureGate.Enabled(kubefeatures.QOSReserved) {
		for resource, percentReserve := range m.qosReserved {
			switch resource {
			case v1.ResourceMemory:
				m.setMemoryReserve(qosConfigs, percentReserve)
			}
		}

		updateSuccess := true
		for _, config := range qosConfigs {
			err := m.cgroupManager.Update(config)
			if err != nil {
				updateSuccess = false
			}
		}
		if updateSuccess {
			klog.V(4).Infof("[ContainerManager]: Updated QoS cgroup configuration")
			return nil
		}

		// If the resource can adjust the ResourceConfig to increase likelihood of
		// success, call the adjustment function here.  Otherwise, the Update() will
		// be called again with the same values.
		for resource, percentReserve := range m.qosReserved {
			switch resource {
			case v1.ResourceMemory:
				m.retrySetMemoryReserve(qosConfigs, percentReserve)
			}
		}
	}
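The intent of the memory reserve, as I read setMemoryReserve (a simplified sketch, not the upstream code): a percentage of the memory requested by higher tiers is walled off from lower tiers by lowering their cgroup memory limits.

// All values in bytes; percentReserve is the configured reserve, e.g.
// memory=100% means lower tiers can never touch memory requested above them.
func qosMemoryLimitsSketch(allocatable, guaranteedReq, burstableReq, percentReserve int64) (burstableLimit, bestEffortLimit int64) {
	// Burstable: node allocatable minus the reserved share of Guaranteed requests.
	burstableLimit = allocatable - guaranteedReq*percentReserve/100
	// BestEffort: additionally minus the reserved share of Burstable requests.
	bestEffortLimit = burstableLimit - burstableReq*percentReserve/100
	return burstableLimit, bestEffortLimit
}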

    3.4 Update the resource limits of BestEffort and Burstable

for _, config := range qosConfigs {
	err := m.cgroupManager.Update(config)
	if err != nil {
		klog.Errorf("[ContainerManager]: Failed to update QoS cgroup configuration")
		return err
	}
}

 

Summary:

   Under the root cgroup kubepods, the besteffort and burstable cgroups are created, and besteffort's cpu.shares is set to 2.

   Guaranteed pods sit directly under kubepods and are bounded by its resource limits.

   The cgroups are re-synced periodically (periodicQOSCgroupUpdateInterval): Burstable's cpu.shares is recomputed from the currently active pods, and the Burstable and BestEffort resource limits are refreshed.
