在 Kubernetes 中,每个 Pod 都有一个 QoS 标记,Kubernetes 通过这个 QoS 标记对 Pod 进行服务质量管理。
qos 级别
- Guaranteed:pod 里每个容器都必须设定 request 和 limit,并且值必须相同
- Burstable: 不满足 Guaranteed 的条件,且 pod 里至少有一个容器设定了 cpu 或者 memory 的 request(或 limit)
- BestEffort:POD 的所有容器都没有指定CPU和内存的requests和limits
从容器的角度出发,为了限制容器使用的CPU和内存,是通过cgroup来实现的,目前kubernetes的QoS只能管理CPU和内存,所以kubernetes现在也是通过对cgroup的配置来实现QoS管理的。
2. start qos container manager
启用新的 cgroup 层次结构,启用 QoS 和 Pod 级别的 cgroups kubepods
createNodeAllocatableCgroups 创建新的根 cgroup kubepods
// Setup top level qos containers only if CgroupsPerQOS flag is specified as true
if cm.NodeConfig.CgroupsPerQOS {
// createNodeAllocatableCgroups must succeed before the QoS container
// manager is started; its error is returned unchanged.
if err := cm.createNodeAllocatableCgroups(); err != nil {
return err
}
// Start the QoS container manager, handing it the node-allocatable getter
// and the activePods callback. Note this assigns to an err declared
// outside this excerpt rather than declaring a new one.
err = cm.qosContainerManager.Start(cm.getNodeAllocatableAbsolute, activePods)
if err != nil {
return fmt.Errorf("failed to initialize top level QOS containers: %v", err)
}
}
2.1 根 cgroup 为 kubepods,qos 级别
- BestEffort:POD 中的所有容器都没有指定 CPU 和内存的 requests和limits
- Burstable:POD 既不满足 Guaranteed 也不是 BestEffort,即至少有一个容器设置了 CPU 或内存的 requests/limits,但并非所有容器的 requests 与 limits 都一一相等
- Guaranteed:POD 中所有容器都必须统一设置了 limits,并且设置参数都一致,如果有一个容器要设置requests,那么所有容器都要设置,并设置参数同limits一致
// Start verifies that the root cgroup exists and derives the cgroup names of
// the Burstable and BestEffort QoS classes beneath it.
// NOTE(review): only the opening part of the function is shown in this
// excerpt; the remainder of the body continues in the following snippets.
func (m *qosContainerManagerImpl) Start(getNodeAllocatable func() v1.ResourceList, activePods ActivePodsFunc) error {
cm := m.cgroupManager
rootContainer := m.cgroupRoot
// Refuse to continue when the root cgroup is missing.
if !cm.Exists(rootContainer) {
return fmt.Errorf("root container %v doesn't exist", rootContainer)
}
// Top level for Qos containers are created only for Burstable
// and Best Effort classes
// Each name is the root container plus the lower-cased QoS class name.
qosClasses := map[v1.PodQOSClass]CgroupName{
v1.PodQOSBurstable: NewCgroupName(rootContainer, strings.ToLower(string(v1.PodQOSBurstable))),
v1.PodQOSBestEffort: NewCgroupName(rootContainer, strings.ToLower(string(v1.PodQOSBestEffort))),
}
2.2 创建 qos 级别 BestEffort 和 Burstable
BestEffort 级别的 cpu shares 固定设置为 MinShares(即 2,这是 Linux 内核 cpu.shares 允许的最小值,使 BestEffort 在 CPU 竞争时只分到最低权重);对应的 cgroup 不存在则创建,已存在则更新
// Create containers for both qos classes
// For every QoS class: build its cgroup config, then create the cgroup when
// it is missing or update it when it already exists. Any failure aborts with
// a wrapped error.
for qosClass, containerName := range qosClasses {
// Start from an empty resource config; only BestEffort gets CpuShares here.
resourceParameters := &ResourceConfig{}
// the BestEffort QoS class has a statically configured minShares value
if qosClass == v1.PodQOSBestEffort {
minShares := uint64(MinShares)
resourceParameters.CpuShares = &minShares
}
// containerConfig object stores the cgroup specifications
containerConfig := &CgroupConfig{
Name: containerName,
ResourceParameters: resourceParameters,
}
// for each enumerated huge page size, the qos tiers are unbounded
if utilfeature.DefaultFeatureGate.Enabled(kubefeatures.HugePages) {
m.setHugePagesUnbounded(containerConfig)
}
// check if it exists
if !cm.Exists(containerName) {
if err := cm.Create(containerConfig); err != nil {
return fmt.Errorf("failed to create top level %v QOS cgroup : %v", qosClass, err)
}
} else {
// to ensure we actually have the right state, we update the config on startup
if err := cm.Update(containerConfig); err != nil {
return fmt.Errorf("failed to update top level %v QOS cgroup : %v", qosClass, err)
}
}
}
2.3 缓存 qos 信息,Guaranteed 直接使用 kubepods
// Store the top level qos container names
// Guaranteed maps to the root container itself; Burstable and BestEffort map
// to the per-class names held in qosClasses.
m.qosContainersInfo = QOSContainersInfo{
Guaranteed: rootContainer,
Burstable: qosClasses[v1.PodQOSBurstable],
BestEffort: qosClasses[v1.PodQOSBestEffort],
}
// Keep the two callbacks on the manager for later use.
m.getNodeAllocatable = getNodeAllocatable
m.activePods = activePods
2.4 每隔 periodicQOSCgroupUpdateInterval(上游定义为 1 分钟)调用一次 UpdateCgroups 更新
// update qos cgroup tiers on startup and in periodic intervals
// to ensure desired state is in sync with actual state.
// The loop runs in its own goroutine, is driven by
// periodicQOSCgroupUpdateInterval and is given wait.NeverStop as its stop
// channel; a failed update is only logged as a warning.
go wait.Until(func() {
err := m.UpdateCgroups()
if err != nil {
klog.Warningf("[ContainerManager] Failed to reserve QoS requests: %v", err)
}
}, periodicQOSCgroupUpdateInterval, wait.NeverStop)
3. UpdateCgroups
// UpdateCgroups builds fresh resource configs for the Burstable and
// BestEffort QoS cgroups while holding the manager lock.
// NOTE(review): only the opening part of the function is shown in this
// excerpt; the remainder of the body continues in the following snippets.
func (m *qosContainerManagerImpl) UpdateCgroups() error {
// Hold the manager lock until the function returns.
m.Lock()
defer m.Unlock()
// One config per QoS cgroup, each starting with empty resource parameters.
qosConfigs := map[v1.PodQOSClass]*CgroupConfig{
v1.PodQOSBurstable: {
Name: m.qosContainersInfo.Burstable,
ResourceParameters: &ResourceConfig{},
},
v1.PodQOSBestEffort: {
Name: m.qosContainersInfo.BestEffort,
ResourceParameters: &ResourceConfig{},
},
}
3.1 setCPUCgroupConfig 函数
BestEffort 的 cpu share 固定为 2;Burstable 的 cpu share 由 MilliCPUToShares 函数计算:先累加所有 Burstable pod 的 CPU request(单位 milliCPU),再换算为 总和 * 1024 / 1000
// setCPUCgroupConfig fills in the CpuShares field of the BestEffort and
// Burstable entries of configs.
//
// BestEffort is always given MinShares. Burstable is given the result of
// MilliCPUToShares applied to the sum of the CPU requests (in milli-CPU) of
// all active pods whose QoS class is Burstable. Both entries must already be
// present in configs with non-nil ResourceParameters. The function currently
// always returns nil.
func (m *qosContainerManagerImpl) setCPUCgroupConfig(configs map[v1.PodQOSClass]*CgroupConfig) error {
	// Accumulate the CPU requests of Burstable pods only; pods of any other
	// QoS class do not contribute.
	var totalBurstableMilliCPU int64
	for _, pod := range m.activePods() {
		if v1qos.GetPodQOS(pod) == v1.PodQOSBurstable {
			requests, _ := resource.PodRequestsAndLimits(pod)
			if cpu, ok := requests[v1.ResourceCPU]; ok {
				totalBurstableMilliCPU += cpu.MilliValue()
			}
		}
	}

	// BestEffort is pinned to the minimum number of shares.
	minShares := uint64(MinShares)
	configs[v1.PodQOSBestEffort].ResourceParameters.CpuShares = &minShares

	// Burstable shares follow the currently observed total request.
	burstableShares := MilliCPUToShares(totalBurstableMilliCPU)
	configs[v1.PodQOSBurstable].ResourceParameters.CpuShares = &burstableShares

	return nil
}
3.2 设置 Hugepage
由特性开关 HugePages=true|false (BETA - default=true) 控制;开启时会读取 /sys/kernel/mm/hugepages 目录获取节点支持的大页尺寸(一般默认为 2MB),并保证 QoS 级别的 cgroup 对各尺寸大页不设上限
// update the qos level cgroup settings for huge pages (ensure they remain unbounded)
// Skipped entirely when the HugePages feature gate is off; an error from
// setHugePagesConfig aborts the update and is returned as-is.
if utilfeature.DefaultFeatureGate.Enabled(kubefeatures.HugePages) {
if err := m.setHugePagesConfig(qosConfigs); err != nil {
return err
}
}
3.3 设置 QOS reserved
由特性开关 QOSReserved=true|false (ALPHA - default=false) 控制,默认关闭,因此默认不会执行该流程,这里暂且略过
// Everything in this block runs only when the QOSReserved feature gate is on.
if utilfeature.DefaultFeatureGate.Enabled(kubefeatures.QOSReserved) {
// Apply the configured reserve percentages; memory is the only resource
// handled here, all others are ignored.
for resource, percentReserve := range m.qosReserved {
switch resource {
case v1.ResourceMemory:
m.setMemoryReserve(qosConfigs, percentReserve)
}
}
// First attempt: push every config, remembering whether any update failed
// but still trying the remaining ones. The individual errors are dropped.
updateSuccess := true
for _, config := range qosConfigs {
err := m.cgroupManager.Update(config)
if err != nil {
updateSuccess = false
}
}
// Everything applied cleanly, so the function returns here.
if updateSuccess {
klog.V(4).Infof("[ContainerManager]: Updated QoS cgroup configuration")
return nil
}
// If the resource can adjust the ResourceConfig to increase likelihood of
// success, call the adjustment function here. Otherwise, the Update() will
// be called again with the same values.
for resource, percentReserve := range m.qosReserved {
switch resource {
case v1.ResourceMemory:
m.retrySetMemoryReserve(qosConfigs, percentReserve)
}
}
}
3.4 对 BestEffort 和 Burstable 进行更新资源限额
// Apply each QoS config through the cgroup manager. The first failure is
// logged and returned, so later configs in the map are not attempted.
// Note the log message itself does not include the error value.
for _, config := range qosConfigs {
err := m.cgroupManager.Update(config)
if err != nil {
klog.Errorf("[ContainerManager]: Failed to update QoS cgroup configuration")
return err
}
}
总结:
在根 cgroup kubepods 下创建 besteffort 和 burstable 两个子 cgroup,并将 besteffort 的 cpu share 设置为 2
Guaranteed 直接使用 kubepods 资源限制
按 periodicQOSCgroupUpdateInterval(上游定义为 1 分钟)定期更新 cgroup:为 Burstable 设置 cpu share 值,并更新 Burstable 和 BestEffort 的资源限额