Kubernetes 1.8 Source Code Analysis: Resource Scheduling

The earlier 1.4 source-code analysis already covered the overall scheduling flow, so this post fills in some of the details. First, which scheduling algorithms does Kubernetes load? There are two sources: a policy configuration file, or the defaults specified in code. Let's look at the defaults in plugin/pkg/scheduler/algorithmprovider/defaults/defaults.go, starting with the predicates (filtering):

func defaultPredicates() sets.String {
    return sets.NewString(
        // Fit is determined by volume zone requirements.
        factory.RegisterFitPredicateFactory(
            "NoVolumeZoneConflict",
            func(args factory.PluginFactoryArgs) algorithm.FitPredicate {
                return predicates.NewVolumeZonePredicate(args.PVInfo, args.PVCInfo)
            },
        ),
        // Fit is determined by whether or not there would be too many AWS EBS volumes attached to the node
        factory.RegisterFitPredicateFactory(
            "MaxEBSVolumeCount",
            func(args factory.PluginFactoryArgs) algorithm.FitPredicate {
                // TODO: allow for generically parameterized scheduler predicates, because this is a bit ugly
                maxVols := getMaxVols(aws.DefaultMaxEBSVolumes)
                return predicates.NewMaxPDVolumeCountPredicate(predicates.EBSVolumeFilter, maxVols, args.PVInfo, args.PVCInfo)
            },
        ),
        // Fit is determined by whether or not there would be too many GCE PD volumes attached to the node
        factory.RegisterFitPredicateFactory(
            "MaxGCEPDVolumeCount",
            func(args factory.PluginFactoryArgs) algorithm.FitPredicate {
                // TODO: allow for generically parameterized scheduler predicates, because this is a bit ugly
                maxVols := getMaxVols(DefaultMaxGCEPDVolumes)
                return predicates.NewMaxPDVolumeCountPredicate(predicates.GCEPDVolumeFilter, maxVols, args.PVInfo, args.PVCInfo)
            },
        ),
        // Fit is determined by whether or not there would be too many Azure Disk volumes attached to the node
        factory.RegisterFitPredicateFactory(
            "MaxAzureDiskVolumeCount",
            func(args factory.PluginFactoryArgs) algorithm.FitPredicate {
                // TODO: allow for generically parameterized scheduler predicates, because this is a bit ugly
                maxVols := getMaxVols(DefaultMaxAzureDiskVolumes)
                return predicates.NewMaxPDVolumeCountPredicate(predicates.AzureDiskVolumeFilter, maxVols, args.PVInfo, args.PVCInfo)
            },
        ),
        // Fit is determined by inter-pod affinity.
        factory.RegisterFitPredicateFactory(
            "MatchInterPodAffinity",
            func(args factory.PluginFactoryArgs) algorithm.FitPredicate {
                return predicates.NewPodAffinityPredicate(args.NodeInfo, args.PodLister)
            },
        ),

        // Fit is determined by non-conflicting disk volumes.
        factory.RegisterFitPredicate("NoDiskConflict", predicates.NoDiskConflict),

        // GeneralPredicates are the predicates that are enforced by all Kubernetes components
        // (e.g. kubelet and all schedulers)
        factory.RegisterFitPredicate("GeneralPredicates", predicates.GeneralPredicates),

        // Fit is determined based on whether a pod can tolerate all of the node's taints
        factory.RegisterFitPredicate("PodToleratesNodeTaints", predicates.PodToleratesNodeTaints),

        // Fit is determined by node memory pressure condition.
        factory.RegisterFitPredicate("CheckNodeMemoryPressure", predicates.CheckNodeMemoryPressurePredicate),

        // Fit is determined by node disk pressure condition.
        factory.RegisterFitPredicate("CheckNodeDiskPressure", predicates.CheckNodeDiskPressurePredicate),

        // Fit is determined by node disk mount condition.
        factory.RegisterFitPredicate("CheckNodeDiskMountPressure", predicates.CheckNodeDiskMountPressurePredicate),

        // Fit is determined by volume zone requirements.
        factory.RegisterFitPredicateFactory(
            "NoVolumeNodeConflict",
            func(args factory.PluginFactoryArgs) algorithm.FitPredicate {
                return predicates.NewVolumeNodePredicate(args.PVInfo, args.PVCInfo, nil)
            },
        ),
    )
}
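For completeness, a note on how these names get used: the init() in defaults.go registers the sets returned by defaultPredicates() and defaultPriorities() as the scheduler's default algorithm provider, and a policy configuration file passed to the scheduler overrides that selection. The following is only a rough sketch of the wiring; the helper names may differ slightly from the actual 1.8 tree.

func init() {
    // Register the default predicate and priority sets under the default
    // provider name; when the scheduler is started with a policy config file,
    // that file is used instead of this provider. (Sketch, not verbatim source.)
    registerAlgorithmProvider(defaultPredicates(), defaultPriorities())
}

func registerAlgorithmProvider(predSet, priSet sets.String) {
    // factory.DefaultProvider is the name the scheduler falls back to when no
    // explicit provider or policy file is configured.
    factory.RegisterAlgorithmProvider(factory.DefaultProvider, predSet, priSet)
}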

The predicates mainly cover disk conflicts, memory pressure, and node conditions; they also include checks such as node ports and label selectors (via GeneralPredicates).
Next come the default priority (scoring) policies:

func defaultPriorities() sets.String {
    return sets.NewString(
        // spreads pods by minimizing the number of pods (belonging to the same service or replication controller) on the same node.
        factory.RegisterPriorityConfigFactory(
            "SelectorSpreadPriority",
            factory.PriorityConfigFactory{
                Function: func(args factory.PluginFactoryArgs) algorithm.PriorityFunction {
                    return priorities.NewSelectorSpreadPriority(args.ServiceLister, args.ControllerLister, args.ReplicaSetLister, args.StatefulSetLister)
                },
                Weight: 1,
            },
        ),
        // pods should be placed in the same topological domain (e.g. same node, same rack, same zone, same power domain, etc.)
        // as some other pods, or, conversely, should not be placed in the same topological domain as some other pods.
        factory.RegisterPriorityConfigFactory(
            "InterPodAffinityPriority",
            factory.PriorityConfigFactory{
                Function: func(args factory.PluginFactoryArgs) algorithm.PriorityFunction {
                    return priorities.NewInterPodAffinityPriority(args.NodeInfo, args.NodeLister, args.PodLister, args.HardPodAffinitySymmetricWeight)
                },
                Weight: 1,
            },
        ),

        // Prioritize nodes by least requested utilization.
        factory.RegisterPriorityFunction2("LeastRequestedPriority", priorities.LeastRequestedPriorityMap, nil, 1),

        // Prioritizes nodes to help achieve balanced resource usage
        factory.RegisterPriorityFunction2("BalancedResourceAllocation", priorities.BalancedResourceAllocationMap, nil, 1),

        // Set this weight large enough to override all other priority functions.
        // TODO: Figure out a better way to do this, maybe at same time as fixing #24720.
        factory.RegisterPriorityFunction2("NodePreferAvoidPodsPriority", priorities.CalculateNodePreferAvoidPodsPriorityMap, nil, 10000),

        // Prioritizes nodes that have labels matching NodeAffinity
        factory.RegisterPriorityFunction2("NodeAffinityPriority", priorities.CalculateNodeAffinityPriorityMap, priorities.CalculateNodeAffinityPriorityReduce, 1),

        // TODO: explain what it does.
        factory.RegisterPriorityFunction2("TaintTolerationPriority", priorities.ComputeTaintTolerationPriorityMap, priorities.ComputeTaintTolerationPriorityReduce, 1),
    )
}
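Each priority function scores every candidate node in the range 0-10; the scheduler multiplies each score by the Weight registered above and sums the results per node, which is why NodePreferAvoidPodsPriority's weight of 10000 lets it override everything else. A simplified sketch of that combination (the real logic lives in PrioritizeNodes and also handles Map/Reduce-style priorities and extenders):

    // result[nodeName] accumulates weight * score across all priority functions.
    result := make(map[string]int, len(nodes))
    for _, priorityConfig := range priorityConfigs {
        hostPriorityList, err := priorityConfig.Function(pod, nodeNameToInfo, nodes)
        if err != nil {
            return nil, err
        }
        for _, hostPriority := range hostPriorityList {
            // Score is 0-10, so a weight of 10000 dominates every weight-1 priority.
            result[hostPriority.Host] += hostPriority.Score * priorityConfig.Weight
        }
    }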

These priorities mainly optimize placement so that resource usage stays balanced across nodes, and handle label matching and inter-pod affinity. Once the system has loaded these predicate and priority algorithms, how is an incoming pod actually scheduled? Before walking through the scheduling itself, let's look at the cache mechanism. It works the same way as the caches in other Kubernetes components: it avoids hammering the apiserver with frequent calls. First, the Cache interface:

type Cache interface {
    // AssumePod assumes a pod scheduled and aggregates the pod's information into its node.
    // The implementation also decides the policy to expire pod before being confirmed (receiving Add event).
    // After expiration, its information would be subtracted.
    AssumePod(pod *v1.Pod) error

    // FinishBinding signals that cache for assumed pod can be expired
    FinishBinding(pod *v1.Pod) error

    // ForgetPod removes an assumed pod from cache.
    ForgetPod(pod *v1.Pod) error

    // AddPod either confirms a pod if it's assumed, or adds it back if it's expired.
    // If added back, the pod's information would be added again.
    AddPod(pod *v1.Pod) error

    // UpdatePod removes oldPod's information and adds newPod's information.
    UpdatePod(oldPod, newPod *v1.Pod) error

    // RemovePod removes a pod. The pod's information would be subtracted from assigned node.
    RemovePod(pod *v1.Pod) error

    // AddNode adds overall information about node.
    AddNode(node *v1.Node) error

    // UpdateNode updates overall information about node.
    UpdateNode(oldNode, newNode *v1.Node) error

    // RemoveNode removes overall information about node.
    RemoveNode(node *v1.Node) error

    // UpdateNodeNameToInfoMap updates the passed infoMap to the current contents of Cache.
    // The node info contains aggregated information of pods scheduled (including assumed to be)
    // on this node.
    UpdateNodeNameToInfoMap(infoMap map[string]*NodeInfo) error

    // List lists all cached pods (including assumed ones).
    List(labels.Selector) ([]*v1.Pod, error)
}
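To see how these methods fit together, here is a condensed, illustrative sketch of the assume-then-bind flow the scheduler drives against this interface (the real code is in plugin/pkg/scheduler/scheduler.go and is structured differently; the bind callback here is a stand-in for the apiserver binding call):

func assumeAndBind(cache schedulercache.Cache, pod *v1.Pod, host string, bind func(*v1.Pod, string) error) error {
    assumed := *pod
    assumed.Spec.NodeName = host
    // Optimistically record the pod on the chosen node so later scheduling
    // decisions already see its resources as consumed.
    if err := cache.AssumePod(&assumed); err != nil {
        return err
    }
    // Binding against the apiserver happens asynchronously.
    go func() {
        if err := bind(&assumed, host); err != nil {
            // On failure, drop the assumed pod so its resources are released.
            cache.ForgetPod(&assumed)
            return
        }
        // Let the assumed pod start its expiration timer; the real Add event
        // from the watch will later confirm it via AddPod.
        cache.FinishBinding(&assumed)
    }()
    return nil
}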

The interface covers methods for both pods and nodes; whenever a watch reports a change to one of these resources, the cached data is updated. For brevity, let's trace just one path, starting with the event-handler registration in plugin/pkg/scheduler/factory/factory.go:

    nodeInformer.Informer().AddEventHandlerWithResyncPeriod(
        cache.ResourceEventHandlerFuncs{
            AddFunc:    c.addNodeToCache,
            UpdateFunc: c.updateNodeInCache,
            DeleteFunc: c.deleteNodeFromCache,
        },
        0,
    )
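The AddFunc above, c.addNodeToCache, is essentially a thin adapter between the informer and the scheduler cache. Roughly (a sketch; the exact logging and error handling in factory.go may differ):

func (c *ConfigFactory) addNodeToCache(obj interface{}) {
    // The informer delivers an interface{}; convert it back to a *v1.Node.
    node, ok := obj.(*v1.Node)
    if !ok {
        glog.Errorf("cannot convert to *v1.Node: %v", obj)
        return
    }
    // Hand the node to the scheduler cache, shown below.
    if err := c.schedulerCache.AddNode(node); err != nil {
        glog.Errorf("scheduler cache AddNode failed: %v", err)
    }
}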

The node add event above therefore triggers the cache's AddNode method:

func (cache *schedulerCache) AddNode(node *v1.Node) error {
    cache.mu.Lock()
    defer cache.mu.Unlock()

    n, ok := cache.nodes[node.Name]
    if !ok {
        n = NewNodeInfo()
        cache.nodes[node.Name] = n
    }
    return n.SetNode(node)
}

The cache now holds this node's information. The stored data is populated by SetNode:

func (n *NodeInfo) SetNode(node *v1.Node) error {
    n.node = node
    for rName, rQuant := range node.Status.Allocatable {
        switch rName {
        case v1.ResourceCPU:
            n.allocatableResource.MilliCPU = rQuant.MilliValue()
        case v1.ResourceMemory:
            n.allocatableResource.Memory = rQuant.Value()
        case v1.ResourceNvidiaGPU:
            n.allocatableResource.NvidiaGPU = rQuant.Value()
        case v1.ResourcePods:
            n.allowedPodNumber = int(rQuant.Value())
        case v1.ResourceStorageScratch:
            n.allocatableResource.StorageScratch = rQuant.Value()
        case v1.ResourceStorageOverlay:
            n.allocatableResource.StorageOverlay = rQuant.Value()
        default:
            if v1helper.IsOpaqueIntResourceName(rName) {
                n.allocatableResource.SetOpaque(rName, rQuant.Value())
            }
        }
    }
    n.taints = node.Spec.Taints
    for i := range node.Status.Conditions {
        cond := &node.Status.Conditions[i]
        switch cond.Type {
        case v1.NodeMemoryPressure:
            n.memoryPressureCondition = cond.Status
        case v1.NodeDiskPressure:
            n.diskPressureCondition = cond.Status
        case v1.NodeDiskMountPressure:
            n.diskMountPressureCondition = cond.Status
        default:
            // We ignore other conditions.
        }
    }
    n.generation++
    return nil
}
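The allocatable values recorded here are exactly what the resource-fit predicates later compare against. A simplified illustration of that comparison (the real check is PodFitsResources inside GeneralPredicates and covers more resource kinds and failure reasons; fitsResources is a made-up name):

func fitsResources(podRequest *schedulercache.Resource, nodeInfo *schedulercache.NodeInfo) bool {
    // AllocatableResource is what SetNode recorded from node.Status.Allocatable;
    // RequestedResource is the sum of requests of pods already on the node.
    allocatable := nodeInfo.AllocatableResource()
    used := nodeInfo.RequestedResource()
    if podRequest.MilliCPU > allocatable.MilliCPU-used.MilliCPU {
        return false
    }
    if podRequest.Memory > allocatable.Memory-used.Memory {
        return false
    }
    return true
}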

Besides node state, the cache also caches pod state (plugin/pkg/scheduler/schedulercache/cache.go):

    podStates map[string]*podState
    nodes     map[string]*NodeInfo
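For context, those two maps live inside the schedulerCache struct; a trimmed sketch of it and of podState follows (field set recalled from the 1.8 tree and possibly incomplete):

type schedulerCache struct {
    ttl    time.Duration
    period time.Duration
    stop   <-chan struct{}

    // mu guards the maps below.
    mu sync.Mutex
    // assumedPods tracks pods that were assumed onto a node but whose Add
    // event has not been observed yet; they expire after ttl.
    assumedPods map[string]bool
    podStates   map[string]*podState
    nodes       map[string]*NodeInfo
}

type podState struct {
    pod *v1.Pod
    // deadline after which an assumed pod is expired from the cache.
    deadline *time.Time
    // bindingFinished is set by FinishBinding and allows expiration to start.
    bindingFinished bool
}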

With the cache mechanism covered, back to the original question: how does an incoming pod get scheduled and bound? When a pod needs to be scheduled, the Schedule method in plugin/pkg/scheduler/core/generic_scheduler.go runs:

    err = g.cache.UpdateNodeNameToInfoMap(g.cachedNodeInfoMap)
    if err != nil {
        return "", err
    }

    trace.Step("Computing predicates")
    filteredNodes, failedPredicateMap, err := findNodesThatFit(pod, g.cachedNodeInfoMap, nodes, g.predicates, g.extenders, g.predicateMetaProducer, g.equivalenceCache)
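For orientation, the surrounding Schedule method has roughly the following shape; the snippet above is steps 1 and 2. This is a condensed sketch: error handling, tracing, and metadata plumbing are trimmed, and helper names such as PrioritizeNodes, selectHost, and FitError are recalled from the 1.8 tree.

func (g *genericScheduler) Schedule(pod *v1.Pod, nodeLister algorithm.NodeLister) (string, error) {
    nodes, err := nodeLister.List()
    if err != nil {
        return "", err
    }

    // 1. Refresh the scheduler's per-node view from the cache.
    if err := g.cache.UpdateNodeNameToInfoMap(g.cachedNodeInfoMap); err != nil {
        return "", err
    }

    // 2. Predicates: keep only the nodes the pod fits on.
    filteredNodes, failedPredicateMap, err := findNodesThatFit(pod, g.cachedNodeInfoMap, nodes,
        g.predicates, g.extenders, g.predicateMetaProducer, g.equivalenceCache)
    if err != nil {
        return "", err
    }
    if len(filteredNodes) == 0 {
        return "", &FitError{Pod: pod, FailedPredicates: failedPredicateMap}
    }

    // 3. Priorities: score the surviving nodes, then pick the best one.
    priorityList, err := PrioritizeNodes(pod, g.cachedNodeInfoMap, g.priorityMetaProducer,
        g.prioritizers, filteredNodes, g.extenders)
    if err != nil {
        return "", err
    }
    return g.selectHost(priorityList)
}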

Two methods here deserve a closer look. The first is UpdateNodeNameToInfoMap, which refreshes the node information used for scheduling (plugin/pkg/scheduler/schedulercache/cache.go):

func (cache *schedulerCache) UpdateNodeNameToInfoMap(nodeNameToInfo map[string]*NodeInfo) error {
    cache.mu.Lock()
    defer cache.mu.Unlock()
    for name, info := range cache.nodes {
        if current, ok := nodeNameToInfo[name]; !ok || current.generation != info.generation {
            nodeNameToInfo[name] = info.Clone()
        }
    }
    for name := range nodeNameToInfo {
        if _, ok := cache.nodes[name]; !ok {
            delete(nodeNameToInfo, name)
        }
    }
    return nil
}

Here info.Clone() copies each node's information out of the cache into nodeNameToInfo, the map of candidate nodes consulted during scheduling. The second method is the important one: findNodesThatFit, which performs the actual filtering. Stepping into it:

fits, failedPredicates, err := podFitsOnNode(pod, meta, nodeNameToInfo[nodeName], predicateFuncs, ecache)
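That call sits inside a per-node closure that findNodesThatFit fans out in parallel, roughly like this (a sketch; the real closure also handles errors and the extender path):

    checkNode := func(i int) {
        nodeName := nodes[i].Name
        fits, failedPredicates, err := podFitsOnNode(pod, meta, nodeNameToInfo[nodeName], predicateFuncs, ecache)
        if err != nil {
            // error propagation elided in this sketch
            return
        }
        if fits {
            // filtered is preallocated; an atomic counter hands out slots.
            filtered[atomic.AddInt32(&filteredLen, 1)-1] = nodes[i]
        } else {
            predicateResultLock.Lock()
            failedPredicateMap[nodeName] = failedPredicates
            predicateResultLock.Unlock()
        }
    }
    // Check up to 16 nodes at a time.
    workqueue.Parallelize(16, len(nodes), checkNode)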

findNodesThatFit checks the candidate nodes in parallel across 16 goroutines, calling podFitsOnNode for each node to decide whether it fits. Inside podFitsOnNode, the registered predicates are evaluated one by one:

    for predicateKey, predicate := range predicateFuncs {
        // If equivalenceCache is available
        if eCacheAvailable {
            // PredicateWithECache returns its cached predicate results
            fit, reasons, invalid = ecache.PredicateWithECache(pod, info.Node().GetName(), predicateKey, equivalenceHash)
        }

        if !eCacheAvailable || invalid {
            // we need to execute predicate functions since equivalence cache does not work
            fit, reasons, err = predicate(pod, meta, info)
            if err != nil {
                return false, []algorithm.PredicateFailureReason{}, err
            }

            if eCacheAvailable {
                // update equivalence cache with newly computed fit & reasons
                // TODO(resouer) should we do this in another thread? any race?
                ecache.UpdateCachedPredicateItem(pod, info.Node().GetName(), predicateKey, fit, reasons, equivalenceHash)
            }
        }

        if !fit {
            // eCache is available and valid, and predicates result is unfit, record the fail reasons
            failedPredicates = append(failedPredicates, reasons...)
        }
    }

The for loop invokes each predicate function value in turn; these are the predicate functions registered earlier. Whether fit comes back true determines whether the node is suitable, and when it is false, the second return value, reasons, carries the failure reasons. Here is one of the simplest predicates, the disk-pressure check:

func CheckNodeDiskPressurePredicate(pod *v1.Pod, meta interface{}, nodeInfo *schedulercache.NodeInfo) (bool, []algorithm.PredicateFailureReason, error) {
    // is node under pressure?
    if nodeInfo.DiskPressureCondition() == v1.ConditionTrue {
        return false, []algorithm.PredicateFailureReason{ErrNodeUnderDiskPressure}, nil
    }
    return true, nil, nil
}
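Every predicate follows this same signature, which also shows how a new one would be plugged in. Below is a purely hypothetical example (both the name and the label it checks are invented for illustration); it would be registered with factory.RegisterFitPredicate just like the built-ins in defaultPredicates().

// Hypothetical predicate: refuse nodes carrying a "maintenance" label.
func NoMaintenanceNodePredicate(pod *v1.Pod, meta interface{}, nodeInfo *schedulercache.NodeInfo) (bool, []algorithm.PredicateFailureReason, error) {
    node := nodeInfo.Node()
    if node == nil {
        return false, nil, fmt.Errorf("node not found")
    }
    if node.Labels["example.com/maintenance"] == "true" {
        // A real predicate would define its own PredicateFailureReason;
        // an existing one is borrowed here just for illustration.
        return false, []algorithm.PredicateFailureReason{ErrNodeSelectorNotMatch}, nil
    }
    return true, nil, nil
}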

As described above, it returns whether the node fits along with the reasons. That completes the predicate (filtering) flow!
