The source analyzed is k8s v1.6.1, corresponding to commit id b0b7a323cc5a4a2019b2e9520c21c7830b7f708e on GitHub.
This article walks through how the Scheduler manages Node information during scheduling: how node information is initialized in the Scheduler module, how it is kept in sync with pod updates, and how it feeds into the scheduling algorithm.
1. Node Information Creation in the Kubernetes Scheduler
In the Kubernetes Scheduler, Node information is obtained from the api-server via the list-watch mechanism; after each scheduling pass, the result is also synced back into the schedulerCache object's nodes map (of NodeInfo). When the scheduling algorithm runs, the information in that map is synced into the genericScheduler object's cachedNodeInfoMap.
Creating Node information involves the following key steps:
(1) Create the Informer and Lister
In createScheduler -> factory.NewConfigFactory, the nodeInformer and nodeLister objects are created; the code is as follows:
// Only nodes in the "Ready" condition with status == "True" are schedulable
nodeInformer.Informer().AddEventHandlerWithResyncPeriod(
    cache.ResourceEventHandlerFuncs{
        AddFunc:    c.addNodeToCache,
        UpdateFunc: c.updateNodeInCache,
        DeleteFunc: c.deleteNodeFromCache,
    },
    0,
)
c.nodeLister = nodeInformer.Lister()
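To see the callback pattern in isolation, here is a minimal self-contained sketch of how add/update/delete handlers are registered and dispatched. This is plain Go with hypothetical names (toyInformer, handlerFuncs), not the real client-go informer machinery:

package main

import "fmt"

// handlerFuncs mirrors the shape of cache.ResourceEventHandlerFuncs.
type handlerFuncs struct {
    AddFunc    func(obj interface{})
    UpdateFunc func(oldObj, newObj interface{})
    DeleteFunc func(obj interface{})
}

// toyInformer fans watch events out to every registered handler.
type toyInformer struct {
    handlers []handlerFuncs
}

func (i *toyInformer) AddEventHandler(h handlerFuncs) {
    i.handlers = append(i.handlers, h)
}

func (i *toyInformer) emitAdd(obj interface{}) {
    for _, h := range i.handlers {
        if h.AddFunc != nil {
            h.AddFunc(obj)
        }
    }
}

func main() {
    inf := &toyInformer{}
    inf.AddEventHandler(handlerFuncs{
        AddFunc: func(obj interface{}) { fmt.Printf("node added: %v\n", obj) },
    })
    inf.emitAdd("node-1") // a real informer delivers *v1.Node objects here
}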
(2) Create the nodes map (of NodeInfo objects) in the schedulerCache object
type schedulerCache struct {
    stop   <-chan struct{}
    ttl    time.Duration
    period time.Duration

    // This mutex guards all fields within this cache struct.
    mu sync.Mutex
    // a set of assumed pod keys.
    // The key could further be used to get an entry in podStates.
    assumedPods map[string]bool
    // a map from pod key to podState.
    podStates map[string]*podState
    nodes     map[string]*NodeInfo
}
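The ttl and period fields are worth a note: they drive a background loop that expires assumed pods whose binding confirmation never arrives, so a failed binding cannot leak resources in the cache forever. Below is a minimal sketch of that expiry idea; the type and durations are assumptions for illustration, not the scheduler's actual values:

package main

import (
    "fmt"
    "sync"
    "time"
)

// expiringCache expires assumed entries that outlive their deadline,
// loosely mimicking schedulerCache's cleanup of assumed pods.
type expiringCache struct {
    mu        sync.Mutex
    deadlines map[string]time.Time // pod key -> expiry deadline
}

func (c *expiringCache) assume(key string, ttl time.Duration) {
    c.mu.Lock()
    defer c.mu.Unlock()
    c.deadlines[key] = time.Now().Add(ttl)
}

// run wakes up every period and drops entries past their deadline.
func (c *expiringCache) run(period time.Duration, stop <-chan struct{}) {
    ticker := time.NewTicker(period)
    defer ticker.Stop()
    for {
        select {
        case now := <-ticker.C:
            c.mu.Lock()
            for key, d := range c.deadlines {
                if now.After(d) {
                    fmt.Printf("expiring assumed pod %s\n", key)
                    delete(c.deadlines, key)
                }
            }
            c.mu.Unlock()
        case <-stop:
            return
        }
    }
}

func main() {
    c := &expiringCache{deadlines: map[string]time.Time{}}
    stop := make(chan struct{})
    go c.run(100*time.Millisecond, stop)
    c.assume("default/nginx", 200*time.Millisecond)
    time.Sleep(500 * time.Millisecond) // let the loop observe the expiry
    close(stop)
}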
(3) Create the cachedNodeInfoMap in the genericScheduler object
type genericScheduler struct {
    cache                 schedulercache.Cache
    predicates            map[string]algorithm.FitPredicate
    priorityMetaProducer  algorithm.MetadataProducer
    predicateMetaProducer algorithm.MetadataProducer
    prioritizers          []algorithm.PriorityConfig
    extenders             []algorithm.SchedulerExtender
    pods                  algorithm.PodLister
    lastNodeIndexLock     sync.Mutex
    lastNodeIndex         uint64

    cachedNodeInfoMap map[string]*schedulercache.NodeInfo
    equivalenceCache  *EquivalenceCache
}
2. Node Information Synchronization in the Kubernetes Scheduler
In scheduleOne, every time a pod is scheduled, the AssumePod function is called to sync the pod's information into the schedulerCache; while syncing the pod information, the corresponding node information is updated as well.
func (cache *schedulerCache) AssumePod(pod *v1.Pod) error {
    key, err := getPodKey(pod)
    if err != nil {
        return err
    }

    cache.mu.Lock()
    defer cache.mu.Unlock()
    if _, ok := cache.podStates[key]; ok {
        return fmt.Errorf("pod %v state wasn't initial but get assumed", key)
    }

    cache.addPod(pod)
    ps := &podState{
        pod: pod,
    }
    cache.podStates[key] = ps
    cache.assumedPods[key] = true
    return nil
}
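AssumePod exists so that binding can happen asynchronously: the cache optimistically records the pod against its node first, so the very next scheduling decision already sees those resources as occupied without waiting for the api-server round trip. A toy sketch of that assume-then-bind ordering, with hypothetical names and simplified types:

package main

import (
    "fmt"
    "sync"
)

type assumeCache struct {
    mu      sync.Mutex
    assumed map[string]string // pod key -> node name
}

func (c *assumeCache) assumePod(pod, node string) {
    c.mu.Lock()
    defer c.mu.Unlock()
    c.assumed[pod] = node
}

func (c *assumeCache) forgetPod(pod string) {
    c.mu.Lock()
    defer c.mu.Unlock()
    delete(c.assumed, pod)
}

// scheduleAndBind sketches the assume-then-bind ordering: the local cache is
// updated synchronously, while the slow api-server binding runs in a goroutine.
func scheduleAndBind(c *assumeCache, pod, node string, bind func() error, done chan<- struct{}) {
    c.assumePod(pod, node) // the next scheduling pass already sees this pod
    go func() {
        defer close(done)
        if err := bind(); err != nil {
            c.forgetPod(pod) // roll back the optimistic assumption on failure
        }
    }()
}

func main() {
    c := &assumeCache{assumed: map[string]string{}}
    done := make(chan struct{})
    scheduleAndBind(c, "default/nginx", "node-1", func() error { return nil }, done)
    <-done
    fmt.Println(c.assumed) // map[default/nginx:node-1]
}

The addPod helper that the real AssumePod invokes is shown next.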
// Assumes that lock is already acquired.
func (cache *schedulerCache) addPod(pod *v1.Pod) {
    n, ok := cache.nodes[pod.Spec.NodeName]
    if !ok {
        n = NewNodeInfo()
        cache.nodes[pod.Spec.NodeName] = n
    }
    n.addPod(pod)
}
// addPod adds pod information to this NodeInfo.
func (n *NodeInfo) addPod(pod *v1.Pod) {
    res, non0_cpu, non0_mem := calculateResource(pod)
    n.requestedResource.MilliCPU += res.MilliCPU
    n.requestedResource.Memory += res.Memory
    n.requestedResource.NvidiaGPU += res.NvidiaGPU
    if n.requestedResource.OpaqueIntResources == nil && len(res.OpaqueIntResources) > 0 {
        n.requestedResource.OpaqueIntResources = map[v1.ResourceName]int64{}
    }
    for rName, rQuant := range res.OpaqueIntResources {
        n.requestedResource.OpaqueIntResources[rName] += rQuant
    }
    n.nonzeroRequest.MilliCPU += non0_cpu
    n.nonzeroRequest.Memory += non0_mem
    n.pods = append(n.pods, pod)
    if hasPodAffinityConstraints(pod) {
        n.podsWithAffinity = append(n.podsWithAffinity, pod)
    }
    n.generation++
}
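calculateResource (not shown above) sums the resource requests of every container in the pod, and its "nonzero" variant substitutes defaults for containers that declare no requests, so that best-effort pods still cost something when spreading load. A simplified sketch of that summation follows; the 100 milli-CPU and 200MB defaults match what v1.6's priorityutil uses, but treat the exact figures here as assumptions:

package main

import "fmt"

// Assumed defaults, in the spirit of priorityutil's DefaultMilliCpuRequest /
// DefaultMemoryRequest in v1.6 (treat as assumptions).
const (
    defaultMilliCPU = 100               // 0.1 core
    defaultMemory   = 200 * 1024 * 1024 // 200MB
)

type containerRequest struct {
    milliCPU int64
    memory   int64
}

// calculateResource sums plain and "nonzero" requests over all containers.
func calculateResource(containers []containerRequest) (cpu, mem, non0CPU, non0Mem int64) {
    for _, c := range containers {
        cpu += c.milliCPU
        mem += c.memory
        // nonzero accounting: an empty request is charged a default
        if c.milliCPU == 0 {
            non0CPU += defaultMilliCPU
        } else {
            non0CPU += c.milliCPU
        }
        if c.memory == 0 {
            non0Mem += defaultMemory
        } else {
            non0Mem += c.memory
        }
    }
    return
}

func main() {
    cpu, mem, n0c, n0m := calculateResource([]containerRequest{
        {milliCPU: 500, memory: 0}, // no memory request declared
        {milliCPU: 0, memory: 1 << 30},
    })
    fmt.Println(cpu, mem, n0c, n0m) // 500 1073741824 600 1283457024
}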
When the Informer observes a change in node information, it invokes the corresponding handler function registered during initialization:

AddFunc:    c.addNodeToCache,
UpdateFunc: c.updateNodeInCache,
DeleteFunc: c.deleteNodeFromCache,
func (c *ConfigFactory) addNodeToCache(obj interface{}) {
    node, ok := obj.(*v1.Node)
    if !ok {
        glog.Errorf("cannot convert to *v1.Node: %v", obj)
        return
    }

    if err := c.schedulerCache.AddNode(node); err != nil {
        glog.Errorf("scheduler cache AddNode failed: %v", err)
    }
}
func (cache *schedulerCache) AddNode(node *v1.Node) error {
    cache.mu.Lock()
    defer cache.mu.Unlock()

    n, ok := cache.nodes[node.Name]
    if !ok {
        n = NewNodeInfo()
        cache.nodes[node.Name] = n
    }
    return n.SetNode(node)
}
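Note the same get-or-create pattern as in addPod above: the NodeInfo may already exist because a pod was assumed onto this node before the node's own watch event arrived, in which case SetNode merely attaches the *v1.Node object to it. A minimal sketch (simplified, hypothetical types) showing that both event orders converge to the same cache state:

package main

import "fmt"

// nodeEntry stands in for NodeInfo: pods can accumulate before the Node
// object itself is known.
type nodeEntry struct {
    nodeName string   // filled in when the Node event arrives (SetNode)
    pods     []string // pod keys accumulated by addPod
}

type nodeCache struct{ nodes map[string]*nodeEntry }

// ensure is the shared get-or-create step used by both paths.
func (c *nodeCache) ensure(name string) *nodeEntry {
    n, ok := c.nodes[name]
    if !ok {
        n = &nodeEntry{}
        c.nodes[name] = n
    }
    return n
}

func (c *nodeCache) addPod(node, pod string) {
    n := c.ensure(node)
    n.pods = append(n.pods, pod)
}

func (c *nodeCache) addNode(node string) {
    c.ensure(node).nodeName = node
}

func main() {
    // pod event first, node event second: the final state is the same
    // as if they had arrived in the opposite order.
    c := &nodeCache{nodes: map[string]*nodeEntry{}}
    c.addPod("node-1", "default/nginx")
    c.addNode("node-1")
    fmt.Printf("%+v\n", *c.nodes["node-1"]) // {nodeName:node-1 pods:[default/nginx]}
}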
3. Node Information Usage in the Scheduling Algorithm
Each scheduling pass in the Kubernetes Scheduler takes a pod's information and the nodes' information as input. The node information comes from two sources: the node list obtained through the nodeLister, and the NodeInfo cached in the schedulerCache.
The flow is as follows:
(1) Obtain the node list through the nodeLister
// in scheduleOne: the NodeLister is handed to the scheduling algorithm
dest, err := s.config.Algorithm.Schedule(pod, s.config.NodeLister)

// in genericScheduler.Schedule: the nodes obtained from the lister are
// passed into the predicate phase
trace.Step("Computing predicates")
filteredNodes, failedPredicateMap, err := findNodesThatFit(pod, g.cachedNodeInfoMap, nodes, g.predicates, g.extenders, g.predicateMetaProducer)
if err != nil {
    return "", err
}
(2) Sync the NodeInfo cached in the schedulerCache as the node-information input to the scheduling algorithm
The cached NodeInfo in the schedulerCache is synced in the genericScheduler object's Schedule function:
// Used for all fit and priority funcs.
err = g.cache.UpdateNodeNameToInfoMap(g.cachedNodeInfoMap)
if err != nil {
    return "", err
}
func (cache *schedulerCache) UpdateNodeNameToInfoMap(nodeNameToInfo map[string]*NodeInfo) error {
    cache.mu.Lock()
    defer cache.mu.Unlock()
    for name, info := range cache.nodes {
        if current, ok := nodeNameToInfo[name]; !ok || current.generation != info.generation {
            nodeNameToInfo[name] = info.Clone()
        }
    }
    for name := range nodeNameToInfo {
        if _, ok := cache.nodes[name]; !ok {
            delete(nodeNameToInfo, name)
        }
    }
    return nil
}
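The generation counter (incremented at the end of addPod above) is what makes this sync cheap: a NodeInfo is deep-copied into the scheduler's local map only when its generation differs from the copy already held, and entries for deleted nodes are pruned. A self-contained sketch of the same copy-on-change idea, using simplified stand-in types:

package main

import "fmt"

type info struct {
    generation int64
    data       string
}

func (i *info) clone() *info { c := *i; return &c }

// syncMap copies changed entries from src into dst and prunes stale keys,
// mirroring UpdateNodeNameToInfoMap's generation check.
func syncMap(src, dst map[string]*info) {
    for name, i := range src {
        if cur, ok := dst[name]; !ok || cur.generation != i.generation {
            dst[name] = i.clone()
        }
    }
    for name := range dst {
        if _, ok := src[name]; !ok {
            delete(dst, name)
        }
    }
}

func main() {
    src := map[string]*info{"node-1": {generation: 2, data: "fresh"}}
    dst := map[string]*info{
        "node-1": {generation: 1, data: "stale"},
        "node-2": {generation: 1, data: "deleted node"},
    }
    syncMap(src, dst)
    fmt.Println(dst["node-1"].data, len(dst)) // fresh 1
}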
It is then used in the scheduling algorithm:
trace.Step("Computing predicates")
filteredNodes, failedPredicateMap, err := findNodesThatFit(pod, g.cachedNodeInfoMap, nodes, g.predicates, g.extenders, g.predicateMetaProducer)
if err != nil {
    return "", err
}

// inside findNodesThatFit's per-node check: each node is looked up in
// nodeNameToInfo (the synced cachedNodeInfoMap) and run through the predicates
fmt.Printf("[%s] is fit, Unschedulable:%v\n", nodeName, nodeNameToInfo[nodeName].Node().Spec.Unschedulable) // debug print added for this walkthrough
fits, failedPredicates, err := podFitsOnNode(pod, meta, nodeNameToInfo[nodeName], predicateFuncs)
if err != nil {
    predicateResultLock.Lock()
    errs = append(errs, err)
    predicateResultLock.Unlock()
    return
}
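podFitsOnNode itself just runs every registered predicate against the pod and the node's cached NodeInfo and collects failure reasons. The following sketch re-creates that loop with toy types; the real algorithm.FitPredicate operates on *v1.Pod and *schedulercache.NodeInfo and returns structured failure reasons:

package main

import "fmt"

type nodeInfo struct {
    unschedulable bool
    freeMilliCPU  int64
}

type pod struct{ milliCPU int64 }

// fitPredicate mirrors the shape of algorithm.FitPredicate, with toy types.
type fitPredicate func(p pod, n nodeInfo) (bool, string)

func podFitsResources(p pod, n nodeInfo) (bool, string) {
    if p.milliCPU > n.freeMilliCPU {
        return false, "insufficient cpu"
    }
    return true, ""
}

func nodeSchedulable(p pod, n nodeInfo) (bool, string) {
    if n.unschedulable {
        return false, "node unschedulable"
    }
    return true, ""
}

// podFitsOnNode runs all predicates and collects failure reasons.
func podFitsOnNode(p pod, n nodeInfo, preds []fitPredicate) (bool, []string) {
    var reasons []string
    for _, pred := range preds {
        if ok, why := pred(p, n); !ok {
            reasons = append(reasons, why)
        }
    }
    return len(reasons) == 0, reasons
}

func main() {
    n := nodeInfo{unschedulable: true, freeMilliCPU: 100}
    fits, reasons := podFitsOnNode(pod{milliCPU: 500}, n,
        []fitPredicate{podFitsResources, nodeSchedulable})
    fmt.Println(fits, reasons) // false [insufficient cpu node unschedulable]
}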
This is the basic flow of Node information management in the Kubernetes scheduler module, covering mainly the synchronization of node information and its use in scheduling.