代码作用
查找能够满足filter过滤插件的节点,返回结果有可能是0,1,N
// findNodesThatPassFilters finds the nodes that fit the filter plugins.
//
// It walks the snapshot's node list starting at g.nextStartNodeIndex (a
// round-robin cursor kept across scheduling cycles so every node gets an
// equal chance of being examined), runs the profile's Filter plugins on
// each node in parallel, and stops early once enough feasible nodes have
// been collected. Nodes that fail filtering get their *framework.Status
// recorded in the statuses map; a plugin error aborts the whole search.
// The result may contain 0..N nodes.
func (g *genericScheduler) findNodesThatPassFilters(ctx context.Context, prof *profile.Profile, state *framework.CycleState, pod *v1.Pod, statuses framework.NodeToStatusMap) ([]*v1.Node, error) {
allNodes, err := g.nodeInfoSnapshot.NodeInfos().List()
if err != nil {
return nil, err
}
// Only a subset of the cluster may need to be examined: numFeasibleNodesToFind
// caps how many feasible nodes we must collect before stopping the search.
numNodesToFind := g.numFeasibleNodesToFind(int32(len(allNodes)))
// Create filtered list with enough space to avoid growing it
// and allow assigning.
filtered := make([]*v1.Node, numNodesToFind)
// Fast path: with no Filter plugins registered every node is feasible, so the
// first numNodesToFind nodes fill the result directly. No out-of-bounds risk:
// numNodesToFind never exceeds len(allNodes).
if !prof.HasFilterPlugins() {
for i := range filtered {
filtered[i] = allNodes[i].Node()
}
g.nextStartNodeIndex = (g.nextStartNodeIndex + len(filtered)) % len(allNodes)
return filtered, nil
}
// errCh carries the first plugin error out of the parallel workers.
errCh := util.NewErrorChannel()
var statusesLock sync.Mutex // guards the shared statuses map below
var filteredLen int32       // count of feasible nodes found so far; updated atomically
ctx, cancel := context.WithCancel(ctx)
checkNode := func(i int) {
// We check the nodes starting from where we left off in the previous scheduling cycle,
// this is to make sure all nodes have the same chance of being examined across pods.
nodeInfo := allNodes[(g.nextStartNodeIndex+i)%len(allNodes)]
fits, status, err := g.podPassesFiltersOnNode(ctx, prof, state, pod, nodeInfo)
if err != nil {
// A real error (as opposed to an unschedulable status): cancel the
// remaining workers and surface the error to the caller.
errCh.SendErrorWithCancel(err, cancel)
return
}
if fits {
length := atomic.AddInt32(&filteredLen, 1)
if length > numNodesToFind {
// Enough feasible nodes already collected: stop the remaining workers
// and roll back the over-count so filteredLen stays within bounds.
cancel()
atomic.AddInt32(&filteredLen, -1)
} else {
filtered[length-1] = nodeInfo.Node()
}
} else {
// Record why this node was rejected, e.g. for later preemption logic.
statusesLock.Lock()
if !status.IsSuccess() {
statuses[nodeInfo.Node().Name] = status
}
statusesLock.Unlock()
}
}
beginCheckNode := time.Now()
statusCode := framework.Success
defer func() {
// We record Filter extension point latency here instead of in framework.go because framework.RunFilterPlugins
// function is called for each node, whereas we want to have an overall latency for all nodes per scheduling cycle.
// Note that this latency also includes latency for `addNominatedPods`, which calls framework.RunPreFilterAddPod.
metrics.FrameworkExtensionPointDuration.WithLabelValues(framework.Filter, statusCode.String()).Observe(metrics.SinceInSeconds(beginCheckNode))
}()
// Stops searching for more nodes once the configured number of feasible nodes
// are found.
workqueue.ParallelizeUntil(ctx, 16, len(allNodes), checkNode)
// Advance the round-robin cursor past every node actually examined this cycle
// (feasible ones plus rejected ones recorded in statuses).
processedNodes := int(filteredLen) + len(statuses)
g.nextStartNodeIndex = (g.nextStartNodeIndex + processedNodes) % len(allNodes)
// Trim the pre-sized slice down to the nodes actually found.
filtered = filtered[:filteredLen]
if err := errCh.ReceiveError(); err != nil {
statusCode = framework.Error
return nil, err
}
return filtered, nil
}
过程整理
- 通过g.nodeInfoSnapshot.NodeInfos().List()获取所有节点;
- g.numFeasibleNodesToFind 根据入参 allNodes 的数量计算本轮需要找到的可行节点数,一旦找到足够数量的节点,即立刻停止搜索。也就是说,假如集群有100个节点,那么实际参与筛选的可能只有几十台;
- 创建一个固定长度的数组,长度是g.numFeasibleNodesToFind返回的numNodesToFind;
- 到这里是比较简单了,喝口水,继续;
- 判断是不是存在FilterPlugins,
if !prof.HasFilterPlugins() {
for i := range filtered {
filtered[i] = allNodes[i].Node()
}
g.nextStartNodeIndex = (g.nextStartNodeIndex + len(filtered)) % len(allNodes)
return filtered, nil
}
上述代码判断如果不存在FilterPlugins,就直接通过遍历给filtered赋值。这里可能会有一个疑问,filtered和allNodes会存在越界的风险吗?可以确切地说,不会,因为filtered的长度由numNodesToFind决定,而numNodesToFind不会超过allNodes的长度。
6. g.nextStartNodeIndex作用
g.nextStartNodeIndex = (g.nextStartNodeIndex + len(filtered)) % len(allNodes)
这段代码是标记下一次调度时,filter的重新查找节点的索引起点。
7. 新建一个缓冲区为1的channel
errCh := util.NewErrorChannel()
- 新建一个cancel的context,如果接收到cancel信号,routine退出。
ctx, cancel := context.WithCancel(ctx)
- checkNode, 执行了一个匿名方法,该方法的目的是针对每个节点做Filter检查,并且在podPassesFiltersOnNode方法中执行了RunFilterPlugins。最后通过podPassesFiltersOnNode拿到了
fits和status两个值。
fits是一个bool变量,意思是这个node适合还是不适合这个pod调度。
status是一个*framework.Status类型,数据结构如下:
// Status indicates the result of running a plugin: a status code plus
// human-readable reasons explaining a non-Success result.
type Status struct {
code Code
reasons []string
}
// Code is the Status code/type which is returned from plugins.
type Code int
// These are predefined codes used in a Status.
const (
// Success means that plugin ran correctly and found pod schedulable.
// NOTE: A nil status is also considered as "Success".
Success Code = iota
// Error is used for internal plugin errors, unexpected input, etc.
Error
// Unschedulable is used when a plugin finds a pod unschedulable. The scheduler might attempt to
// preempt other pods to get this pod scheduled. Use UnschedulableAndUnresolvable to make the
// scheduler skip preemption.
// The accompanying status message should explain why the pod is unschedulable.
Unschedulable
// UnschedulableAndUnresolvable is used when a (pre-)filter plugin finds a pod unschedulable and
// preemption would not change anything. Plugins should return Unschedulable if it is possible
// that the pod can get scheduled with preemption.
// The accompanying status message should explain why the pod is unschedulable.
UnschedulableAndUnresolvable
// Wait is used when a permit plugin finds a pod scheduling should wait.
Wait
// Skip is used when a bind plugin chooses to skip binding.
Skip
)
- 如果fits为true,表示这个节点满足filter调度。细节处理上,先通过原子操作把filteredLen加一得到length;如果length大于numNodesToFind,说明已经找够了节点,则调用cancel()通知其余goroutine停止查找,并把计数回退一。
- 执行到workqueue.ParallelizeUntil,如果已找到足够多的合适节点,则对剩余的node不再执行寻找,随后更新下一轮查找的起始索引(nextStartNodeIndex)
- 最后返回找到的节点列表filtered,类型是[]*v1.Node