本文看一下调度队列的实现,调度队列用来保存等待调度的pod,其必须实现下面的接口
// SchedulingQueue is the contract a queue holding pods that wait to be
// scheduled must satisfy.
type SchedulingQueue interface {
// Embedded nominator interface; it is related to preemption and is not
// examined here.
framework.PodNominator
// Add puts a newly created pod into the active queue (activeQ).
Add(pod *v1.Pod) error
// Activate moves the given pods into activeQ.
Activate(pods map[string]*v1.Pod)
// AddUnschedulableIfNotPresent re-queues a pod whose scheduling attempt failed.
// It normally lands in unschedulableQ, but goes to the backoff queue instead
// when a move request happened at or after the given scheduling cycle.
AddUnschedulableIfNotPresent(pod *framework.QueuedPodInfo, podSchedulingCycle int64) error
// SchedulingCycle returns the current schedulingCycle value, which is
// incremented every time a pod is popped.
SchedulingCycle() int64
// Pop removes and returns the next pod to schedule from activeQ, blocking
// while activeQ is empty.
Pop() (*framework.QueuedPodInfo, error)
// Update replaces the queued state of oldPod with newPod.
Update(oldPod, newPod *v1.Pod) error
// Delete removes the pod from the queue. A pod lives in at most one of the
// internal queues, so after Delete none of them contains it.
Delete(pod *v1.Pod) error
// MoveAllToActiveOrBackoffQueue moves pods from unschedulableQ to activeQ or
// the backoff queue.
MoveAllToActiveOrBackoffQueue(event framework.ClusterEvent, preCheck PreEnqueueCheck)
// AssignedPodAdded moves pods in unschedulableQ that have affinity terms
// matching the given pod to activeQ or the backoff queue: once a pod is bound
// to a node, pods with affinity to it may become schedulable.
AssignedPodAdded(pod *v1.Pod)
// AssignedPodUpdated is the update counterpart of AssignedPodAdded.
// NOTE(review): its implementation is not shown here — confirm the exact behavior.
AssignedPodUpdated(pod *v1.Pod)
// PendingPods returns the pods from all three queues; per the original note
// it is used for testing.
PendingPods() []*v1.Pod
// Close shuts the scheduling queue down.
Close()
// Run starts two goroutines that periodically move pods out of the backoff
// queue and out of unschedulableQ.
Run()
}
PriorityQueue是调度队列的实现,其实现了SchedulingQueue的接口
type PriorityQueue struct {
//和抢占相关,暂时先忽略
framework.PodNominator
stop chan struct{}
clock util.Clock
//pod在backoff队列中的初始退避时间,第一次为podInitialBackoffDuration,第二次为podInitialBackoffDuration*2,以此类推
podInitialBackoffDuration time.Duration
/pod在backoff队列中的最大退避时间,pod可能会多次调度失败,退避时间为上次的2倍,但最大值为podMaxBackoffDuration
podMaxBackoffDuration time.Duration
lock sync.RWMutex
cond sync.Cond
//等待调度队列,有序队列,使用创建时传入的lessFn进行排序
activeQ *heap.Heap
// 退避队列,有序队列,使用podsCompareBackoffCompleted进行排序。pod完成退避后,移动到activeQ
podBackoffQ *heap.Heap
//不可调度队列,调度失败的pod,有资源变化或超时60s,将pod移动到activeQ或podBackoffQ
unschedulableQ *UnschedulablePodsMap
//调度序列号,每次pod从activeQ被弹出调度,此值加1
schedulingCycle int64
//当有移动pod请求发生时,此值被赋值为schedulingCycle,可参考函数movePodsToActiveOrBackoffQueue
moveRequestCycle int64
clusterEventMap map[framework.ClusterEvent]sets.String
// closed indicates that the queue is closed.
// It is mainly used to let Pop() exit its control loop while waiting for an item.
closed bool
nsLister listersv1.NamespaceLister
}
三个队列保存的是QueuedPodInfo,此结构除了基本的pod信息外,还增加了和调度相关的几个变量
// QueuedPodInfo is the element type stored in the scheduling queues: the
// basic pod information plus bookkeeping used by the scheduler.
type QueuedPodInfo struct {
// Basic pod information.
*PodInfo
// Time the pod was most recently added to the scheduling queue. It is
// refreshed when a failed pod is re-added, and is compared against the
// current time to decide when the pod has waited long enough.
Timestamp time.Time
// Number of scheduling attempts; incremented each time the pod is popped
// for scheduling.
Attempts int
// Time the pod was first added to the scheduling queue; per the original
// note it can be used to compute the total time spent scheduling the pod.
InitialAttemptTimestamp time.Time
// Names of the plugins at which scheduling of the pod failed.
UnschedulablePlugins sets.String
}
下面看一下接口的具体实现
Add
将新创建的pod加入调度队列activeQ
// Add enqueues a newly created pod into activeQ and wakes any goroutine
// blocked waiting for a pod. If the pod is unexpectedly found in
// unschedulableQ or podBackoffQ, it is removed from there and the anomaly is
// logged. It returns the error from activeQ when the insertion fails.
func (p *PriorityQueue) Add(pod *v1.Pod) error {
	p.lock.Lock()
	defer p.lock.Unlock()

	// Wrap the pod in the queue's bookkeeping structure before inserting it.
	queued := p.newQueuedPodInfo(pod)
	err := p.activeQ.Add(queued)
	if err != nil {
		klog.ErrorS(err, "Error adding pod to the active queue", "pod", klog.KObj(pod))
		return err
	}

	// A brand-new pod should not be in the other queues; clean up if it is.
	if stale := p.unschedulableQ.get(pod); stale != nil {
		klog.ErrorS(nil, "Error: pod is already in the unschedulable queue", "pod", klog.KObj(pod))
		p.unschedulableQ.delete(pod)
	}
	// A successful delete means the pod really was sitting in the backoff queue.
	if delErr := p.podBackoffQ.Delete(queued); delErr == nil {
		klog.ErrorS(nil, "Error: pod is already in the podBackoff queue", "pod", klog.KObj(pod))
	}

	metrics.SchedulerQueueIncomingPods.WithLabelValues("active", PodAdd).Inc()
	p.PodNominator.AddNominatedPod(queued.PodInfo, "")

	// Wake goroutines blocked in Pop waiting for work.
	p.cond.Broadcast()
	return nil
}
AddUnschedulableIfNotPresent
将调度失败的pod重新加入队列:通常情况下加入不可调度队列unschedulableQ;但如果在该pod所处的调度周期当中或之后发生过move request(即moveRequestCycle >= podSchedulingCycle),则将pod加入退避队列podBackoffQ
// AddUnschedulableIfNotPresent re-queues a pod whose scheduling attempt
// failed. It returns an error if the pod is already present in any of the
// three queues. Otherwise the pod's timestamp is refreshed and the pod is
// placed in podBackoffQ when a move request has been seen at or after
// podSchedulingCycle, or in unschedulableQ when none has.
func (p *PriorityQueue) AddUnschedulableIfNotPresent(pInfo *framework.QueuedPodInfo, podSchedulingCycle int64) error {
	p.lock.Lock()
	defer p.lock.Unlock()

	pod := pInfo.Pod

	// Refuse to add a pod that is already tracked by one of the queues.
	if queued := p.unschedulableQ.get(pod); queued != nil {
		return fmt.Errorf("Pod %v is already present in unschedulable queue", klog.KObj(pod))
	}
	_, inActive, _ := p.activeQ.Get(pInfo)
	if inActive {
		return fmt.Errorf("Pod %v is already present in the active queue", klog.KObj(pod))
	}
	_, inBackoff, _ := p.podBackoffQ.Get(pInfo)
	if inBackoff {
		return fmt.Errorf("Pod %v is already present in the backoff queue", klog.KObj(pod))
	}

	// Refresh the timestamp since the pod is re-added.
	pInfo.Timestamp = p.clock.Now()

	if p.moveRequestCycle < podSchedulingCycle {
		// No move request since this pod's cycle: park it as unschedulable.
		p.unschedulableQ.addOrUpdate(pInfo)
		metrics.SchedulerQueueIncomingPods.WithLabelValues("unschedulable", ScheduleAttemptFailure).Inc()
	} else {
		// A move request arrived in the meantime, so retry after a backoff
		// instead of waiting for another event.
		if err := p.podBackoffQ.Add(pInfo); err != nil {
			return fmt.Errorf("error adding pod %v to the backoff queue: %v", pod.Name, err)
		}
		metrics.SchedulerQueueIncomingPods.WithLabelValues("backoff", ScheduleAttemptFailure).Inc()
	}

	p.PodNominator.AddNominatedPod(pInfo.PodInfo, "")
	return nil
}
Pop
从调度队列activeQ中弹出一个待调度的pod;如果activeQ队列为空,则阻塞等待,直到有新的pod加入或者队列被关闭
// Pop takes the pod at the head of activeQ and returns it. While activeQ is
// empty the call blocks on the queue's condition variable; if the queue has
// been closed it returns an error instead of waiting. Each successful pop
// bumps the pod's attempt counter and the queue's scheduling cycle.
func (p *PriorityQueue) Pop() (*framework.QueuedPodInfo, error) {
	p.lock.Lock()
	defer p.lock.Unlock()

	for p.activeQ.Len() == 0 {
		// Close() sets p.closed and broadcasts, which wakes this loop so the
		// caller can leave instead of waiting forever.
		if p.closed {
			return nil, fmt.Errorf(queueClosed)
		}
		// Nothing to schedule yet: sleep until something is enqueued.
		p.cond.Wait()
	}

	item, err := p.activeQ.Pop()
	if err != nil {
		return nil, err
	}

	next := item.(*framework.QueuedPodInfo)
	// One more scheduling attempt for this pod.
	next.Attempts++
	// One more scheduling cycle for the queue as a whole.
	p.schedulingCycle++
	return next, nil
}
Delete
将pod从队列中删除。pod只可能在其中一个队列中,调用delete后,所有队列都不会再有此pod
// Delete removes the pod from whichever queue holds it and drops its
// nomination if one exists. A pod lives in at most one queue, so when it is
// not found in activeQ the backoff and unschedulable queues are tried.
// It always returns nil.
func (p *PriorityQueue) Delete(pod *v1.Pod) error {
	p.lock.Lock()
	defer p.lock.Unlock()

	p.PodNominator.DeleteNominatedPodIfExists(pod)

	if err := p.activeQ.Delete(newQueuedPodInfoForLookup(pod)); err == nil {
		// Found and removed from activeQ; nothing else to do.
		return nil
	}

	// The item was probably not found in activeQ, so clear it from the other
	// two queues. A miss in the backoff queue is expected and ignored.
	_ = p.podBackoffQ.Delete(newQueuedPodInfoForLookup(pod))
	p.unschedulableQ.delete(pod)
	return nil
}
MoveAllToActiveOrBackoffQueue
将不可调度队列unschedulableQ中的pod移动到activeQ或者backoffQ
// MoveAllToActiveOrBackoffQueue collects the pods in unschedulableQ that
// pass preCheck (or all of them when preCheck is nil) and hands them to
// movePodsToActiveOrBackoffQueue together with the triggering event.
func (p *PriorityQueue) MoveAllToActiveOrBackoffQueue(event framework.ClusterEvent, preCheck PreEnqueueCheck) {
	p.lock.Lock()
	defer p.lock.Unlock()

	candidates := make([]*framework.QueuedPodInfo, 0, len(p.unschedulableQ.podInfoMap))
	for _, queued := range p.unschedulableQ.podInfoMap {
		// Skip pods the caller-supplied filter rejects.
		if preCheck != nil && !preCheck(queued.Pod) {
			continue
		}
		candidates = append(candidates, queued)
	}

	p.movePodsToActiveOrBackoffQueue(candidates, event)
}
// movePodsToActiveOrBackoffQueue moves the given pods out of unschedulableQ:
// a pod that is still backing off goes to podBackoffQ, any other pod goes to
// activeQ. It then records the current scheduling cycle as the latest move
// request and, if at least one pod was considered for moving, wakes waiters.
//
// NOTE: this function assumes lock has been acquired in caller
func (p *PriorityQueue) movePodsToActiveOrBackoffQueue(podInfoList []*framework.QueuedPodInfo, event framework.ClusterEvent) {
	anyMoved := false

	for _, queued := range podInfoList {
		// Skip pods the event cannot help. The check is not run when
		// UnschedulablePlugins is empty, which denotes either an abnormal error
		// or a failure in plugins other than PreFilter, Filter and Permit; such
		// pods are moved unconditionally.
		if len(queued.UnschedulablePlugins) > 0 && !p.podMatchesEvent(queued, event) {
			continue
		}
		anyMoved = true
		pod := queued.Pod

		// Pick the destination: the backoff queue while the pod is still within
		// its backoff period, the active queue otherwise.
		dest, queueLabel, failMsg := p.activeQ, "active", "Error adding pod to the scheduling queue"
		if p.isPodBackingoff(queued) {
			dest, queueLabel, failMsg = p.podBackoffQ, "backoff", "Error adding pod to the backoff queue"
		}

		if err := dest.Add(queued); err != nil {
			// Leave the pod in unschedulableQ when it could not be enqueued.
			klog.ErrorS(err, failMsg, "pod", klog.KObj(pod))
			continue
		}
		metrics.SchedulerQueueIncomingPods.WithLabelValues(queueLabel, event.Label).Inc()
		p.unschedulableQ.delete(pod)
	}

	p.moveRequestCycle = p.schedulingCycle
	if anyMoved {
		p.cond.Broadcast()
	}
}
AssignedPodAdded
将不可调度队列unschedulableQ中和参数pod有亲和性的pod移动到activeQ或者backoffQ,当pod bind到node成功后,那些和此pod有亲和性的pod有可能被调度成功,所以需要尽快调度
// AssignedPodAdded is invoked when a bound pod is added. The new pod may make
// pending pods whose affinity terms match it schedulable, so those pods are
// moved from unschedulableQ to activeQ or the backoff queue.
func (p *PriorityQueue) AssignedPodAdded(pod *v1.Pod) {
	p.lock.Lock()
	// Look up the unschedulable pods whose affinity terms match this pod.
	matching := p.getUnschedulablePodsWithMatchingAffinityTerm(pod)
	p.movePodsToActiveOrBackoffQueue(matching, AssignedPodAdd)
	p.lock.Unlock()
}
Run
启动两个协程,执行周期性业务
// Run launches the two background goroutines that keep the queues moving:
// one flushes pods that finished backing off into activeQ, the other moves
// pods that lingered too long in unschedulableQ. Both run until p.stop is
// closed.
func (p *PriorityQueue) Run() {
	const (
		backoffFlushPeriod       = time.Second
		unschedulableFlushPeriod = 30 * time.Second
	)

	go wait.Until(p.flushBackoffQCompleted, backoffFlushPeriod, p.stop)
	go wait.Until(p.flushUnschedulableQLeftover, unschedulableFlushPeriod, p.stop)
}
每隔1秒执行一次flushBackoffQCompleted,此函数目的是将backoff队列中所有完成退避时间的pod移动到activeQ队列
// flushBackoffQCompleted moves every pod whose backoff period has elapsed
// from podBackoffQ to activeQ, and wakes goroutines waiting for a pod if at
// least one pod was moved.
//
// A single Broadcast is issued after the loop rather than deferring one per
// moved pod: a defer inside a loop only runs at function return, so it would
// pile up one redundant wake-up for every pod flushed.
func (p *PriorityQueue) flushBackoffQCompleted() {
	p.lock.Lock()
	defer p.lock.Unlock()

	activated := false
	for {
		// Peek only inspects the head of the backoff queue; it is not removed yet.
		rawPodInfo := p.podBackoffQ.Peek()
		if rawPodInfo == nil {
			break
		}
		pInfo := rawPodInfo.(*framework.QueuedPodInfo)
		pod := pInfo.Pod

		// The backoff queue is ordered by backoff completion, so if the head has
		// not finished backing off, no pod behind it has either.
		if p.getBackoffTime(pInfo).After(p.clock.Now()) {
			break
		}

		// Backoff is over: take the pod out of the backoff queue.
		if _, err := p.podBackoffQ.Pop(); err != nil {
			klog.ErrorS(err, "Unable to pop pod from backoff queue despite backoff completion", "pod", klog.KObj(pod))
			break
		}

		// Hand the pod to activeQ. A failure is logged instead of being dropped
		// silently, and is not counted as an incoming pod.
		if err := p.activeQ.Add(rawPodInfo); err != nil {
			klog.ErrorS(err, "Error adding pod to the active queue", "pod", klog.KObj(pod))
			continue
		}
		metrics.SchedulerQueueIncomingPods.WithLabelValues("active", BackoffComplete).Inc()
		activated = true
	}

	// Wake blocked consumers once, while still holding the lock.
	if activated {
		p.cond.Broadcast()
	}
}
每隔30s执行一次flushUnschedulableQLeftover,将在unschedulableQ队列中停留超过60s的pod移动到backoffQ或activeQ
// flushUnschedulableQLeftover finds the pods that have stayed in
// unschedulableQ longer than unschedulableQTimeInterval (60s per the original
// note) and moves them to the backoff queue or activeQ.
func (p *PriorityQueue) flushUnschedulableQLeftover() {
	p.lock.Lock()
	defer p.lock.Unlock()

	now := p.clock.Now()
	var expired []*framework.QueuedPodInfo
	for _, queued := range p.unschedulableQ.podInfoMap {
		// Timestamp is when the pod was last placed in the queue.
		waited := now.Sub(queued.Timestamp)
		if waited > unschedulableQTimeInterval {
			expired = append(expired, queued)
		}
	}

	// Nothing timed out: avoid registering a move request.
	if len(expired) == 0 {
		return
	}
	p.movePodsToActiveOrBackoffQueue(expired, UnschedulableTimeout)
}