kube-scheduler SchedulingQueue

本文看一下调度队列的实现,调度队列用来保存等待调度的pod,其必须实现下面的接口

// SchedulingQueue is the contract a scheduling queue must satisfy. The queue
// stores pods that are waiting to be scheduled.
type SchedulingQueue interface {
	// Preemption-related; not covered in this walkthrough.
	framework.PodNominator
	// Add puts a newly created pod into the active queue (activeQ).
	Add(pod *v1.Pod) error
	// Activate moves the given pods into activeQ.
	Activate(pods map[string]*v1.Pod)
	// AddUnschedulableIfNotPresent queues a pod whose scheduling attempt failed.
	// It normally lands in unschedulableQ; if a move request happened at or
	// after podSchedulingCycle it goes to the backoff queue instead.
	AddUnschedulableIfNotPresent(pod *framework.QueuedPodInfo, podSchedulingCycle int64) error
	// SchedulingCycle returns the schedulingCycle counter, which is incremented
	// by one every time a pod is popped.
	SchedulingCycle() int64
	// Pop removes and returns the next pod to schedule from activeQ, blocking
	// while activeQ is empty.
	Pop() (*framework.QueuedPodInfo, error)
	// Update updates a queued pod (implementation not shown in this walkthrough).
	Update(oldPod, newPod *v1.Pod) error
	// Delete removes the pod from the queue. Per the original note, a pod lives
	// in at most one of the internal queues, and after Delete none of them
	// contains it.
	Delete(pod *v1.Pod) error
	// MoveAllToActiveOrBackoffQueue moves pods from unschedulableQ to activeQ or
	// backoffQ. When preCheck is non-nil, only pods for which it returns true
	// are considered.
	MoveAllToActiveOrBackoffQueue(event framework.ClusterEvent, preCheck PreEnqueueCheck)
	// AssignedPodAdded moves the pods in unschedulableQ that have affinity with
	// the given pod to activeQ or backoffQ: once that pod is bound to a node,
	// pods with matching affinity may become schedulable.
	AssignedPodAdded(pod *v1.Pod)
	// AssignedPodUpdated is presumably the update-event counterpart of
	// AssignedPodAdded; its implementation is not shown here — TODO confirm.
	AssignedPodUpdated(pod *v1.Pod)
	// PendingPods returns the pods held in all three queues; per the original
	// note it is used for testing.
	PendingPods() []*v1.Pod
	// Close shuts the scheduling queue down.
	Close()
	// Run starts two goroutines that periodically move pods out of the backoff
	// and unschedulable queues.
	Run()
}

PriorityQueue是调度队列的实现,其实现了SchedulingQueue的接口

type PriorityQueue struct {
	//和抢占相关,暂时先忽略
	framework.PodNominator

	stop  chan struct{}
	clock util.Clock

	//pod在backoff队列中的初始退避时间,第一次为podInitialBackoffDuration,第二次为podInitialBackoffDuration*2,以此类推
	podInitialBackoffDuration time.Duration
	/pod在backoff队列中的最大退避时间,pod可能会多次调度失败,退避时间为上次的2倍,但最大值为podMaxBackoffDuration
	podMaxBackoffDuration time.Duration

	lock sync.RWMutex
	cond sync.Cond

	//等待调度队列,有序队列,使用创建时传入的lessFn进行排序
	activeQ *heap.Heap
	// 退避队列,有序队列,使用podsCompareBackoffCompleted进行排序。pod完成退避后,移动到activeQ
	podBackoffQ *heap.Heap
	//不可调度队列,调度失败的pod,有资源变化或超时60s,将pod移动到activeQ或podBackoffQ 
	unschedulableQ *UnschedulablePodsMap
	//调度序列号,每次pod从activeQ被弹出调度,此值加1
	schedulingCycle int64
	//当有移动pod请求发生时,此值被赋值为schedulingCycle,可参考函数movePodsToActiveOrBackoffQueue
	moveRequestCycle int64

	clusterEventMap map[framework.ClusterEvent]sets.String

	// closed indicates that the queue is closed.
	// It is mainly used to let Pop() exit its control loop while waiting for an item.
	closed bool

	nsLister listersv1.NamespaceLister
}

三个队列保存的是QueuedPodInfo,此结构除了基本的pod信息外,还增加了和调度相关的几个变量

// QueuedPodInfo is the element type stored in the scheduling queues. On top of
// the basic pod information it carries a few scheduling-related fields.
type QueuedPodInfo struct {
	// Basic pod information.
	*PodInfo
	// Timestamp is when the pod was last added to the scheduling queue. It is
	// refreshed when a failed pod is re-queued and is used to compute backoff
	// and the time spent in the unschedulable queue.
	Timestamp time.Time
	// Attempts counts how many times the pod has been popped for scheduling.
	Attempts int
	// InitialAttemptTimestamp is when the pod was first added to the scheduling
	// queue; it can be used to compute the total time taken to schedule a pod.
	InitialAttemptTimestamp time.Time
	// UnschedulablePlugins records which plugins the pod failed in.
	UnschedulablePlugins sets.String
}

下面看一下接口的具体实现
Add
将新创建的pod加入调度队列activeQ

// Add enqueues a newly created pod into activeQ and wakes any goroutine that
// is blocked waiting for a pod. If the pod is unexpectedly found in
// unschedulableQ or podBackoffQ it is removed from there and the anomaly is
// logged. The error from activeQ.Add is logged and returned as-is.
func (p *PriorityQueue) Add(pod *v1.Pod) error {
	p.lock.Lock()
	defer p.lock.Unlock()

	// Wrap the pod into the queue's element type before inserting it.
	queued := p.newQueuedPodInfo(pod)
	addErr := p.activeQ.Add(queued)
	if addErr != nil {
		klog.ErrorS(addErr, "Error adding pod to the active queue", "pod", klog.KObj(pod))
		return addErr
	}

	// A pod being added should not already sit in unschedulableQ; drop the
	// stale entry if it does.
	if stale := p.unschedulableQ.get(pod); stale != nil {
		klog.ErrorS(nil, "Error: pod is already in the unschedulable queue", "pod", klog.KObj(pod))
		p.unschedulableQ.delete(pod)
	}

	// A successful delete means the pod was backing off, which is equally
	// unexpected here.
	if p.podBackoffQ.Delete(queued) == nil {
		klog.ErrorS(nil, "Error: pod is already in the podBackoff queue", "pod", klog.KObj(pod))
	}

	metrics.SchedulerQueueIncomingPods.WithLabelValues("active", PodAdd).Inc()
	p.PodNominator.AddNominatedPod(queued.PodInfo, "")

	// Wake up goroutines blocked in Pop waiting for a pod.
	p.cond.Broadcast()
	return nil
}

AddUnschedulableIfNotPresent
将调度失败的pod重新加入队列:通常情况下加入不可调度队列unschedulableQ;但如果在该pod本次调度周期开始之后发生过移动请求(即moveRequestCycle >= podSchedulingCycle),则将pod加入退避队列podBackoffQ

// AddUnschedulableIfNotPresent re-queues a pod whose scheduling attempt failed.
// It returns an error if the pod is already present in unschedulableQ, activeQ
// or podBackoffQ. Otherwise the pod's timestamp is refreshed and the pod is
// placed in podBackoffQ when a move request happened at or after
// podSchedulingCycle, or in unschedulableQ when none did.
func (p *PriorityQueue) AddUnschedulableIfNotPresent(pInfo *framework.QueuedPodInfo, podSchedulingCycle int64) error {
	p.lock.Lock()
	defer p.lock.Unlock()

	pod := pInfo.Pod

	// Refuse to queue a pod twice, checking each container in turn.
	if found := p.unschedulableQ.get(pod); found != nil {
		return fmt.Errorf("Pod %v is already present in unschedulable queue", klog.KObj(pod))
	}
	if _, inActive, _ := p.activeQ.Get(pInfo); inActive {
		return fmt.Errorf("Pod %v is already present in the active queue", klog.KObj(pod))
	}
	if _, inBackoff, _ := p.podBackoffQ.Get(pInfo); inBackoff {
		return fmt.Errorf("Pod %v is already present in the backoff queue", klog.KObj(pod))
	}

	// The pod is being re-added, so restart its clock; the timestamp feeds the
	// backoff computation.
	pInfo.Timestamp = p.clock.Now()

	if p.moveRequestCycle < podSchedulingCycle {
		// No move request since this pod's scheduling cycle started.
		p.unschedulableQ.addOrUpdate(pInfo)
		metrics.SchedulerQueueIncomingPods.WithLabelValues("unschedulable", ScheduleAttemptFailure).Inc()
	} else {
		// A move request happened recently: use the backoff queue instead.
		if err := p.podBackoffQ.Add(pInfo); err != nil {
			return fmt.Errorf("error adding pod %v to the backoff queue: %v", pod.Name, err)
		}
		metrics.SchedulerQueueIncomingPods.WithLabelValues("backoff", ScheduleAttemptFailure).Inc()
	}

	p.PodNominator.AddNominatedPod(pInfo.PodInfo, "")
	return nil
}

Pop
从调度队列activeQ中弹出一个待调度的pod,如果activeQ队列为空,则阻塞等待

// Pop takes the head of activeQ and hands it to the caller. While activeQ is
// empty the call blocks on the condition variable; if the queue has been
// closed in the meantime it returns an error built from queueClosed. Every
// successful pop bumps the pod's Attempts and the queue's schedulingCycle.
func (p *PriorityQueue) Pop() (*framework.QueuedPodInfo, error) {
	p.lock.Lock()
	defer p.lock.Unlock()

	for {
		if p.activeQ.Len() != 0 {
			break
		}
		// Nothing to hand out. Close() sets p.closed and broadcasts, which
		// wakes this loop so it can bail out here.
		if p.closed {
			return nil, fmt.Errorf(queueClosed)
		}
		p.cond.Wait()
	}

	item, err := p.activeQ.Pop()
	if err != nil {
		return nil, err
	}

	popped := item.(*framework.QueuedPodInfo)
	popped.Attempts++   // one more scheduling attempt for this pod
	p.schedulingCycle++ // one more scheduling cycle for the queue
	return popped, nil
}

Delete
将pod从队列中删除。pod只可能在其中一个队列中,调用delete后,所有队列都不会再有此pod

// Delete removes pod from the scheduling queue and drops its nominated-pod
// entry if one exists. activeQ is tried first; only when that removal fails
// are podBackoffQ and unschedulableQ tried. It always returns nil.
func (p *PriorityQueue) Delete(pod *v1.Pod) error {
	p.lock.Lock()
	defer p.lock.Unlock()

	p.PodNominator.DeleteNominatedPodIfExists(pod)

	if p.activeQ.Delete(newQueuedPodInfoForLookup(pod)) == nil {
		// Found and removed from activeQ; nothing else to clean up.
		return nil
	}

	// The pod was probably not in activeQ, so clear it from the other two
	// containers. The backoff-queue result is intentionally ignored: the pod
	// may simply not be there.
	p.podBackoffQ.Delete(newQueuedPodInfoForLookup(pod))
	p.unschedulableQ.delete(pod)
	return nil
}

MoveAllToActiveOrBackoffQueue
将不可调度队列unschedulableQ中的pod移动到activeQ或者backoffQ

// MoveAllToActiveOrBackoffQueue collects the pods currently in unschedulableQ
// and passes them to movePodsToActiveOrBackoffQueue together with event. When
// preCheck is non-nil, pods for which it returns false are left out.
func (p *PriorityQueue) MoveAllToActiveOrBackoffQueue(event framework.ClusterEvent, preCheck PreEnqueueCheck) {
	p.lock.Lock()
	defer p.lock.Unlock()

	candidates := make([]*framework.QueuedPodInfo, 0, len(p.unschedulableQ.podInfoMap))
	for _, queued := range p.unschedulableQ.podInfoMap {
		if preCheck != nil && !preCheck(queued.Pod) {
			continue
		}
		candidates = append(candidates, queued)
	}

	p.movePodsToActiveOrBackoffQueue(candidates, event)
}

// movePodsToActiveOrBackoffQueue moves each pod in podInfoList out of
// unschedulableQ: into podBackoffQ while the pod is still backing off, into
// activeQ otherwise. Pods that recorded unschedulable plugins are skipped when
// event does not match them. moveRequestCycle is always set to the current
// schedulingCycle, and waiters are woken if at least one pod was selected.
//
// NOTE: the caller must already hold p.lock.
func (p *PriorityQueue) movePodsToActiveOrBackoffQueue(podInfoList []*framework.QueuedPodInfo, event framework.ClusterEvent) {
	moved := false
	for _, queued := range podInfoList {
		// Skip pods the event cannot help. The check is not run when no
		// unschedulable plugins were recorded, which means either an abnormal
		// error or a failure in plugins other than PreFilter, Filter and
		// Permit; in that case the pod is moved regardless.
		hasPluginHints := len(queued.UnschedulablePlugins) != 0
		if hasPluginHints && !p.podMatchesEvent(queued, event) {
			continue
		}
		moved = true
		pod := queued.Pod

		// Choose the destination: activeQ by default, podBackoffQ while the
		// pod's backoff period has not elapsed.
		dest, queueLabel, failMsg := p.activeQ, "active", "Error adding pod to the scheduling queue"
		if p.isPodBackingoff(queued) {
			dest, queueLabel, failMsg = p.podBackoffQ, "backoff", "Error adding pod to the backoff queue"
		}

		if err := dest.Add(queued); err != nil {
			// Leave the pod in unschedulableQ when it could not be moved.
			klog.ErrorS(err, failMsg, "pod", klog.KObj(pod))
			continue
		}
		metrics.SchedulerQueueIncomingPods.WithLabelValues(queueLabel, event.Label).Inc()
		p.unschedulableQ.delete(pod)
	}

	// Remember the cycle in which this move request was handled.
	p.moveRequestCycle = p.schedulingCycle
	if moved {
		p.cond.Broadcast()
	}
}

AssignedPodAdded
将不可调度队列unschedulableQ中和参数pod有亲和性的pod移动到activeQ或者backoffQ,当pod bind到node成功后,那些和此pod有亲和性的pod有可能被调度成功,所以需要尽快调度

// AssignedPodAdded is called when a bound pod is added. Creation of this pod
// may make pending pods with matching affinity terms schedulable, so those
// pods are moved from unschedulableQ to activeQ or backoffQ.
func (p *PriorityQueue) AssignedPodAdded(pod *v1.Pod) {
	p.lock.Lock()
	// Deferred like every other locking method of the queue, so the mutex is
	// released even if one of the helpers below panics.
	defer p.lock.Unlock()

	// getUnschedulablePodsWithMatchingAffinityTerm selects the unschedulable
	// pods that have affinity with the newly bound pod.
	p.movePodsToActiveOrBackoffQueue(p.getUnschedulablePodsWithMatchingAffinityTerm(pod), AssignedPodAdd)
}

Run
启动两个协程,执行周期性业务

// Run launches the queue's two background loops, both of which stop when
// p.stop is closed: flushBackoffQCompleted every second and
// flushUnschedulableQLeftover every 30 seconds.
func (p *PriorityQueue) Run() {
	const (
		backoffFlushPeriod       = time.Second
		unschedulableFlushPeriod = 30 * time.Second
	)
	go wait.Until(p.flushBackoffQCompleted, backoffFlushPeriod, p.stop)
	go wait.Until(p.flushUnschedulableQLeftover, unschedulableFlushPeriod, p.stop)
}

每隔1秒执行一次flushBackoffQCompleted,此函数目的是将backoff队列中所有完成退避时间的pod移动到activeQ队列

// flushBackoffQCompleted moves every pod in podBackoffQ whose backoff period
// has elapsed into activeQ, then wakes goroutines waiting for a pod if at
// least one pod was moved.
func (p *PriorityQueue) flushBackoffQCompleted() {
	p.lock.Lock()
	defer p.lock.Unlock()

	// Broadcast once on the way out instead of deferring one call per moved
	// pod. This defer is registered after the Unlock above, so it runs first
	// and the broadcast still happens while the lock is held.
	activated := false
	defer func() {
		if activated {
			p.cond.Broadcast()
		}
	}()

	for {
		// Look at the head of podBackoffQ without removing it yet.
		rawPodInfo := p.podBackoffQ.Peek()
		if rawPodInfo == nil {
			return
		}
		pInfo := rawPodInfo.(*framework.QueuedPodInfo)
		pod := pInfo.Pod

		// podBackoffQ is ordered by backoff completion, so if the head is
		// still backing off, every pod behind it is too.
		boTime := p.getBackoffTime(pInfo)
		if boTime.After(p.clock.Now()) {
			return
		}

		// Backoff complete: take the pod out of the backoff queue.
		if _, err := p.podBackoffQ.Pop(); err != nil {
			klog.ErrorS(err, "Unable to pop pod from backoff queue despite backoff completion", "pod", klog.KObj(pod))
			return
		}

		// Hand the pod to activeQ. A failure is reported rather than silently
		// dropped, and is not counted as an incoming pod.
		if err := p.activeQ.Add(rawPodInfo); err != nil {
			klog.ErrorS(err, "Error adding pod to the active queue", "pod", klog.KObj(pod))
			continue
		}
		metrics.SchedulerQueueIncomingPods.WithLabelValues("active", BackoffComplete).Inc()
		activated = true
	}
}

每隔30s执行一次flushUnschedulableQLeftover,将在unschedulableQ队列中停留超过60s(unschedulableQTimeInterval)的pod移动到backoffQ或activeQ

// flushUnschedulableQLeftover picks the pods that have stayed in
// unschedulableQ for longer than unschedulableQTimeInterval, measured from
// their Timestamp, and moves them to backoffQ or activeQ.
func (p *PriorityQueue) flushUnschedulableQLeftover() {
	p.lock.Lock()
	defer p.lock.Unlock()

	now := p.clock.Now()
	var expired []*framework.QueuedPodInfo
	for _, queued := range p.unschedulableQ.podInfoMap {
		// unschedulableQTimeInterval = 60 * time.Second
		if now.Sub(queued.Timestamp) <= unschedulableQTimeInterval {
			continue
		}
		expired = append(expired, queued)
	}

	if len(expired) == 0 {
		return
	}
	p.movePodsToActiveOrBackoffQueue(expired, UnschedulableTimeout)
}
  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值