继续上文对Scheduler的分析,分析在Scheduler主循环处理过程中,podQueue,Queue和assumePod 三个队列的处理。
Scheduler中scheduleOne为主要的处理函数,其源代码为:
// scheduleOne runs one scheduling cycle for a single pod:
//
//  1. take the next pod from s.config.NextPod and skip it if it is being deleted;
//  2. ask s.config.Algorithm for a destination node;
//  3. record the decision in the scheduler cache via AssumePod;
//  4. bind the pod to the node in a background goroutine.
//
// On a scheduling or binding failure the pod is handed to s.config.Error, an
// event is recorded and the PodScheduled condition is set to false.
func (s *Scheduler) scheduleOne() {
	pod := s.config.NextPod()
	if pod.DeletionTimestamp != nil {
		s.config.Recorder.Eventf(pod, v1.EventTypeWarning, "FailedScheduling", "skip schedule deleting pod: %v/%v", pod.Namespace, pod.Name)
		glog.V(3).Infof("Skip schedule deleting pod: %v/%v", pod.Namespace, pod.Name)
		return
	}

	glog.V(3).Infof("Attempting to schedule pod: %v/%v", pod.Namespace, pod.Name)
	start := time.Now()
	dest, err := s.config.Algorithm.Schedule(pod, s.config.NodeLister)
	if err != nil {
		glog.V(1).Infof("Failed to schedule pod: %v/%v", pod.Namespace, pod.Name)
		s.config.Error(pod, err)
		s.config.Recorder.Eventf(pod, v1.EventTypeWarning, "FailedScheduling", "%v", err)
		s.config.PodConditionUpdater.Update(pod, &v1.PodCondition{
			Type:    v1.PodScheduled,
			Status:  v1.ConditionFalse,
			Reason:  v1.PodReasonUnschedulable,
			Message: err.Error(),
		})
		return
	}
	metrics.SchedulingAlgorithmLatency.Observe(metrics.SinceInMicroseconds(start))

	// Optimistically assume that the binding will succeed and send it to apiserver
	// in the background.
	// If the binding fails, scheduler will release resources allocated to assumed pod
	// immediately.
	assumed := *pod
	assumed.Spec.NodeName = dest
	if err := s.config.SchedulerCache.AssumePod(&assumed); err != nil {
		glog.Errorf("scheduler cache AssumePod failed: %v", err)
		// TODO: This means that a given pod is already in cache (which means it
		// is either assumed or already added). This is most probably result of a
		// BUG in retrying logic. As a temporary workaround (which doesn't fully
		// fix the problem, but should reduce its impact), we simply return here,
		// as binding doesn't make sense anyway.
		// This should be fixed properly though.
		return
	}

	go func() {
		// Arguments of a deferred call are evaluated when the defer statement
		// executes, so the elapsed time must be computed inside a closure;
		// otherwise the end-to-end latency would exclude the bind call below.
		defer func() {
			metrics.E2eSchedulingLatency.Observe(metrics.SinceInMicroseconds(start))
		}()

		b := &v1.Binding{
			ObjectMeta: metav1.ObjectMeta{Namespace: pod.Namespace, Name: pod.Name},
			Target: v1.ObjectReference{
				Kind: "Node",
				Name: dest,
			},
		}

		bindingStart := time.Now()
		// If binding succeeded then PodScheduled condition will be updated in apiserver so that
		// it's atomic with setting host.
		err := s.config.Binder.Bind(b)
		// FinishBinding is called whether or not Bind succeeded.
		if err := s.config.SchedulerCache.FinishBinding(&assumed); err != nil {
			glog.Errorf("scheduler cache FinishBinding failed: %v", err)
		}
		if err != nil {
			glog.V(1).Infof("Failed to bind pod: %v/%v", pod.Namespace, pod.Name)
			// Drop the assumed pod from the cache before handing the pod to the error handler.
			if err := s.config.SchedulerCache.ForgetPod(&assumed); err != nil {
				glog.Errorf("scheduler cache ForgetPod failed: %v", err)
			}
			s.config.Error(pod, err)
			// A rejected binding is a failure, so it is reported as a warning
			// like the other FailedScheduling events in this function.
			s.config.Recorder.Eventf(pod, v1.EventTypeWarning, "FailedScheduling", "Binding rejected: %v", err)
			s.config.PodConditionUpdater.Update(pod, &v1.PodCondition{
				Type:   v1.PodScheduled,
				Status: v1.ConditionFalse,
				Reason: "BindingRejected",
			})
			return
		}
		metrics.BindingLatency.Observe(metrics.SinceInMicroseconds(bindingStart))
		s.config.Recorder.Eventf(pod, v1.EventTypeNormal, "Scheduled", "Successfully assigned %v to %v", pod.Name, dest)
	}()
}
其中主要的步骤包括:
(1) 从podQueue队列中,取出nextPod
(2) 利用调度算法进行调度
(3) 将调度的Pod信息以及对应的Node信息,写入到SchedulerCache.AssumePod中
(4) 进行Binding操作
其中异常流程的处理有:
(1)如果调度失败了,则执行Error函数(Error函数指针在初始化的时候被赋值为 f.MakeDefaultErrorFunc(podBackoff, f.podQueue)),并且调用Recorder.Eventf记录事件、调用PodConditionUpdater.Update更新Pod的状态
(2)如果binding失败,则执行SchedulerCache.ForgetPod(&assumed),同时执行与异常流程(1)相同的处理(Error函数、Recorder.Eventf和PodConditionUpdater.Update)
在Pod调度处理完成后,Pod的信息被标记在AssumePod中,这时有两条路径会对AssumePod中的Pod进行处理
路径1: 在Queue队列中,通过监听已经调度成功的Pod信息对AssumePod中Pod信息进行刷新。相关的代码:
// processLoop repeatedly pops items from c.config.Queue, handing each one to
// c.config.Process. It returns only when Pop reports FIFOClosedError. Any
// other error causes the popped object to be re-queued when
// c.config.RetryOnError is set.
func (c *controller) processLoop() {
	for {
		item, popErr := c.config.Queue.Pop(PopProcessFunc(c.config.Process))
		if popErr == nil {
			continue
		}
		if popErr == FIFOClosedError {
			return
		}
		if !c.config.RetryOnError {
			continue
		}
		// This is the safe way to re-enqueue.
		c.config.Queue.AddIfNotPresent(item)
	}
}
Process函数指针中处理函数为:
// Process applies every delta in obj (asserted to be Deltas) to clientState
// and notifies the handler h. The first clientState error aborts processing
// and is returned.
Process: func(obj interface{}) error {
	// from oldest to newest
	for _, delta := range obj.(Deltas) {
		switch delta.Type {
		case Deleted:
			if err := clientState.Delete(delta.Object); err != nil {
				return err
			}
			h.OnDelete(delta.Object)
		case Sync, Added, Updated:
			previous, found, getErr := clientState.Get(delta.Object)
			if getErr != nil || !found {
				// Not retrievable from the store: treat the object as new.
				if err := clientState.Add(delta.Object); err != nil {
					return err
				}
				h.OnAdd(delta.Object)
				continue
			}
			if err := clientState.Update(delta.Object); err != nil {
				return err
			}
			h.OnUpdate(previous, delta.Object)
		}
	}
	return nil
},
其中OnAdd、OnUpdate和OnDelete为函数指针参数,外层传入的参数为:
// Handler set that wires the add, update and delete callbacks to c's
// scheduler-cache methods.
cache.ResourceEventHandlerFuncs{
// AddFunc forwards to c.addPodToCache.
AddFunc: c.addPodToCache,
// UpdateFunc forwards to c.updatePodInCache.
UpdateFunc: c.updatePodInCache,
// DeleteFunc forwards to c.deletePodFromCache.
DeleteFunc: c.deletePodFromCache,
},
在ConfigFactory中对应的处理为(以Add为例):
// TODO(harryz) need to update all the handlers here and below for equivalence cache

// addPodToCache adds obj to the scheduler cache when it is a *v1.Pod.
// Objects of any other type, and AddPod failures, are logged and otherwise
// ignored.
func (c *ConfigFactory) addPodToCache(obj interface{}) {
	switch pod := obj.(type) {
	case *v1.Pod:
		if addErr := c.schedulerCache.AddPod(pod); addErr != nil {
			glog.Errorf("scheduler cache AddPod failed: %v", addErr)
		}
	default:
		glog.Errorf("cannot convert to *v1.Pod: %v", obj)
	}
}
从而实现对assumePods的更新
路径2: 在schedulerCache中cleanupExpiredAssumedPods,定时的轮询,清理过期的Pod
// cleanupAssumedPods expires every assumed pod whose binding has finished and
// whose deadline lies before now. Pods whose binding is still in progress are
// skipped with a warning. The current time is passed in as an argument so
// that tests are deterministic. The cache mutex is held for the whole scan.
func (cache *schedulerCache) cleanupAssumedPods(now time.Time) {
	cache.mu.Lock()
	defer cache.mu.Unlock()

	// The size of assumedPods should be small
	for podKey := range cache.assumedPods {
		state, known := cache.podStates[podKey]
		if !known {
			panic("Key found in assumed set but not in podStates. Potentially a logical error.")
		}

		switch {
		case !state.bindingFinished:
			glog.Warningf("Couldn't expire cache for pod %v/%v. Binding is still in progress.",
				state.pod.Namespace, state.pod.Name)
		case now.After(*state.deadline):
			glog.Warningf("Pod %s/%s expired", state.pod.Namespace, state.pod.Name)
			if expireErr := cache.expirePod(podKey, state); expireErr != nil {
				glog.Errorf("ExpirePod failed for %s: %v", podKey, expireErr)
			}
		}
	}
}
这个函数运行在一个独立的协程中,该协程在schedulerCache创建时启动:
// New builds a scheduler cache from ttl, cleanAssumedPeriod and stop, calls
// its run method to start the background work, and returns it as a Cache.
func New(ttl time.Duration, stop <-chan struct{}) Cache {
	sc := newSchedulerCache(ttl, cleanAssumedPeriod, stop)
	sc.run()
	return sc
}
// run starts a goroutine that calls wait.Until with
// cache.cleanupExpiredAssumedPods, cache.period and cache.stop, and returns
// immediately.
// NOTE(review): presumably wait.Until re-invokes the cleanup every
// cache.period until cache.stop is closed — confirm against wait.Until.
func (cache *schedulerCache) run() {
go wait.Until(cache.cleanupExpiredAssumedPods, cache.period, cache.stop)
}
总结:
整体上Scheduler模块对Pod的处理,是基于list-watch机制获取哪些Pod需要调度,然后将信息存入到queuePods缓存队列中,再在scheduleOne函数中将Pod信息取出,利用调度算法进行调度。
完成调度后,将Pod的信息存入SchedulerCache的assumePods队列中再次缓存
再通过list-watch机制监听已经被调度的Pod的信息,放入到Queue队列中,在Controller的协程中,同步的更新assumePods队列中的数据进行对账处理