Author: xidianwangtao@gmail.com
摘要:我认为,Node Controller是Kubernetes几十个Controller中最为重要的Controller之一,其重要程度在Top3,然而这可能也是最为复杂的一个Controller,因此对其的源码分析,我将做一个系列文章,希望能帮助自己有一个深入浅出的理解。本博文从NodeController的Run方法作为入口,对其工作原理作了跟踪分析。
Node Controller的执行
Node Controller的Run方法如下,这是所有Node Controller真正处理逻辑的入口。
pkg/controller/node/nodecontroller.go:550
// Run starts an asynchronous loop that monitors the status of cluster nodes.
// It returns immediately; all work happens in the spawned goroutines, which
// run until the process exits (wait.NeverStop is used throughout).
func (nc *NodeController) Run() {
	go func() {
		defer utilruntime.HandleCrash()

		// Block until the node, pod and daemonset informer caches have all
		// synced; if syncing fails (e.g. stop requested), log and give up
		// rather than operate on incomplete cache data.
		if !cache.WaitForCacheSync(wait.NeverStop, nc.nodeInformerSynced, nc.podInformerSynced, nc.daemonSetInformerSynced) {
			utilruntime.HandleError(fmt.Errorf("timed out waiting for caches to sync"))
			return
		}

		// Incorporate the results of node status pushed from kubelet to master.
		// Runs monitorNodeStatus every nc.nodeMonitorPeriod, forever.
		go wait.Until(func() {
			if err := nc.monitorNodeStatus(); err != nil {
				glog.Errorf("Error monitoring node status: %v", err)
			}
		}, nc.nodeMonitorPeriod, wait.NeverStop)

		// Optionally run the TaintManager in its own goroutine
		// (enabled via nc.runTaintManager).
		if nc.runTaintManager {
			go nc.taintManager.Run(wait.NeverStop)
		}

		// Exactly one of the two eviction mechanisms below runs, selected by
		// the TaintBasedEvictions feature flag (nc.useTaintBasedEvictions).
		if nc.useTaintBasedEvictions {
			// Handling taint based evictions. Because we don't want a dedicated logic in TaintManager for NC-originated
			// taints and we normally don't rate limit evictions caused by taints, we need to rate limit adding taints.
			go wait.Until(nc.doTaintingPass, nodeEvictionPeriod, wait.NeverStop)
		} else {
			// Managing eviction of nodes:
			// When we delete pods off a node, if the node was not empty at the time we then
			// queue an eviction watcher. If we hit an error, retry deletion.
			go wait.Until(nc.doEvictionPass, nodeEvictionPeriod, wait.NeverStop)
		}
	}()
}
WaitForCacheSync(wait.NeverStop, nc.nodeInformerSynced, nc.podInformerSynced, nc.daemonSetInformerSynced)
- Node Controller首先调用WaitForCacheSync,等待PodInformer、NodeInformer、DaemonSetInformer的HasSynced都返回true,即这三个API Object都完成同步。
vendor/k8s.io/client-go/tools/cache/shared_informer.go:100
// WaitForCacheSync waits for caches to populate. It returns true if it was successful, false
// if the controller should shutdown.
func WaitForCacheSync(stopCh <-chan struct{}, cacheSyncs ...InformerSynced) bool {
	// Poll every syncedPollPeriod (100ms): on each tick, invoke each
	// InformerSynced func in cacheSyncs. The poll condition returns true
	// only once ALL of them report true — i.e. every required cache has
	// finished syncing — which makes PollUntil (and this function) return.
	// If stopCh fires first, PollUntil returns an error instead.
	err := wait.PollUntil(syncedPollPeriod,
		func() (bool, error) {
			for _, syncFunc := range cacheSyncs {
				if !syncFunc() {
					// At least one cache is still syncing; keep polling.
					return false, nil
				}
			}
			return true, nil
		},
		stopCh)
	if err != nil {
		// Stop was requested before all caches synced.
		glog.V(2).Infof("stop requested")
		return false
	}
	glog.V(4).Infof("caches populated")
	return true
}
WaitForCacheSync的实现逻辑是:
每隔100ms遍历一次cacheSyncs中的InformerSynced方法,当所有要求的cacheSyncs方法都返回true,意味着所有要求的cache都已经同步后,则WaitForCacheSync返回true,
否则按照100ms的周期继续遍历,直到返回true或者收到stop信号为止。
启动goroutine按照5s的周期执行monitorNodeStatus,进行Node状态监控
pkg/controller/node/nodecontroller.go:586
// monitorNodeStatus verifies node status are constantly updated by kubelet, and if not,
// post "NodeReady==ConditionUnknown". It also evicts all pods if node is not ready or
// not reachable for a long period of time.
func (nc *NodeController) monitorNodeStatus() error {
// We are listing nodes from local cache as we can tolerate some small delays
// comparing to state from etcd and there is eventual consistency anyway.
nodes, err := nc.nodeLister.List(labels.Everything())
if err != nil {
return err
}
// 对比knownNodeSet和nodes数据,得到对应的added和deleted Node列表
added, deleted := nc.checkForNodeAddedDeleted(nodes)
// 遍历added Node列表,表示Node Controller观察到一个新的Node加入集群
for i := range added {
...
// 将added node添加到knowNodeSet中
nc.knownNodeSet[added[i].Name] = added[i]
// When adding new Nodes we need to check if new zone appeared, and if so add new evictor.
zone := utilnode.GetZoneKey(added[i])
if _, found := nc.zoneStates[zone]; !found {
// 设置该Node对应的新zone状态为“Initial”
nc.zoneStates[zone] = stateInitial
// 如果Node Controller的useTaintBasedEvictions为false(--feature-gates中指定,默认TaintBasedEvictions=false),
// 则添加该zone对应的zonePodEvictor,并设置evictionLimiterQPS(--node-eviction-rate设置,默认为0.1)
if !nc.useTaintBasedEvictions {
nc.zonePodEvictor[zone] =
NewRateLimitedTimedQueue(
flowcontrol.NewTokenBucketRateLimiter(nc.evictionLimiterQPS, evictionRateLimiterBurst))
} else {
// 如果Node Controller的useTaintBasedEvictions为true,
// 则添加该zone对应的zoneNotReadyOrUnreachableTainer,并设置evictionLimiterQPS
nc.zoneNotReadyOrUnreachableTainer[zone] =
NewRateLimitedTimedQueue(
flowcontrol.NewTokenBucketRateLimiter(nc.evictionLimiterQPS, evictionRateLimiterBurst))
}
...
}
// 如果Node Controller的useTaintBasedEvictions为true,调用RemoveTaintOffNode将Node上对应的Taints(node.alpha.kubernetes.io/notReady和node.alpha.kubernetes.io/unreachable)清除掉,
// 并将其从zoneNotReadyOrUnreachableTainer Queue中Remove(如果它在这个Queue中)
if nc.useTaintBasedEvictions {
nc.markNodeAsHealthy(added[i])
} else {
// 如果Node Controller的useTaintBasedEvictions为false,即使用zonePodEvictor时,
// 将该node从对应的zo