Preface
The kube-controller-manager component ultimately starts many controllers. Among the node-related ones are node-ipam-controller and node-lifecycle-controller (note: before Kubernetes 1.10 there was only a single node-controller; starting with 1.10 it was split into node-ipam-controller and node-lifecycle-controller). This article walks through the source code of node-ipam-controller.
The startNodeIpamController function
The startNodeIpamController function is kube-controller-manager's entry point for starting node-ipam-controller. The code is clear and boils down to three steps:
- Check whether the --allocate-node-cidrs flag (default false) is true; if not, do not start node-ipam-controller and return immediately
- Obtain the cluster CIDR and service CIDR, and create the nodeIpamController instance
- Start nodeIpamController's Run method in a goroutine, which runs until kube-controller-manager signals shutdown
k8s.io/kubernetes/cmd/kube-controller-manager/app/core.go:83
func startNodeIpamController(ctx ControllerContext) (http.Handler, bool, error) {
	var clusterCIDR *net.IPNet = nil
	var serviceCIDR *net.IPNet = nil
	// Do not start node-ipam-controller unless --allocate-node-cidrs is true
	if !ctx.ComponentConfig.KubeCloudShared.AllocateNodeCIDRs {
		return nil, false, nil
	}
	var err error
	// Parse clusterCIDR from the --cluster-cidr flag
	if len(strings.TrimSpace(ctx.ComponentConfig.KubeCloudShared.ClusterCIDR)) != 0 {
		_, clusterCIDR, err = net.ParseCIDR(ctx.ComponentConfig.KubeCloudShared.ClusterCIDR)
		if err != nil {
			glog.Warningf("Unsuccessful parsing of cluster CIDR %v: %v", ctx.ComponentConfig.KubeCloudShared.ClusterCIDR, err)
		}
	}
	// Parse serviceCIDR from the --service-cluster-ip-range flag
	if len(strings.TrimSpace(ctx.ComponentConfig.NodeIPAMController.ServiceCIDR)) != 0 {
		_, serviceCIDR, err = net.ParseCIDR(ctx.ComponentConfig.NodeIPAMController.ServiceCIDR)
		if err != nil {
			glog.Warningf("Unsuccessful parsing of service CIDR %v: %v", ctx.ComponentConfig.NodeIPAMController.ServiceCIDR, err)
		}
	}
	// Create the nodeIpamController instance
	nodeIpamController, err := nodeipamcontroller.NewNodeIpamController(
		ctx.InformerFactory.Core().V1().Nodes(),
		ctx.Cloud,
		ctx.ClientBuilder.ClientOrDie("node-controller"),
		clusterCIDR,
		serviceCIDR,
		int(ctx.ComponentConfig.NodeIPAMController.NodeCIDRMaskSize),
		ipam.CIDRAllocatorType(ctx.ComponentConfig.KubeCloudShared.CIDRAllocatorType),
	)
	if err != nil {
		return nil, true, err
	}
	// Start nodeIpamController's Run method in a goroutine
	go nodeIpamController.Run(ctx.Stop)
	return nil, true, nil
}
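A side note on the parsing above: net.ParseCIDR returns both the literal IP and the network with the host bits masked off, and it is the *net.IPNet (the network) that the controller keeps. A minimal, self-contained sketch with purely illustrative values (not taken from any real configuration):

package main

import (
	"fmt"
	"net"
)

func main() {
	// Illustrative value only; ParseCIDR masks off the host bits,
	// so the returned network is 10.244.0.0/16.
	ip, clusterCIDR, err := net.ParseCIDR("10.244.5.1/16")
	if err != nil {
		panic(err)
	}
	fmt.Println(ip)          // 10.244.5.1
	fmt.Println(clusterCIDR) // 10.244.0.0/16
}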
Definition of NodeIpamController
Before moving on to NewNodeIpamController, let's first look at the definition of the NodeIpamController struct.
- allocatorType is set via the --cidr-allocator-type flag; the default value is "RangeAllocator"
- cloud is the cloudprovider built from --cloud-provider, --cloud-config, --external-cloud-volume-plugin and --allow-untagged-cloud
- clusterCIDR is set via the --cluster-cidr flag and represents the CIDR range for pods in the cluster
- serviceCIDR is set via the --service-cluster-ip-range flag and represents the CIDR range for services in the cluster
k8s.io/kubernetes/pkg/controller/nodeipam/node_ipam_controller.go:60
type Controller struct {
allocatorType ipam.CIDRAllocatorType
cloud cloudprovider.Interface
clusterCIDR *net.IPNet
serviceCIDR *net.IPNet
kubeClient clientset.Interface
// Method for easy mocking in unittest.
lookupIP func(host string) ([]net.IP, error)
nodeLister corelisters.NodeLister
nodeInformerSynced cache.InformerSynced
cidrAllocator ipam.CIDRAllocator
forcefullyDeletePod func(*v1.Pod) error
}
The NewNodeIpamController function
From the arguments passed to NewNodeIpamController we can see that NodeIpamController only lists and watches Node objects.
k8s.io/kubernetes/cmd/kube-controller-manager/app/core.go:106
nodeIpamController, err := nodeipamcontroller.NewNodeIpamController(
ctx.InformerFactory.Core().V1().Nodes(),
ctx.Cloud,
ctx.ClientBuilder.ClientOrDie("node-controller"),
clusterCIDR,
serviceCIDR,
int(ctx.ComponentConfig.NodeIPAMController.NodeCIDRMaskSize),
ipam.CIDRAllocatorType(ctx.ComponentConfig.KubeCloudShared.CIDRAllocatorType),
)
NewNodeIpamController mainly performs the following steps:
- Start the event broadcaster, which writes events to the log and sends them to the kube-apiserver Events API
- Register rate-limiter metrics with Prometheus
- Check whether clusterCIDR is configured; if not, log a fatal error (the process exits)
- Check whether the clusterCIDR mask size is larger than --node-cidr-mask-size; if it is, log a fatal error, since the cluster CIDR mask must not be longer than the per-node mask (see the small example after the code below)
- Create the NodeIpamController instance
- Depending on the value of the --cidr-allocator-type flag, create a different cidrAllocator instance and register different handler functions via nodeInformer.Informer().AddEventHandler
k8s.io/kubernetes/pkg/controller/nodeipam/node_ipam_controller.go:83
func NewNodeIpamController(
	nodeInformer coreinformers.NodeInformer,
	cloud cloudprovider.Interface,
	kubeClient clientset.Interface,
	clusterCIDR *net.IPNet,
	serviceCIDR *net.IPNet,
	nodeCIDRMaskSize int,
	allocatorType ipam.CIDRAllocatorType) (*Controller, error) {

	if kubeClient == nil {
		glog.Fatalf("kubeClient is nil when starting Controller")
	}

	// Start the event broadcaster: log events and send them to the kube-apiserver Events API
	eventBroadcaster := record.NewBroadcaster()
	eventBroadcaster.StartLogging(glog.Infof)
	glog.V(0).Infof("Sending events to api server.")
	eventBroadcaster.StartRecordingToSink(
		&v1core.EventSinkImpl{
			Interface: kubeClient.CoreV1().Events(""),
		})

	// Register rate-limiter metrics with Prometheus
	if kubeClient != nil && kubeClient.CoreV1().RESTClient().GetRateLimiter() != nil {
		metrics.RegisterMetricAndTrackRateLimiterUsage("node_ipam_controller", kubeClient.CoreV1().RESTClient().GetRateLimiter())
	}

	// clusterCIDR must be configured; otherwise log a fatal error
	if clusterCIDR == nil {
		glog.Fatal("Controller: Must specify --cluster-cidr if --allocate-node-cidrs is set")
	}

	// The clusterCIDR mask size must not be larger than --node-cidr-mask-size; otherwise log a fatal error
	mask := clusterCIDR.Mask
	if maskSize, _ := mask.Size(); maskSize > nodeCIDRMaskSize {
		glog.Fatal("Controller: Invalid --cluster-cidr, mask size of cluster CIDR must be less than --node-cidr-mask-size")
	}

	// Create the NodeIpamController instance
	ic := &Controller{
		cloud:         cloud,
		kubeClient:    kubeClient,
		lookupIP:      net.LookupIP,
		clusterCIDR:   clusterCIDR,
		serviceCIDR:   serviceCIDR,
		allocatorType: allocatorType,
	}

	// Check --cidr-allocator-type: IPAMFromCluster and IPAMFromCloud get special handling;
	// otherwise ipam.New is called to create the ic.cidrAllocator instance
	// TODO: Abstract this check into a generic controller manager should run method.
	if ic.allocatorType == ipam.IPAMFromClusterAllocatorType || ic.allocatorType == ipam.IPAMFromCloudAllocatorType {
		cfg := &ipam.Config{
			Resync:       ipamResyncInterval,
			MaxBackoff:   ipamMaxBackoff,
			InitialRetry: ipamInitialBackoff,
		}
		switch ic.allocatorType {
		case ipam.IPAMFromClusterAllocatorType:
			cfg.Mode = nodesync.SyncFromCluster
		case ipam.IPAMFromCloudAllocatorType:
			cfg.Mode = nodesync.SyncFromCloud
		}
		ipamc, err := ipam.NewController(cfg, kubeClient, cloud, clusterCIDR, serviceCIDR, nodeCIDRMaskSize)
		if err != nil {
			glog.Fatalf("Error creating ipam controller: %v", err)
		}
		if err := ipamc.Start(nodeInformer); err != nil {
			glog.Fatalf("Error trying to Init(): %v", err)
		}
	} else {
		var err error
		ic.cidrAllocator, err = ipam.New(
			kubeClient, cloud, nodeInformer, ic.allocatorType, ic.clusterCIDR, ic.serviceCIDR, nodeCIDRMaskSize)
		if err != nil {
			return nil, err
		}
	}

	// Set the NodeIpamController's nodeLister and nodeInformerSynced
	ic.nodeLister = nodeInformer.Lister()
	ic.nodeInformerSynced = nodeInformer.Informer().HasSynced

	return ic, nil
}
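To make the mask-size check concrete, here is a minimal sketch with purely illustrative values (--cluster-cidr=10.244.0.0/16 and --node-cidr-mask-size=24 are examples, not defaults pulled from the controller): a /16 cluster CIDR split into /24 per-node subnets yields 2^(24-16) = 256 allocatable node CIDRs, and the fatal check above fires whenever the cluster mask is longer than the node mask.

package main

import (
	"fmt"
	"net"
)

func main() {
	// Illustrative values only: --cluster-cidr=10.244.0.0/16, --node-cidr-mask-size=24.
	_, clusterCIDR, _ := net.ParseCIDR("10.244.0.0/16")
	nodeCIDRMaskSize := 24

	clusterMaskSize, _ := clusterCIDR.Mask.Size()
	if clusterMaskSize > nodeCIDRMaskSize {
		// Mirrors the fatal check in NewNodeIpamController.
		fmt.Println("invalid: cluster CIDR mask must not be longer than the node CIDR mask")
		return
	}

	// Number of per-node subnets the range allocator can hand out.
	fmt.Printf("allocatable node CIDRs: %d\n", 1<<uint(nodeCIDRMaskSize-clusterMaskSize)) // 256
}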
The nodeIpamController.Run method
As can be clearly seen, the nodeIpamController.Run method consists of just three steps:
- Log "Starting ipam controller" when Run starts and "Shutting down ipam controller" when it returns
- Call WaitForCacheSync and wait until the NodeInformer's HasSynced returns true, i.e. until the Node objects have been synced
- If allocatorType (the --cidr-allocator-type flag) is neither "IPAMFromCluster" nor "IPAMFromCloud", run nc.cidrAllocator.Run
k8s.io/kubernetes/pkg/controller/nodeipam/node_ipam_controller.go:162
func (nc *Controller) Run(stopCh <-chan struct{}) {
defer utilruntime.HandleCrash()
glog.Infof("Starting ipam controller")
defer glog.Infof("Shutting down ipam controller")
if !controller.WaitForCacheSync("node", stopCh, nc.nodeInformerSynced) {
return
}
if nc.allocatorType != ipam.IPAMFromClusterAllocatorType && nc.allocatorType != ipam.IPAMFromCloudAllocatorType {
go nc.cidrAllocator.Run(stopCh)
}
<-stopCh
}
How NodeIpamController handles the different --cidr-allocator-type values
Since NodeIpamController ultimately calls the Run method of the cidrAllocator stored in the NodeIpamController struct, let's go back and look in detail at how NewNodeIpamController handles the cidrAllocator.
- If --cidr-allocator-type is "IPAMFromCluster" or "IPAMFromCloud", ipam.NewController is called to create an instance of the ipam package's Controller, and then that instance's Start method is called. This also explains why nodeIpamController.Run only calls nc.cidrAllocator.Run when allocatorType is neither "IPAMFromCluster" nor "IPAMFromCloud": in this branch no cidrAllocator instance is created at all.
- If --cidr-allocator-type is neither "IPAMFromCluster" nor "IPAMFromCloud", ipam.New is called to create the cidrAllocator instance.
k8s.io/kubernetes/pkg/controller/nodeipam/node_ipam_controller.go:126
// TODO: Abstract this check into a generic controller manager should run method.
if ic.allocatorType == ipam.IPAMFromClusterAllocatorType || ic.allocatorType == ipam.IPAMFromCloudAllocatorType {
cfg := &ipam.Config{
Resync: ipamResyncInterval,
MaxBackoff: ipamMaxBackoff,
InitialRetry: ipamInitialBackoff,
}
switch ic.allocatorType {
case ipam.IPAMFromClusterAllocatorType:
cfg.Mode = nodesync.SyncFromCluster
case ipam.IPAMFromCloudAllocatorType:
cfg.Mode = nodesync.SyncFromCloud
}
ipamc, err := ipam.NewController(cfg, kubeClient, cloud, clusterCIDR, serviceCIDR, nodeCIDRMaskSize)
if err != nil {
glog.Fatalf("Error creating ipam controller: %v", err)
}
if err := ipamc.Start(nodeInformer); err != nil {
glog.Fatalf("Error trying to Init(): %v", err)
}
} else {
var err error
ic.cidrAllocator, err = ipam.New(
kubeClient, cloud, nodeInformer, ic.allocatorType, ic.clusterCIDR, ic.serviceCIDR, nodeCIDRMaskSize)
if err != nil {
return nil, err
}
}
The ipam.NewController function
NewController mainly validates the input parameters and creates the Controller instance:
- Check that cfg.Mode is "SyncFromCloud" or "SyncFromCluster"; from the previous step we know cfg.Mode is always one of these values, so this check always passes
- Type-assert cloud to check whether it is a GCECloud, and reject it otherwise; in other words, only GCECloud currently supports --cidr-allocator-type "IPAMFromCluster" or "IPAMFromCloud"
- Create a cidrset from clusterCIDR and nodeCIDRMaskSize
- Create the Controller instance
- Finally, check whether clusterCIDR and serviceCIDR overlap; if they do, mark the overlapping CIDRs as used in clusterCIDR, regardless of whether they are actually in use (a small sketch of the overlap idea follows the code below)
k8s.io/kubernetes/pkg/controller/nodeipam/ipam/controller.go:63
func NewController(
	config *Config,
	kubeClient clientset.Interface,
	cloud cloudprovider.Interface,
	clusterCIDR, serviceCIDR *net.IPNet,
	nodeCIDRMaskSize int) (*Controller, error) {

	// Check that cfg.Mode is "SyncFromCloud" or "SyncFromCluster"
	if !nodesync.IsValidMode(config.Mode) {
		return nil, fmt.Errorf("invalid IPAM controller mode %q", config.Mode)
	}

	// Type-assert cloud to check whether it is a gceCloud
	gceCloud, ok := cloud.(*gce.GCECloud)
	if !ok {
		return nil, fmt.Errorf("cloud IPAM controller does not support %q provider", cloud.ProviderName())
	}

	// Create a cidrset from clusterCIDR and nodeCIDRMaskSize
	set, err := cidrset.NewCIDRSet(clusterCIDR, nodeCIDRMaskSize)
	if err != nil {
		return nil, err
	}

	// Create the Controller instance
	c := &Controller{
		config:  config,
		adapter: newAdapter(kubeClient, gceCloud),
		syncers: make(map[string]*nodesync.NodeSync),
		set:     set,
	}

	// If clusterCIDR and serviceCIDR overlap, mark the overlapping CIDRs as used in clusterCIDR
	// (without checking whether the serviceCIDR is actually in use)
	if err := occupyServiceCIDR(c.set, clusterCIDR, serviceCIDR); err != nil {
		return nil, err
	}

	return c, nil
}
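The overlap check behind occupyServiceCIDR boils down to asking whether either CIDR contains the base address of the other. The following is a minimal sketch of that idea only, with illustrative CIDR values; it is not the actual occupyServiceCIDR implementation, which goes on to mark every overlapping node-sized subnet as used in the cidrset.

package main

import (
	"fmt"
	"net"
)

// cidrsOverlap reports whether two CIDRs share any addresses: two aligned
// ranges overlap exactly when one contains the base address of the other.
func cidrsOverlap(a, b *net.IPNet) bool {
	return a.Contains(b.IP) || b.Contains(a.IP)
}

func main() {
	// Illustrative values: a /14 pod range and a service range carved out of it.
	_, clusterCIDR, _ := net.ParseCIDR("10.0.0.0/14")
	_, serviceCIDR, _ := net.ParseCIDR("10.0.0.0/20")

	if cidrsOverlap(clusterCIDR, serviceCIDR) {
		// In the real controller this is where the overlapping node-sized
		// subnets are occupied in the cidrset.
		fmt.Println("service CIDR overlaps cluster CIDR; occupy it in the cidrset")
	}
}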
The ipam.Controller.Start method
ipam.Controller.Start mainly does the following:
- List all nodes
- For each node, perform two steps:
  - Check the node's spec.PodCIDR; if it is not empty, occupy node.Spec.PodCIDR (mark it as used) in the Controller's cidrset
  - Start a syncer loop for the node
- Register Add/Update/Delete handlers on the nodeInformer via AddEventHandler
k8s.io/kubernetes/pkg/controller/nodeipam/ipam/controller.go:101
func (c *Controller) Start(nodeInformer informers.NodeInformer) error {
glog.V(0).Infof("Starting IPAM controller (config=%+v)", c.config)
nodes, err := listNodes(c.adapter.k8s)
if err != nil {
return err
}
for _, node := range nodes.Items {
if node.Spec.PodCIDR != "" {
_, cidrRange, err := net.ParseCIDR(node.Spec.PodCIDR)
if err == nil {
c.set.Occupy(cidrRange)
glog.V(3).Infof("Occupying CIDR for node %q (%v)", node.Name, node.Spec.PodCIDR)
} else {
glog.Errorf("Node %q has an invalid CIDR (%q): %v", node.Name, node.Spec.PodCIDR, err)
}
}
func() {
c.lock.Lock()
defer c.lock.Unlock()
// XXX/bowei -- stagger the start of each sync cycle.
syncer := c.newSyncer(node.Name)
c.syncers[node.Name] = syncer
go syncer.Loop(nil)
}()
}
nodeInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{
AddFunc: nodeutil.CreateAddNodeHandler(c.onAdd),
UpdateFunc: nodeutil.CreateUpdateNodeHandler(c.onUpdate),
DeleteFunc: nodeutil.CreateDeleteNodeHandler(c.onDelete),
})
return nil
}
The above covers --cidr-allocator-type "IPAMFromCluster" and "IPAMFromCloud"; next let's look at what kind of cidrAllocator instance is created when --cidr-allocator-type is "RangeAllocator" or "CloudAllocator".
The ipam.New function
From NewNodeIpamController we know that when --cidr-allocator-type is "RangeAllocator" or "CloudAllocator", ipam.New is called to create the cidrAllocator instance. The logic of ipam.New is straightforward: "RangeAllocator" creates a CIDRRangeAllocator, "CloudAllocator" creates a CloudCIDRAllocator, and any other value is an error. Next, we look at CIDRRangeAllocator and CloudCIDRAllocator and their respective Run methods.
k8s.io/kubernetes/pkg/controller/nodeipam/ipam/cidr_allocator.go:97
func New(kubeClient clientset.Interface, cloud cloudprovider.Interface, nodeInformer informers.NodeInformer, allocatorType CIDRAllocatorType, clusterCIDR, serviceCIDR *net.IPNet, nodeCIDRMaskSize int) (CIDRAllocator, error) {
nodeList, err := listNodes(kubeClient)
if err != nil {
return nil, err
}
switch allocatorType {
case RangeAllocatorType:
return NewCIDRRangeAllocator(kubeClient, nodeInformer, clusterCIDR, serviceCIDR, nodeCIDRMaskSize, nodeList)
case CloudAllocatorType:
return NewCloudCIDRAllocator(kubeClient, cloud, nodeInformer)
default:
return nil, fmt.Errorf("Invalid CIDR allocator type: %v", allocatorType)
}
}
CIDRRangeAllocator
The NewCIDRRangeAllocator function
The logic of NewCIDRRangeAllocator is:
- Create the rangeAllocator instance
- If serviceCIDR is not empty, check whether serviceCIDR and clusterCIDR overlap; if they do, mark the overlapping CIDRs as used in clusterCIDR regardless of whether they are actually in use
- Mark each node's existing PodCIDR as used in the allocator's cidrset
- Register handlers via nodeInformer.Informer().AddEventHandler: AddFunc is rangeAllocator.AllocateOrOccupyCIDR; UpdateFunc checks whether the node's PodCIDR is empty and, if it is, calls rangeAllocator.AllocateOrOccupyCIDR; DeleteFunc is rangeAllocator.ReleaseCIDR, which releases the deleted node's PodCIDR back to the rangeAllocator. Since both AddFunc and UpdateFunc end up calling rangeAllocator.AllocateOrOccupyCIDR, that method is what we look at next.
k8s.io/kubernetes/pkg/controller/nodeipam/ipam/range_allocator.go:70
func NewCIDRRangeAllocator(client clientset.Interface, nodeInformer informers.NodeInformer, clusterCIDR *net.IPNet, serviceCIDR *net.IPNet, subNetMaskSize int, nodeList *v1.NodeList) (CIDRAllocator, error) {
	...
	set, err := cidrset.NewCIDRSet(clusterCIDR, subNetMaskSize)
	if err != nil {
		return nil, err
	}

	// Create the rangeAllocator instance
	ra := &rangeAllocator{
		client:                client,
		cidrs:                 set,
		clusterCIDR:           clusterCIDR,
		nodeLister:            nodeInformer.Lister(),
		nodesSynced:           nodeInformer.Informer().HasSynced,
		nodeCIDRUpdateChannel: make(chan nodeAndCIDR, cidrUpdateQueueSize),
		recorder:              recorder,
		nodesInProcessing:     sets.NewString(),
	}

	// If serviceCIDR is not empty and overlaps clusterCIDR, mark the overlapping CIDRs as used
	// in clusterCIDR regardless of whether they are actually in use
	if serviceCIDR != nil {
		ra.filterOutServiceRange(serviceCIDR)
	} else {
		glog.V(0).Info("No Service CIDR provided. Skipping filtering out service addresses.")
	}

	// Mark each node's existing PodCIDR as used in the cidrset
	if nodeList != nil {
		for _, node := range nodeList.Items {
			if node.Spec.PodCIDR == "" {
				glog.Infof("Node %v has no CIDR, ignoring", node.Name)
				continue
			} else {
				glog.Infof("Node %v has CIDR %s, occupying it in CIDR map", node.Name, node.Spec.PodCIDR)
			}
			if err := ra.occupyCIDR(&node); err != nil {
				// This will happen if:
				// 1. We find garbage in the podCIDR field. Retrying is useless.
				// 2. CIDR out of range: This means a node CIDR has changed.
				// This error will keep crashing controller-manager.
				return nil, err
			}
		}
	}

	// Register Add/Update/Delete handlers on the nodeInformer via AddEventHandler
	nodeInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{
		AddFunc: nodeutil.CreateAddNodeHandler(ra.AllocateOrOccupyCIDR),
		UpdateFunc: nodeutil.CreateUpdateNodeHandler(func(_, newNode *v1.Node) error {
			// If the PodCIDR is not empty we either:
			// - already processed a Node that already had a CIDR after NC restarted
			//   (cidr is marked as used),
			// - already processed a Node successfully and allocated a CIDR for it
			//   (cidr is marked as used),
			// - already processed a Node but we did saw a "timeout" response and
			//   request eventually got through in this case we haven't released
			//   the allocated CIDR (cidr is still marked as used).
			// There's a possible error here:
			// - NC sees a new Node and assigns a CIDR X to it,
			// - Update Node call fails with a timeout,
			// - Node is updated by some other component, NC sees an update and
			//   assigns CIDR Y to the Node,
			// - Both CIDR X and CIDR Y are marked as used in the local cache,
			//   even though Node sees only CIDR Y
			// The problem here is that in in-memory cache we see CIDR X as marked,
			// which prevents it from being assigned to any new node. The cluster
			// state is correct.
			// Restart of NC fixes the issue.
			if newNode.Spec.PodCIDR == "" {
				return ra.AllocateOrOccupyCIDR(newNode)
			}
			return nil
		}),
		DeleteFunc: nodeutil.CreateDeleteNodeHandler(ra.ReleaseCIDR),
	})

	return ra, nil
}
The rangeAllocator.AllocateOrOccupyCIDR method
The logic of rangeAllocator.AllocateOrOccupyCIDR is:
- Check whether CIDR processing is already in progress for this node; if so, return immediately
- If the node's PodCIDR is not empty, mark it as used in the rangeAllocator
- Otherwise, allocate an unused CIDR from the cluster's pod CIDR range and write the allocated PodCIDR together with the node name into rangeAllocator.nodeCIDRUpdateChannel (a sketch of how per-node subnets are carved out of the cluster CIDR follows the code below)
k8s.io/kubernetes/pkg/controller/nodeipam/ipam/range_allocator.go:223
func (r *rangeAllocator) AllocateOrOccupyCIDR(node *v1.Node) error {
if node == nil {
return nil
}
if !r.insertNodeToProcessing(node.Name) {
glog.V(2).Infof("Node %v is already in a process of CIDR assignment.", node.Name)
return nil
}
if node.Spec.PodCIDR != "" {
return r.occupyCIDR(node)
}
podCIDR, err := r.cidrs.AllocateNext()
if err != nil {
r.removeNodeFromProcessing(node.Name)
nodeutil.RecordNodeStatusChange(r.recorder, node, "CIDRNotAvailable")
return fmt.Errorf("failed to allocate cidr: %v", err)
}
glog.V(4).Infof("Putting node %s with CIDR %s into the work queue", node.Name, podCIDR)
r.nodeCIDRUpdateChannel <- nodeAndCIDR{
nodeName: node.Name,
cidr: podCIDR,
}
return nil
}
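AllocateNext comes from the cidrset package and hands out the next free node-sized subnet of the cluster CIDR. The following is a simplified sketch of that idea only, not the real cidrset implementation (which also tracks released subnets in a bitmap); it merely computes the i-th /24 inside an illustrative 10.244.0.0/16.

package main

import (
	"encoding/binary"
	"fmt"
	"net"
)

// nthSubnet returns the i-th subnet of size newMaskSize inside base.
// Simplified sketch: IPv4 only, no bounds or error checking.
func nthSubnet(base *net.IPNet, newMaskSize, i int) *net.IPNet {
	addr := binary.BigEndian.Uint32(base.IP.To4())
	// Each node subnet covers 2^(32-newMaskSize) addresses.
	addr += uint32(i) << uint(32-newMaskSize)
	ip := make(net.IP, 4)
	binary.BigEndian.PutUint32(ip, addr)
	return &net.IPNet{IP: ip, Mask: net.CIDRMask(newMaskSize, 32)}
}

func main() {
	_, clusterCIDR, _ := net.ParseCIDR("10.244.0.0/16") // illustrative value
	for i := 0; i < 3; i++ {
		fmt.Println(nthSubnet(clusterCIDR, 24, i)) // 10.244.0.0/24, 10.244.1.0/24, 10.244.2.0/24
	}
}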
The rangeAllocator.Run method
Run first calls WaitForCacheSync to wait until the NodeInformer's HasSynced returns true, i.e. until the Node objects have been synced, and then starts 30 goroutines (cidrUpdateWorkers) running rangeAllocator.worker. The worker method reads items from the rangeAllocator.nodeCIDRUpdateChannel channel and calls rangeAllocator.updateCIDRAllocation for each; if updateCIDRAllocation fails, the item is written back into rangeAllocator.nodeCIDRUpdateChannel, so updateCIDRAllocation keeps being retried until it succeeds. Combined with the Add/Update handlers that NewCIDRRangeAllocator registered on the nodeInformer, this amounts to watching node Add and Update events in the cluster.
k8s.io/kubernetes/pkg/controller/nodeipam/ipam/range_allocator.go:154
func (r *rangeAllocator) Run(stopCh <-chan struct{}) {
defer utilruntime.HandleCrash()
glog.Infof("Starting range CIDR allocator")
defer glog.Infof("Shutting down range CIDR allocator")
if !controller.WaitForCacheSync("cidrallocator", stopCh, r.nodesSynced) {
return
}
for i := 0; i < cidrUpdateWorkers; i++ {
go r.worker(stopCh)
}
<-stopCh
}
func (r *rangeAllocator) worker(stopChan <-chan struct{}) {
for {
select {
case workItem, ok := <-r.nodeCIDRUpdateChannel:
if !ok {
glog.Warning("Channel nodeCIDRUpdateChannel was unexpectedly closed")
return
}
if err := r.updateCIDRAllocation(workItem); err != nil {
// Requeue the failed node for update again.
r.nodeCIDRUpdateChannel <- workItem
}
case <-stopChan:
return
}
}
}
The rangeAllocator.updateCIDRAllocation method
Finally, let's look at how rangeAllocator.updateCIDRAllocation updates a node's CIDR:
- Take the CIDR from the work item read off rangeAllocator.nodeCIDRUpdateChannel and fetch the node object by the node name in the work item
- Check whether the node's PodCIDR already equals the work item's CIDR, i.e. whether the node has already been updated with this PodCIDR; if so, return immediately
- If they differ and the node's PodCIDR is not empty, release the work item's PodCIDR back to the cidrset and return
- If the node's PodCIDR is empty, set it to the work item's CIDR, retrying on failure up to 3 times (a sketch of the patch helper follows the code below)
- If setting the node's CIDR still fails, log the failure and record a failure event to kube-apiserver
That completes the CIDRRangeAllocator logic; next let's look at how CloudCIDRAllocator works.
k8s.io/kubernetes/pkg/controller/nodeipam/ipam/range_allocator.go:283
func (r *rangeAllocator) updateCIDRAllocation(data nodeAndCIDR) error {
	var err error
	var node *v1.Node
	defer r.removeNodeFromProcessing(data.nodeName)

	// The podCIDR comes from the work item read off rangeAllocator.nodeCIDRUpdateChannel
	podCIDR := data.cidr.String()

	// Fetch the node object by the node name in the work item
	node, err = r.nodeLister.Get(data.nodeName)
	if err != nil {
		glog.Errorf("Failed while getting node %v for updating Node.Spec.PodCIDR: %v", data.nodeName, err)
		return err
	}

	// If the node's PodCIDR already equals the CIDR in the work item, i.e. the node has
	// already been updated with this PodCIDR, return immediately
	if node.Spec.PodCIDR == podCIDR {
		glog.V(4).Infof("Node %v already has allocated CIDR %v. It matches the proposed one.", node.Name, podCIDR)
		return nil
	}

	// If they differ and the node's PodCIDR is not empty, release the work item's PodCIDR and return
	if node.Spec.PodCIDR != "" {
		glog.Errorf("Node %v already has a CIDR allocated %v. Releasing the new one %v.", node.Name, node.Spec.PodCIDR, podCIDR)
		if err := r.cidrs.Release(data.cidr); err != nil {
			glog.Errorf("Error when releasing CIDR %v", podCIDR)
		}
		return nil
	}

	// If we reached here, it means that the node has no CIDR currently assigned. So we set it,
	// retrying on failure up to cidrUpdateRetries (3) times.
	for i := 0; i < cidrUpdateRetries; i++ {
		if err = utilnode.PatchNodeCIDR(r.client, types.NodeName(node.Name), podCIDR); err == nil {
			glog.Infof("Set node %v PodCIDR to %v", node.Name, podCIDR)
			return nil
		}
	}

	// Setting the node's CIDR failed: log the failure and record a failure event to kube-apiserver
	glog.Errorf("Failed to update node %v PodCIDR to %v after multiple attempts: %v", node.Name, podCIDR, err)
	nodeutil.RecordNodeStatusChange(r.recorder, node, "CIDRAssignmentFailed")
	// We accept the fact that we may leak CIDRs here. This is safer than releasing
	// them in case when we don't know if request went through.
	// NodeController restart will return all falsely allocated CIDRs to the pool.
	if !apierrors.IsServerTimeout(err) {
		glog.Errorf("CIDR assignment for node %v failed: %v. Releasing allocated CIDR", node.Name, err)
		if releaseErr := r.cidrs.Release(data.cidr); releaseErr != nil {
			glog.Errorf("Error releasing allocated CIDR for node %v: %v", node.Name, releaseErr)
		}
	}
	return err
}
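PatchNodeCIDR (from k8s.io/kubernetes/pkg/util/node) updates only spec.podCIDR through a patch instead of writing back the whole Node object. The sketch below is only an illustration of that idea built on the pre-1.18 client-go Patch signature; the real helper's exact implementation may differ, and newer client-go versions additionally take a context.Context and PatchOptions.

package ipamsketch

import (
	"fmt"

	"k8s.io/apimachinery/pkg/types"
	clientset "k8s.io/client-go/kubernetes"
)

// patchNodePodCIDR is an illustrative sketch of the idea behind
// utilnode.PatchNodeCIDR: patch only spec.podCIDR rather than updating
// the entire Node object.
func patchNodePodCIDR(client clientset.Interface, nodeName, podCIDR string) error {
	patch := []byte(fmt.Sprintf(`{"spec":{"podCIDR":"%s"}}`, podCIDR))
	// Pre-1.18 client-go: Patch(name, patchType, data, subresources...).
	_, err := client.CoreV1().Nodes().Patch(nodeName, types.StrategicMergePatchType, patch)
	return err
}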
CloudCIDRAllocator
The NewCloudCIDRAllocator function
The logic of NewCloudCIDRAllocator is:
- Check whether cloud is a GCECloud; as with IPAMFromCloud, the CloudAllocator type currently only supports GCECloud
- Create the cloudCIDRAllocator instance
- Register handlers via nodeInformer.Informer().AddEventHandler: AddFunc is cloudCIDRAllocator.AllocateOrOccupyCIDR; UpdateFunc is an anonymous function whose logic is (1) if the node's PodCIDR is empty, call cloudCIDRAllocator.AllocateOrOccupyCIDR and return, and (2) if the node's PodCIDR is not empty, check the node's network state via the NodeNetworkUnavailable condition and the node.kubernetes.io/network-unavailable taint: if the condition is missing, or its status is not False, or the taint is present, call cloudCIDRAllocator.AllocateOrOccupyCIDR and return, otherwise return nil; DeleteFunc is cloudCIDRAllocator.ReleaseCIDR (which only writes an info log).
k8s.io/kubernetes/pkg/controller/nodeipam/ipam/cloud_cidr_allocator.go:85
func NewCloudCIDRAllocator(client clientset.Interface, cloud cloudprovider.Interface, nodeInformer informers.NodeInformer) (CIDRAllocator, error) {
...
gceCloud, ok := cloud.(*gce.GCECloud)
if !ok {
err := fmt.Errorf("cloudCIDRAllocator does not support %v provider", cloud.ProviderName())
return nil, err
}
ca := &cloudCIDRAllocator{
client: client,
cloud: gceCloud,
nodeLister: nodeInformer.Lister(),
nodesSynced: nodeInformer.Informer().HasSynced,
nodeUpdateChannel: make(chan string, cidrUpdateQueueSize),
recorder: recorder,
nodesInProcessing: map[string]*nodeProcessingInfo{},
}
nodeInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{
AddFunc: nodeutil.CreateAddNodeHandler(ca.AllocateOrOccupyCIDR),
UpdateFunc: nodeutil.CreateUpdateNodeHandler(func(_, newNode *v1.Node) error {
if newNode.Spec.PodCIDR == "" {
return ca.AllocateOrOccupyCIDR(newNode)
}
// Even if PodCIDR is assigned, but NetworkUnavailable condition is
// set to true, we need to process the node to set the condition.
networkUnavailableTaint := &v1.Taint{Key: algorithm.TaintNodeNetworkUnavailable, Effect: v1.TaintEffectNoSchedule}
_, cond := v1node.GetNodeCondition(&newNode.Status, v1.NodeNetworkUnavailable)
if cond == nil || cond.Status != v1.ConditionFalse || utiltaints.TaintExists(newNode.Spec.Taints, networkUnavailableTaint) {
return ca.AllocateOrOccupyCIDR(newNode)
}
return nil
}),
DeleteFunc: nodeutil.CreateDeleteNodeHandler(ca.ReleaseCIDR),
})
glog.V(0).Infof("Using cloud CIDR allocator (provider: %v)", cloud.ProviderName())
return ca, nil
}
The cloudCIDRAllocator.AllocateOrOccupyCIDR method
cloudCIDRAllocator.AllocateOrOccupyCIDR first checks whether the node is already being processed by the cloud allocator; if so, it returns immediately, otherwise it writes node.Name into the cloudCIDRAllocator.nodeUpdateChannel channel.
k8s.io/kubernetes/pkg/controller/nodeipam/ipam/cloud_cidr_allocator.go:229
func (ca *cloudCIDRAllocator) AllocateOrOccupyCIDR(node *v1.Node) error {
	if node == nil {
		return nil
	}
	if !ca.insertNodeToProcessing(node.Name) {
		glog.V(2).Infof("Node %v is already in a process of CIDR assignment.", node.Name)
		return nil
	}
	glog.V(4).Infof("Putting node %s into the work queue", node.Name)
	ca.nodeUpdateChannel <- node.Name
	return nil
}
The cloudCIDRAllocator.Run method
Run first calls WaitForCacheSync to wait until the NodeInformer's HasSynced returns true, i.e. until the Node objects have been synced, and then starts 30 goroutines running cloudCIDRAllocator.worker. The worker method reads items from the cloudCIDRAllocator.nodeUpdateChannel channel and calls cloudCIDRAllocator.updateCIDRAllocation for each; on failure the item is requeued into cloudCIDRAllocator.nodeUpdateChannel after a backoff and updateCIDRAllocation is called again, up to 10 retries in total; if it still fails after that, the node is dropped from the queue and removed from cloudCIDRAllocator.nodesInProcessing (a simplified sketch of this retry bookkeeping follows the code below).
k8s.io/kubernetes/pkg/controller/nodeipam/ipam/cloud_cidr_allocator.go:134
func (ca *cloudCIDRAllocator) Run(stopCh <-chan struct{}) {
defer utilruntime.HandleCrash()
glog.Infof("Starting cloud CIDR allocator")
defer glog.Infof("Shutting down cloud CIDR allocator")
if !controller.WaitForCacheSync("cidrallocator", stopCh, ca.nodesSynced) {
return
}
for i := 0; i < cidrUpdateWorkers; i++ {
go ca.worker(stopCh)
}
<-stopCh
}
func (ca *cloudCIDRAllocator) worker(stopChan <-chan struct{}) {
for {
select {
case workItem, ok := <-ca.nodeUpdateChannel:
if !ok {
glog.Warning("Channel nodeCIDRUpdateChannel was unexpectedly closed")
return
}
if err := ca.updateCIDRAllocation(workItem); err == nil {
glog.V(3).Infof("Updated CIDR for %q", workItem)
} else {
glog.Errorf("Error updating CIDR for %q: %v", workItem, err)
if canRetry, timeout := ca.retryParams(workItem); canRetry {
glog.V(2).Infof("Retrying update for %q after %v", workItem, timeout)
time.AfterFunc(timeout, func() {
// Requeue the failed node for update again.
ca.nodeUpdateChannel <- workItem
})
continue
}
glog.Errorf("Exceeded retry count for %q, dropping from queue", workItem)
}
ca.removeNodeFromProcessing(workItem)
case <-stopChan:
return
}
}
}
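The retryParams bookkeeping itself is not shown above. The following is a simplified, illustrative sketch of the general idea only (a per-node retry count plus an exponentially growing backoff capped at a maximum); it is not the actual cloudCIDRAllocator implementation, and the constants are made-up example values except for the retry limit of 10 mentioned above.

package ipamsketch

import (
	"sync"
	"time"
)

const (
	maxRetries     = 10                     // matches the retry limit described above
	initialBackoff = 100 * time.Millisecond // illustrative value
	maxBackoff     = 5 * time.Second        // illustrative value
)

// retryTracker keeps a per-node retry counter and computes the next backoff.
type retryTracker struct {
	mu      sync.Mutex
	retries map[string]int
}

// retryParams reports whether the node may be retried and, if so, how long to
// wait before requeueing it (doubling each time, capped at maxBackoff).
func (t *retryTracker) retryParams(node string) (bool, time.Duration) {
	t.mu.Lock()
	defer t.mu.Unlock()
	if t.retries == nil {
		t.retries = make(map[string]int)
	}
	n := t.retries[node]
	if n >= maxRetries {
		return false, 0
	}
	t.retries[node] = n + 1
	backoff := initialBackoff << uint(n)
	if backoff > maxBackoff {
		backoff = maxBackoff
	}
	return true, backoff
}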
The cloudCIDRAllocator.updateCIDRAllocation method
The logic of cloudCIDRAllocator.updateCIDRAllocation is:
- Fetch the node object by node name
- Call GCECloud.AliasRanges to get a list of CIDRs and take the first one
- Check whether the node's PodCIDR equals the CIDR obtained in the previous step: if they are equal, nothing needs to be patched; if they differ but PodCIDR is not empty, an error is logged and the code still falls through; in either non-matching case the node's PodCIDR is patched to the obtained CIDR, retrying up to 3 times and breaking out of the loop on success
- If setting the node's CIDR failed, record a CIDRAssignmentFailed event for the node and return the error
- Finally, set the node's NetworkUnavailable condition to status False (reason RouteCreated)
k8s.io/kubernetes/pkg/controller/nodeipam/ipam/cloud_cidr_allocator.go:244
func (ca *cloudCIDRAllocator) updateCIDRAllocation(nodeName string) error {
	// Fetch the node object by node name
	node, err := ca.nodeLister.Get(nodeName)
	if err != nil {
		if errors.IsNotFound(err) {
			return nil // node no longer available, skip processing
		}
		glog.Errorf("Failed while getting node %v for updating Node.Spec.PodCIDR: %v", nodeName, err)
		return err
	}

	// Call GCECloud.AliasRanges to get a list of CIDRs and take the first one
	cidrs, err := ca.cloud.AliasRanges(types.NodeName(nodeName))
	if err != nil {
		nodeutil.RecordNodeStatusChange(ca.recorder, node, "CIDRNotAvailable")
		return fmt.Errorf("failed to allocate cidr: %v", err)
	}
	if len(cidrs) == 0 {
		nodeutil.RecordNodeStatusChange(ca.recorder, node, "CIDRNotAvailable")
		return fmt.Errorf("failed to allocate cidr: Node %v has no CIDRs", node.Name)
	}
	_, cidr, err := net.ParseCIDR(cidrs[0])
	if err != nil {
		return fmt.Errorf("failed to parse string '%s' as a CIDR: %v", cidrs[0], err)
	}
	podCIDR := cidr.String()

	// If the node's PodCIDR already equals the CIDR from the cloud, nothing needs to be patched;
	// if it differs but is not empty, log an error and still fall through to set it;
	// otherwise patch the node's PodCIDR, retrying up to 3 times and breaking on success
	if node.Spec.PodCIDR == podCIDR {
		glog.V(4).Infof("Node %v already has allocated CIDR %v. It matches the proposed one.", node.Name, podCIDR)
		// We don't return here, in order to set the NetworkUnavailable condition later below.
	} else {
		if node.Spec.PodCIDR != "" {
			glog.Errorf("PodCIDR being reassigned! Node %v spec has %v, but cloud provider has assigned %v", node.Name, node.Spec.PodCIDR, podCIDR)
			// We fall through and set the CIDR despite this error. This
			// implements the same logic as implemented in the
			// rangeAllocator.
			//
			// See https://github.com/kubernetes/kubernetes/pull/42147#discussion_r103357248
		}
		for i := 0; i < cidrUpdateRetries; i++ {
			if err = utilnode.PatchNodeCIDR(ca.client, types.NodeName(node.Name), podCIDR); err == nil {
				glog.Infof("Set node %v PodCIDR to %v", node.Name, podCIDR)
				break
			}
		}
	}

	// If setting the node's CIDR failed, record a CIDRAssignmentFailed event for the node
	if err != nil {
		nodeutil.RecordNodeStatusChange(ca.recorder, node, "CIDRAssignmentFailed")
		glog.Errorf("Failed to update node %v PodCIDR to %v after multiple attempts: %v", node.Name, podCIDR, err)
		return err
	}

	// Set the node's NetworkUnavailable condition to False (reason RouteCreated)
	err = utilnode.SetNodeCondition(ca.client, types.NodeName(node.Name), v1.NodeCondition{
		Type:               v1.NodeNetworkUnavailable,
		Status:             v1.ConditionFalse,
		Reason:             "RouteCreated",
		Message:            "NodeController create implicit route",
		LastTransitionTime: metav1.Now(),
	})
	if err != nil {
		glog.Errorf("Error setting route status for node %v: %v", node.Name, err)
	}

	return err
}
Summary
This completes the analysis of node-ipam-controller's main logic. In one sentence, node-ipam-controller is the controller that manages the PodCIDR of the Nodes in the cluster. It is enabled by setting --allocate-node-cidrs to true (the flag defaults to false, so the controller is disabled by default), and the --cidr-allocator-type flag selects which allocator is started; its possible values are "RangeAllocator" (the default), "CloudAllocator", "IPAMFromCluster" and "IPAMFromCloud", of which "CloudAllocator", "IPAMFromCluster" and "IPAMFromCloud" currently only support GCECloud.