4.4 deployment controller 02

Recap from the previous lesson: syncDeployment()

Main logic (a condensed code sketch follows the list):
(1) Record the current time on entry and register a defer function that computes the method's total execution time, i.e. how long one reconciliation pass over a deployment takes;
(2) Fetch the deployment object by its namespace and name;
(3) Call dc.getReplicaSetsForDeployment: process all replicaset objects in the same namespace as the deployment; a replicaset that matches the selector but is not yet owned by the deployment is adopted by setting its ownerReferences field, while one that is owned but no longer matches has the corresponding ownerReference removed; finally, return the list of ReplicaSet objects in the cluster that match and are owned by the Deployment;
(4) Call dc.getPodMapForDeployment: using the deployment's selector, fetch the pods owned by the deployment and group them by the UID of their owning replicaset; the return type is map[types.UID][]*v1.Pod;
(5) If the deployment's DeletionTimestamp is non-nil, call dc.syncStatusOnly to recompute and update the deployment's status from its replicasets, then return without going further;
(6) Call dc.checkPausedConditions: check whether the deployment is paused; if so, update the deployment's status by adding the pause-related condition;
(7) If the deployment's .Spec.Paused is true, call dc.sync to handle it, then return;
(8) Call getRollbackTo to check whether the deployment's annotations contain the key deprecated.deployment.rollback.to; if present and non-empty, call dc.rollback to perform the rollback;
(9) Call dc.isScalingEvent: check whether the deployment is in a scaling state; if so, call dc.sync to handle the scale-up or scale-down, then return;
(10) Dispatch on the deployment's update strategy: Recreate is handled by dc.rolloutRecreate (recreate-style update), RollingUpdate by dc.rolloutRolling (rolling update).
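
Putting the ten steps together, here is a condensed sketch of syncDeployment's dispatch order (logging, the empty-selector guard, and some error handling trimmed; the full version lives in pkg/controller/deployment/deployment_controller.go):

func (dc *DeploymentController) syncDeployment(ctx context.Context, key string) error {
	namespace, name, err := cache.SplitMetaNamespaceKey(key)
	if err != nil {
		return err
	}

	// (1) time the whole reconciliation pass
	startTime := time.Now()
	defer func() {
		klog.FromContext(ctx).V(4).Info("Finished syncing deployment", "duration", time.Since(startTime))
	}()

	// (2) fetch the Deployment from the informer cache
	deployment, err := dc.dLister.Deployments(namespace).Get(name)
	if errors.IsNotFound(err) {
		return nil
	}
	if err != nil {
		return err
	}
	d := deployment.DeepCopy()

	// (3) adopt/orphan ReplicaSets; (4) group Pods by owning ReplicaSet UID
	rsList, err := dc.getReplicaSetsForDeployment(ctx, d)
	if err != nil {
		return err
	}
	podMap, err := dc.getPodMapForDeployment(d, rsList)
	if err != nil {
		return err
	}

	// (5) being deleted: only recompute status, then stop
	if d.DeletionTimestamp != nil {
		return dc.syncStatusOnly(ctx, d, rsList)
	}

	// (6) reflect pause/resume in the status conditions
	if err = dc.checkPausedConditions(ctx, d); err != nil {
		return err
	}
	// (7) paused: only sync replica counts, then stop
	if d.Spec.Paused {
		return dc.sync(ctx, d, rsList)
	}

	// (8) a pending rollback request via the deprecated annotation
	if getRollbackTo(d) != nil {
		return dc.rollback(ctx, d, rsList)
	}

	// (9) a pure scaling event also short-circuits into dc.sync
	scalingEvent, err := dc.isScalingEvent(ctx, d, rsList)
	if err != nil {
		return err
	}
	if scalingEvent {
		return dc.sync(ctx, d, rsList)
	}

	// (10) otherwise dispatch on the update strategy
	switch d.Spec.Strategy.Type {
	case apps.RecreateDeploymentStrategyType:
		return dc.rolloutRecreate(ctx, d, rsList, podMap)
	case apps.RollingUpdateDeploymentStrategyType:
		return dc.rolloutRolling(ctx, d, rsList)
	}
	return fmt.Errorf("unexpected deployment strategy type: %s", d.Spec.Strategy.Type)
}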

getReplicaSetsForDeployment

Main logic

dc.getReplicaSetsForDeployment's job: fetch the ReplicaSets in the cluster that are related to the Deployment; a replicaset that matches the selector but is not yet owned by the deployment is adopted by setting its ownerReferences field, while one that is owned but no longer matches has the corresponding ownerReference removed.

The main logic:
(1) List all replicaset objects in the deployment's namespace;
(2) Call cm.ClaimReplicaSets to process them, and finally return the list of replicaset objects that match and are owned by the deployment.

	// List ReplicaSets owned by this Deployment, while reconciling ControllerRef
	// through adoption/orphaning.
	rsList, err := dc.getReplicaSetsForDeployment(ctx, d)
	if err != nil {
		return err
	}
// getReplicaSetsForDeployment uses ControllerRefManager to reconcile
// ControllerRef by adopting and orphaning.
// It returns the list of ReplicaSets that this Deployment should manage.
func (dc *DeploymentController) getReplicaSetsForDeployment(ctx context.Context, d *apps.Deployment) ([]*apps.ReplicaSet, error) {
	// List all ReplicaSets to find those we own but that no longer match our
	// selector. They will be orphaned by ClaimReplicaSets().
	rsList, err := dc.rsLister.ReplicaSets(d.Namespace).List(labels.Everything())
	if err != nil {
		return nil, err
	}
	deploymentSelector, err := metav1.LabelSelectorAsSelector(d.Spec.Selector)
	if err != nil {
		return nil, fmt.Errorf("deployment %s/%s has invalid label selector: %v", d.Namespace, d.Name, err)
	}
	// If any adoptions are attempted, we should first recheck for deletion with
	// an uncached quorum read sometime after listing ReplicaSets (see #42639).
	canAdoptFunc := controller.RecheckDeletionTimestamp(func(ctx context.Context) (metav1.Object, error) {
		fresh, err := dc.client.AppsV1().Deployments(d.Namespace).Get(ctx, d.Name, metav1.GetOptions{})
		if err != nil {
			return nil, err
		}
		if fresh.UID != d.UID {
			return nil, fmt.Errorf("original Deployment %v/%v is gone: got uid %v, wanted %v", d.Namespace, d.Name, fresh.UID, d.UID)
		}
		return fresh, nil
	})
	cm := controller.NewReplicaSetControllerRefManager(dc.rsControl, d, deploymentSelector, controllerKind, canAdoptFunc)
	return cm.ClaimReplicaSets(ctx, rsList)
}

cm.ClaimReplicaSets

Iterate over all replicaset objects in the same namespace as the deployment and call m.ClaimObject on each. m.ClaimObject adopts a replicaset that matches the selector but has no owning deployment by setting its ownerReferences field, and releases one that is owned but no longer matches by deleting the corresponding ownerReference.

// ClaimReplicaSets tries to take ownership of a list of ReplicaSets.
//
// It will reconcile the following:
//   - Adopt orphans if the selector matches.
//   - Release owned objects if the selector no longer matches.
//
// A non-nil error is returned if some form of reconciliation was attempted and
// failed. Usually, controllers should try again later in case reconciliation
// is still needed.
//
// If the error is nil, either the reconciliation succeeded, or no
// reconciliation was necessary. The list of ReplicaSets that you now own is
// returned.
func (m *ReplicaSetControllerRefManager) ClaimReplicaSets(ctx context.Context, sets []*apps.ReplicaSet) ([]*apps.ReplicaSet, error) {
	var claimed []*apps.ReplicaSet
	var errlist []error

	match := func(obj metav1.Object) bool {
		return m.Selector.Matches(labels.Set(obj.GetLabels()))
	}
	adopt := func(ctx context.Context, obj metav1.Object) error {
		return m.AdoptReplicaSet(ctx, obj.(*apps.ReplicaSet))
	}
	release := func(ctx context.Context, obj metav1.Object) error {
		return m.ReleaseReplicaSet(ctx, obj.(*apps.ReplicaSet))
	}

	for _, rs := range sets {
		ok, err := m.ClaimObject(ctx, rs, match, adopt, release)
		if err != nil {
			errlist = append(errlist, err)
			continue
		}
		if ok {
			claimed = append(claimed, rs)
		}
	}
	return claimed, utilerrors.NewAggregate(errlist)
}
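
The per-object decision is made by the generic m.ClaimObject. Here is a condensed sketch of its decision tree; the upstream version in pkg/controller/controller_ref_manager.go additionally tolerates NotFound errors from adopt/release and checks namespaces:

// Condensed sketch of BaseControllerRefManager.ClaimObject's decision tree.
func (m *BaseControllerRefManager) ClaimObject(ctx context.Context, obj metav1.Object,
	match func(metav1.Object) bool,
	adopt, release func(context.Context, metav1.Object) error) (bool, error) {

	if controllerRef := metav1.GetControllerOf(obj); controllerRef != nil {
		if controllerRef.UID != m.Controller.GetUID() {
			return false, nil // owned by another controller: ignore
		}
		if match(obj) {
			return true, nil // ours and still matching: keep it
		}
		if m.Controller.GetDeletionTimestamp() != nil {
			return false, nil // we are being deleted: mutate nothing
		}
		// ours but the selector no longer matches: remove the ownerReference
		if err := release(ctx, obj); err != nil {
			return false, err
		}
		return false, nil
	}

	// orphan: adopt only if we are alive, the selector matches, and the
	// object itself is not being deleted
	if m.Controller.GetDeletionTimestamp() != nil || !match(obj) {
		return false, nil
	}
	if obj.GetDeletionTimestamp() != nil {
		return false, nil
	}
	if err := adopt(ctx, obj); err != nil {
		return false, err
	}
	return true, nil
}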

How should we understand adopt and release?

// AdoptReplicaSet sends a patch to take control of the ReplicaSet. It returns
// the error if the patching fails.
func (m *ReplicaSetControllerRefManager) AdoptReplicaSet(ctx context.Context, rs *apps.ReplicaSet) error {
	if err := m.CanAdopt(ctx); err != nil {
		return fmt.Errorf("can't adopt ReplicaSet %v/%v (%v): %v", rs.Namespace, rs.Name, rs.UID, err)
	}
	// Note that ValidateOwnerReferences() will reject this patch if another
	// OwnerReference exists with controller=true.
	patchBytes, err := ownerRefControllerPatch(m.Controller, m.controllerKind, rs.UID)
	if err != nil {
		return err
	}
	return m.rsControl.PatchReplicaSet(ctx, rs.Namespace, rs.Name, patchBytes)
}

func ownerRefControllerPatch(controller metav1.Object, controllerKind schema.GroupVersionKind, uid types.UID, finalizers ...string) ([]byte, error) {
	blockOwnerDeletion := true
	isController := true
	addControllerPatch := objectForAddOwnerRefPatch{
		Metadata: objectMetaForPatch{
			UID: uid,
			OwnerReferences: []metav1.OwnerReference{
				{
					APIVersion:         controllerKind.GroupVersion().String(),
					Kind:               controllerKind.Kind,
					Name:               controller.GetName(),
					UID:                controller.GetUID(),
					Controller:         &isController,
					BlockOwnerDeletion: &blockOwnerDeletion,
				},
			},
			Finalizers: finalizers,
		},
	}
	patchBytes, err := json.Marshal(&addControllerPatch)
	if err != nil {
		return nil, err
	}
	return patchBytes, nil
}
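
To make the adoption concrete, the marshaled patch body looks roughly like the following (the UIDs and name are placeholders). Note that the patch carries the ReplicaSet's own uid: the API server rejects the patch if the target object's UID differs, so the controller can never accidentally adopt a ReplicaSet that was deleted and recreated under the same name:

{
	"metadata": {
		"uid": "<replicaset-uid>",
		"ownerReferences": [
			{
				"apiVersion": "apps/v1",
				"kind": "Deployment",
				"name": "<deployment-name>",
				"uid": "<deployment-uid>",
				"controller": true,
				"blockOwnerDeletion": true
			}
		]
	}
}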

// How the patch is applied: a strategic-merge PATCH against the ReplicaSet
func (r RealRSControl) PatchReplicaSet(ctx context.Context, namespace, name string, data []byte) error {
	_, err := r.KubeClient.AppsV1().ReplicaSets(namespace).Patch(ctx, name, types.StrategicMergePatchType, data, metav1.PatchOptions{})
	return err
}


// release: build the strategic-merge patch that removes the matching ownerReference entries
func GenerateDeleteOwnerRefStrategicMergeBytes(dependentUID types.UID, ownerUIDs []types.UID, finalizers ...string) ([]byte, error) {
	var ownerReferences []map[string]string
	for _, ownerUID := range ownerUIDs {
		ownerReferences = append(ownerReferences, ownerReference(ownerUID, "delete"))
	}
	patch := objectForDeleteOwnerRefStrategicMergePatch{
		Metadata: objectMetaForMergePatch{
			UID:              dependentUID,
			OwnerReferences:  ownerReferences,
			DeleteFinalizers: finalizers,
		},
	}
	patchBytes, err := json.Marshal(&patch)
	if err != nil {
		return nil, err
	}
	return patchBytes, nil
}
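
The resulting release patch uses the strategic-merge "$patch": "delete" directive, so only the ownerReference entries whose uid matches are removed; roughly:

{
	"metadata": {
		"uid": "<replicaset-uid>",
		"ownerReferences": [
			{"$patch": "delete", "uid": "<deployment-uid>"}
		]
	}
}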

Why do we RecheckDeletionTimestamp before adopting?

Reference: https://github.com/kubernetes/kubernetes/issues/42639

According to the log of the first linked failure, and a local reproduction, the RS was not orphaned because the deployment controller re-adopted the RS that was just orphaned by the garbage collector.

I propose this fix:
To prevent the race, the deployment controller should get the latest deployment from API server and checks its deletionTimestamp, instead of checking its local cache, before adopting the RS. This can completely prevent re-adoption. Because if the deployment tries to adopt the RS, it means the RS lacks the controllerRef, which means the garbage collector has orphaned the RS, which means the deletionTimestamp must have already been set, so the proposed check will prevent the re-adoption. (but race for new-adoption is still possible)

The GC expects that once it sees a controller with a non-nil
DeletionTimestamp, that controller will not attempt any adoption.
There was a known race condition that could cause a controller to
re-adopt something orphaned by the GC, because the controller is using a
cached value of its own spec from before DeletionTimestamp was set.

This fixes that race by doing an uncached quorum read of the controller
spec just before the first adoption attempt. It’s important that this
read occurs after listing potential orphans. Note that this uncached
read is skipped if no adoptions are attempted (i.e. at steady state).
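
controller.RecheckDeletionTimestamp is what turns that fix into code: it wraps the uncached quorum read supplied by canAdoptFunc and vetoes adoption if the freshly-read Deployment is already marked for deletion (shape as in pkg/controller/controller_ref_manager.go):

// RecheckDeletionTimestamp returns a CanAdopt() function to recheck deletion.
// The returned function calls getObject() to fetch the latest value, and
// denies adoption attempts if that object has a non-nil DeletionTimestamp.
func RecheckDeletionTimestamp(getObject func(context.Context) (metav1.Object, error)) func(context.Context) error {
	return func(ctx context.Context) error {
		obj, err := getObject(ctx)
		if err != nil {
			return fmt.Errorf("can't recheck DeletionTimestamp: %v", err)
		}
		if obj.GetDeletionTimestamp() != nil {
			return fmt.Errorf("%v/%v has just been deleted at %v", obj.GetNamespace(), obj.GetName(), obj.GetDeletionTimestamp())
		}
		return nil
	}
}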

dc.rollback

As reviewed above, getRollbackTo first checks whether the deployment object's annotations contain the key deprecated.deployment.rollback.to; if the key is present and its value is non-empty, dc.rollback is called to perform the rollback.

// TODO: Remove this when extensions/v1beta1 and apps/v1beta1 Deployment are dropped.
func getRollbackTo(d *apps.Deployment) *extensions.RollbackConfig {
	// Extract the annotation used for round-tripping the deprecated RollbackTo field.
	revision := d.Annotations[apps.DeprecatedRollbackTo]
	if revision == "" {
		return nil
	}
	revision64, err := strconv.ParseInt(revision, 10, 64)
	if err != nil {
		// If it's invalid, ignore it.
		return nil
	}
	return &extensions.RollbackConfig{
		Revision: revision64,
	}
}
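
For illustration, a client requests a rollback to revision 2 by setting this annotation on the Deployment (the object name is a placeholder); a value of "0" means "roll back to the previous revision", as handled in dc.rollback below:

apiVersion: apps/v1
kind: Deployment
metadata:
  name: nginx-deployment   # illustrative
  annotations:
    deprecated.deployment.rollback.to: "2"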

The main logic of dc.rollback:
(1) Fetch all replicaset objects that match and are owned by the deployment;
(2) Determine the revision to roll back to;
(3) Iterate over the replicaset list obtained above and compare each one's revision with the target revision; on a match, call dc.rollbackToTemplate to perform the rollback (essentially updating the deployment's .Spec.Template from that revision's replicaset object);
(4) Finally, whether or not the rollback succeeded, clear the deployment's deprecated rollback annotation (the round-trip of the old .spec.rollbackTo field) and update the deployment object.

// rollback the deployment to the specified revision. In any case cleanup the rollback spec.
func (dc *DeploymentController) rollback(ctx context.Context, d *apps.Deployment, rsList []*apps.ReplicaSet) error {
	logger := klog.FromContext(ctx)
	newRS, allOldRSs, err := dc.getAllReplicaSetsAndSyncRevision(ctx, d, rsList, true)
	if err != nil {
		return err
	}

	allRSs := append(allOldRSs, newRS)
	rollbackTo := getRollbackTo(d)
	// If rollback revision is 0, rollback to the last revision
	if rollbackTo.Revision == 0 {
		if rollbackTo.Revision = deploymentutil.LastRevision(logger, allRSs); rollbackTo.Revision == 0 {
			// If we still can't find the last revision, gives up rollback
			dc.emitRollbackWarningEvent(d, deploymentutil.RollbackRevisionNotFound, "Unable to find last revision.")
			// Gives up rollback
			return dc.updateDeploymentAndClearRollbackTo(ctx, d)
		}
	}
	for _, rs := range allRSs {
		v, err := deploymentutil.Revision(rs)
		if err != nil {
			logger.V(4).Info("Unable to extract revision from deployment's replica set", "replicaSet", klog.KObj(rs), "err", err)
			continue
		}
		if v == rollbackTo.Revision {
			logger.V(4).Info("Found replica set with desired revision", "replicaSet", klog.KObj(rs), "revision", v)
			// rollback by copying podTemplate.Spec from the replica set
			// revision number will be incremented during the next getAllReplicaSetsAndSyncRevision call
			// no-op if the spec matches current deployment's podTemplate.Spec
			performedRollback, err := dc.rollbackToTemplate(ctx, d, rs)
			if performedRollback && err == nil {
				dc.emitRollbackNormalEvent(d, fmt.Sprintf("Rolled back deployment %q to revision %d", d.Name, rollbackTo.Revision))
			}
			return err
		}
	}
	dc.emitRollbackWarningEvent(d, deploymentutil.RollbackRevisionNotFound, "Unable to find the revision to rollback to.")
	// Gives up rollback
	return dc.updateDeploymentAndClearRollbackTo(ctx, d)
}
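
For completeness, dc.rollbackToTemplate and the cleanup helper it shares with the failure paths look roughly like this (lightly condensed from pkg/controller/deployment/rollback.go):

// rollbackToTemplate compares the deployment's template with the ReplicaSet's
// and, if they differ, copies the ReplicaSet's template (and annotations)
// back onto the deployment. It returns true if a rollback was performed.
func (dc *DeploymentController) rollbackToTemplate(ctx context.Context, d *apps.Deployment, rs *apps.ReplicaSet) (bool, error) {
	performedRollback := false
	if !deploymentutil.EqualIgnoreHash(&d.Spec.Template, &rs.Spec.Template) {
		deploymentutil.SetFromReplicaSetTemplate(d, rs.Spec.Template)
		deploymentutil.SetDeploymentAnnotationsTo(d, rs)
		performedRollback = true
	} else {
		// the target revision already has the current template: no-op rollback
		dc.emitRollbackWarningEvent(d, deploymentutil.RollbackTemplateUnchanged,
			fmt.Sprintf("The rollback revision contains the same template as current deployment %q", d.Name))
	}
	// in every exit path, clear the rollback annotation and persist the object
	return performedRollback, dc.updateDeploymentAndClearRollbackTo(ctx, d)
}

// updateDeploymentAndClearRollbackTo removes the deprecated rollback
// annotation and writes the deployment back to the API server.
func (dc *DeploymentController) updateDeploymentAndClearRollbackTo(ctx context.Context, d *apps.Deployment) error {
	setRollbackTo(d, nil)
	_, err := dc.client.AppsV1().Deployments(d.Namespace).Update(ctx, d, metav1.UpdateOptions{})
	return err
}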
