ReplicaSet维护一组在任何时候都要处于运行状态的 Pod 副本,这些Pod副本组成了一个集合。

  • 控制器会持续调整实际运行的 Pod 数量,使其与期望的副本数(spec.replicas)保持一致
  • 每个ReplicaSet所管理的Pod完全相同









func startReplicaSetController(ctx ControllerContext) (http.Handler, bool, error) {
	go replicaset.NewReplicaSetController(
	).Run(int(ctx.ComponentConfig.ReplicaSetController.ConcurrentRSSyncs), ctx.Stop)
	return nil, true, nil




// Run begins watching and syncing.
func (rsc *ReplicaSetController) Run(workers int, stopCh <-chan struct{}) {
	defer utilruntime.HandleCrash()
	defer rsc.queue.ShutDown()

	controllerName := strings.ToLower(rsc.Kind)
	klog.Infof("Starting %v controller", controllerName)
	defer klog.Infof("Shutting down %v controller", controllerName)

	if !cache.WaitForNamedCacheSync(rsc.Kind, stopCh, rsc.podListerSynced, rsc.rsListerSynced) {

	for i := 0; i < workers; i++ {
		go wait.Until(rsc.worker, time.Second, stopCh)



// RecommendedDefaultReplicaSetControllerConfiguration defaults a pointer to a
// ReplicaSetControllerConfiguration struct. This will set the recommended default
// values, but they may be subject to change between API versions. This function
// is intentionally not registered in the scheme as a "normal" `SetDefaults_Foo`
// function to allow consumers of this type to set whatever defaults for their
// embedded configs. Forcing consumers to use these defaults would be problematic
// as defaulting in the scheme is done as part of the conversion, and there would
// be no easy way to opt-out. Instead, if you want to use this defaulting method
// run it in your wrapper struct of this type in its `SetDefaults_` method.
func RecommendedDefaultReplicaSetControllerConfiguration(obj *kubectrlmgrconfigv1alpha1.ReplicaSetControllerConfiguration) {
	if obj.ConcurrentRSSyncs == 0 {
		obj.ConcurrentRSSyncs = 5


// worker runs a worker thread that just dequeues items, processes them, and marks them done.
// It enforces that the syncHandler is never invoked concurrently with the same key.
func (rsc *ReplicaSetController) worker() {
	for rsc.processNextWorkItem() {

func (rsc *ReplicaSetController) processNextWorkItem() bool {
	key, quit := rsc.queue.Get()
	if quit {
		return false
	defer rsc.queue.Done(key)

	err := rsc.syncHandler(key.(string))
	if err == nil {
		return true

	utilruntime.HandleError(fmt.Errorf("sync %q failed with %v", key, err))

	return true




// RateLimitingInterface is an interface that rate limits items being added to the queue.
// It extends DelayingInterface, so it also offers AddAfter and the basic
// Add/Get/Done operations.
type RateLimitingInterface interface {
	DelayingInterface

	// AddRateLimited adds an item to the workqueue after the rate limiter says it's ok
	AddRateLimited(item interface{})

	// Forget indicates that an item is finished being retried.  Doesn't matter whether it's for perm failing
	// or for success, we'll stop the rate limiter from tracking it.  This only clears the `rateLimiter`, you
	// still have to call `Done` on the queue.
	Forget(item interface{})

	// NumRequeues returns back how many times the item was requeued
	NumRequeues(item interface{}) int
}

// DelayingInterface is an Interface that can Add an item at a later time. This makes it easier to
// requeue items after failures without ending up in a hot-loop.
type DelayingInterface interface {
	Interface

	// AddAfter adds an item to the workqueue after the indicated duration has passed
	AddAfter(item interface{}, duration time.Duration)
}

// Interface is the basic work-queue contract: producers Add items, consumers
// Get them and must call Done once processing has finished.
type Interface interface {
	// Add marks item as needing processing.
	Add(item interface{})
	// Get blocks until it can return an item to be processed. If shutdown = true,
	// the caller should end their goroutine. You must call Done with item when you
	// have finished processing it.
	Get() (item interface{}, shutdown bool)
	// Done marks item as done processing, and if it has been marked as dirty again
	// while it was being processed, it will be re-added to the queue for
	// re-processing.
	Done(item interface{})
	// ... remaining methods are omitted in this excerpt.
}




// ReplicaSetController is the controller type discussed in this walkthrough.
// Only the field relevant to the sync entry point is shown here.
type ReplicaSetController struct {
	// ... other fields are omitted in this excerpt.

	// To allow injection of syncReplicaSet for testing.
	syncHandler func(rsKey string) error
}



// NewBaseController is the implementation of NewReplicaSetController with additional injected
// parameters so that it can also serve as the implementation of NewReplicationController.
func NewBaseController(rsInformer appsinformers.ReplicaSetInformer, podInformer coreinformers.PodInformer, kubeClient clientset.Interface, burstReplicas int,
	gvk schema.GroupVersionKind, metricOwnerName, queueName string, podControl controller.PodControlInterface) *ReplicaSetController {
	rsc.syncHandler = rsc.syncReplicaSet

	return rsc



// syncReplicaSet will sync the ReplicaSet with the given key if it has had its expectations fulfilled,
// meaning it did not expect to see any more of its pods created or deleted. This function is not meant to be
// invoked concurrently with the same key.
func (rsc *ReplicaSetController) syncReplicaSet(key string) error {
	startTime := time.Now()
	defer func() {
		klog.V(4).Infof("Finished syncing %v %q (%v)", rsc.Kind, key, time.Since(startTime))

	namespace, name, err := cache.SplitMetaNamespaceKey(key)
	if err != nil {
		return err
	rs, err := rsc.rsLister.ReplicaSets(namespace).Get(name)
	if apierrors.IsNotFound(err) {
		klog.V(4).Infof("%v %v has been deleted", rsc.Kind, key)
		return nil
	if err != nil {
		return err

	rsNeedsSync := rsc.expectations.SatisfiedExpectations(key)
	selector, err := metav1.LabelSelectorAsSelector(rs.Spec.Selector)
	if err != nil {
		utilruntime.HandleError(fmt.Errorf("error converting pod selector to selector: %v", err))
		return nil

	// list all pods to include the pods that don't match the rs`s selector
	// anymore but has the stale controller ref.
	// TODO: Do the List and Filter in a single pass, or use an index.
	allPods, err := rsc.podLister.Pods(rs.Namespace).List(labels.Everything())
	if err != nil {
		return err
	// Ignore inactive pods.
	filteredPods := controller.FilterActivePods(allPods)

	// NOTE: filteredPods are pointing to objects from cache - if you need to
	// modify them, you need to copy it first.
	filteredPods, err = rsc.claimPods(rs, selector, filteredPods)
	if err != nil {
		return err

	var manageReplicasErr error
	if rsNeedsSync && rs.DeletionTimestamp == nil {
		manageReplicasErr = rsc.manageReplicas(filteredPods, rs)
	rs = rs.DeepCopy()
	newStatus := calculateStatus(rs, filteredPods, manageReplicasErr)

	// Always updates status as pods come up or die.
	updatedRS, err := updateReplicaSetStatus(rsc.kubeClient.AppsV1().ReplicaSets(rs.Namespace), rs, newStatus)
	if err != nil {
		// Multiple things could lead to this update failing. Requeuing the replica set ensures
		// Returning an error causes a requeue without forcing a hotloop
		return err
	// Resync the ReplicaSet after MinReadySeconds as a last line of defense to guard against clock-skew.
	if manageReplicasErr == nil && updatedRS.Spec.MinReadySeconds > 0 &&
		updatedRS.Status.ReadyReplicas == *(updatedRS.Spec.Replicas) &&
		updatedRS.Status.AvailableReplicas != *(updatedRS.Spec.Replicas) {
		rsc.queue.AddAfter(key, time.Duration(updatedRS.Spec.MinReadySeconds)*time.Second)
	return manageReplicasErr


  1. 根据队列中给定的ReplicasSet key,认领所涉及到的Pods
  2. 管理副本数Replicas
  3. 计算状态,并更新状态
  4. 在 MinReadySeconds 之后将 ReplicaSet 重新入队,再同步一次
  1. 根据输入的 key 获取对应的 ReplicaSet
func (rsc *ReplicaSetController) syncReplicaSet(key string) error {
	namespace, name, err := cache.SplitMetaNamespaceKey(key)
	if err != nil {
		return err

// SplitMetaNamespaceKey returns the namespace and name that
// MetaNamespaceKeyFunc encoded into key.
//
// A key without "/" is treated as a bare name with an empty namespace; a key
// with exactly one "/" is split into namespace and name. Any other shape
// yields an error.
//
// TODO: replace key-as-string with a key-as-struct so that this
// packing/unpacking won't be necessary.
func SplitMetaNamespaceKey(key string) (namespace, name string, err error) {
	parts := strings.Split(key, "/")
	switch len(parts) {
	case 1:
		// name only, no namespace
		return "", parts[0], nil
	case 2:
		// namespace and name
		return parts[0], parts[1], nil
	}

	return "", "", fmt.Errorf("unexpected key format: %q", key)
}


	rs, err := rsc.rsLister.ReplicaSets(namespace).Get(name)
	if apierrors.IsNotFound(err) {
		klog.V(4).Infof("%v %v has been deleted", rsc.Kind, key)
		return nil


	// A store of ReplicaSets, populated by the shared informer passed to NewReplicaSetController
	rsLister appslisters.ReplicaSetLister


	// A TTLCache of pod creates/deletes each rc expects to see.
	expectations *controller.UIDTrackingControllerExpectations


// ControllerExpectations is a cache mapping controllers to what they expect to see before being woken up for a sync.


  1. 判断是否要同步

  2. 获取所有Pods

    列出所有 Pods的目的是为了包含有过时控制器引用的 Pods,这些Pods与当前ReplicaSet的选择器是不匹配的

  3. 过滤Pods


    func IsPodActive(p *v1.Pod) bool {
    return v1.PodSucceeded != p.Status.Phase &&
    	v1.PodFailed != p.Status.Phase &&
    	p.DeletionTimestamp == nil
  4. 认领

    func (rsc *ReplicaSetController) claimPods(rs *apps.ReplicaSet, selector labels.Selector, filteredPods []*v1.Pod) ([]*v1.Pod, error) {
    // If any adoptions are attempted, we should first recheck for deletion with
    // an uncached quorum read sometime after listing Pods (see #42639).
    canAdoptFunc := controller.RecheckDeletionTimestamp(func() (metav1.Object, error) {
    	fresh, err := rsc.kubeClient.AppsV1().ReplicaSets(rs.Namespace).Get(context.TODO(), rs.Name, metav1.GetOptions{})
    	if err != nil {
    		return nil, err
    	if fresh.UID != rs.UID {
    		return nil, fmt.Errorf("original %v %v/%v is gone: got uid %v, wanted %v", rsc.Kind, rs.Namespace, rs.Name, fresh.UID, rs.UID)
    	return fresh, nil
    cm := controller.NewPodControllerRefManager(rsc.podControl, rs, selector, rsc.GroupVersionKind, canAdoptFunc)
    return cm.ClaimPods(filteredPods)


    // ClaimPods 尝试获取一组Pods的所有者。
    // 它将协调以下内容:
    // 	- 如果Selector匹配,则认领孤儿pod。
    // 	- 如果Selector不再匹配,则释放拥有的对象。
    // 可选:如果指定了一个或多个过滤器,则只有在所有过滤器都返回 true 时才会认领Pod。
    // 如果尝试某种形式的协调失败,则返回非 nil 错误。通常,如果仍需要协调,控制器应稍后重试。
    // 如果错误为 nil,则协调成功,或者不需要协调。将返回现在的 Pod 列表。
    func (m *PodControllerRefManager) ClaimPods(pods []*v1.Pod, filters ...func(*v1.Pod) bool) ([]*v1.Pod, error) {
    var claimed []*v1.Pod
    var errlist []error
    match := func(obj metav1.Object) bool {
    	pod := obj.(*v1.Pod)
    	// Check selector first so filters only run on potentially matching Pods.
    	if !m.Selector.Matches(labels.Set(pod.Labels)) {
    		return false
    	for _, filter := range filters {
    		if !filter(pod) {
    			return false
    	return true
    adopt := func(obj metav1.Object) error {
    	return m.AdoptPod(obj.(*v1.Pod))
    release := func(obj metav1.Object) error {
    	return m.ReleasePod(obj.(*v1.Pod))
    for _, pod := range pods {
    	ok, err := m.ClaimObject(pod, match, adopt, release)
    	if err != nil {
    		errlist = append(errlist, err)
    	if ok {
    		claimed = append(claimed, pod)
    return claimed, utilerrors.NewAggregate(errlist)


    • 如果 Pod 是孤儿(没有控制器引用)且与 Selector 匹配,ReplicaSet 就认领它,把自己写入 Pod 的 OwnerReferences
    • 如果 Pod 已归该 ReplicaSet 所有、但不再与 Selector 匹配,则释放这个 Pod,把自己从 Pod 的 OwnerReferences 中移除


func (rsc *ReplicaSetController) syncReplicaSet(key string) error {
	// 省略...
	rsNeedsSync := rsc.expectations.SatisfiedExpectations(key)
	// 省略...
	if rsNeedsSync && rs.DeletionTimestamp == nil {
		manageReplicasErr = rsc.manageReplicas(filteredPods, rs)
	// 省略...

在前面对 SatisfiedExpectations 的解析中提到:rsNeedsSync 为 true,表示该 ReplicaSet 所期望的创建(adds)和删除(dels)都已经被观察到,此时才需要真正去管理副本数。期望的计数由 ReplicaSetController 在同步时建立,并在控制器观察到其所管理的 Pod 被创建或删除时更新。

// manageReplicas compares the number of claimed pods with rs.Spec.Replicas and
// creates or deletes pods to close the gap. This is an abridged excerpt: most
// of the body is intentionally omitted and only the skeleton is shown.
func (rsc *ReplicaSetController) manageReplicas(filteredPods []*v1.Pod, rs *apps.ReplicaSet) error {
	// diff < 0: too few pods (scale up); diff > 0: too many pods (scale down).
	diff := len(filteredPods) - int(*(rs.Spec.Replicas))
	rsKey, err := controller.KeyFunc(rs)
	if err != nil {
		utilruntime.HandleError(fmt.Errorf("couldn't get key for %v %#v: %v", rsc.Kind, rs, err))
		return nil
	if diff < 0 {
		diff *= -1
		// Cap the number of pods created in one sync at burstReplicas.
		if diff > rsc.burstReplicas {
			diff = rsc.burstReplicas
		// ... a lot of code omitted ...
		err := rsc.podControl.CreatePods(rs.Namespace, &rs.Spec.Template, rs, metav1.NewControllerRef(rs, rsc.GroupVersionKind))
		// ... a lot of code omitted ...
	} else if diff > 0 {
		// ... a lot of code omitted ...
		if err := rsc.podControl.DeletePod(rs.Namespace, targetPod.Name, rs); err != nil {
		// ... a lot of code omitted ...

		// ... a lot of code omitted ...


	return nil


		// TODO:像删除一样跟踪创建的 UID。
		// 目前的问题是我们需要等待创建的结果来记录 pod 的 UID,这将需要在创建过程中锁定,这将成为性能瓶颈。
		// 我们应该为 pod 生成一个 UID 预先并通过 ExpectCreations 存储它。
		rsc.expectations.ExpectCreations(rsKey, diff)
		// 批量执行pod的创建。批量大小从 SlowStartInitialBatchSize 开始,并在每次成功迭代时以“慢启动”的方式加倍(x2)。
		// 这个处理尝试启动大量 Pod ,这些 Pod 可能会因相同的错误而失败。
		// 例如,一个配额较低的项目尝试创建大量 pod,在其 pod 之一失败后,将无法向 API 服务发送 pod 创建请求。
		// 方便的是,这还可以防止这些故障可能生成的垃圾event。
		successfulCreations, err := slowStartBatch(diff, controller.SlowStartInitialBatchSize, func() error {
			err := rsc.podControl.CreatePods(rs.Namespace, &rs.Spec.Template, rs, metav1.NewControllerRef(rs, rsc.GroupVersionKind))
			if err != nil {
				if apierrors.HasStatusCause(err, v1.NamespaceTerminatingCause) {
					// 如果命名空间被终止,我们不必执行任何操作,因为任何创建都会失败
					return nil
			return err

		// 任何我们从未尝试启动的、被跳过的Pod 都是背离预期的。跳过的 Pod 稍后将重试。下一次控制器重新同步将重试慢启动过程。
		if skippedPods := diff - successfulCreations; skippedPods > 0 {
			for i := 0; i < skippedPods; i++ {
				// Decrement the expected number of creates because the informer won't observe this pod
		return err


// slowStartBatch tries to call the provided function a total of 'count' times,
// starting slow to check for errors, then speeding up if calls succeed.
//
// It groups the calls into batches, starting with a group of initialBatchSize.
// Within each batch, it may call the function multiple times concurrently.
//
// If a whole batch succeeds, the next batch may get exponentially larger (x2).
// If there are any failures in a batch, all remaining batches are skipped
// after waiting for the current batch to complete.
//
// It returns the number of successful calls to the function, together with
// one of the errors from the failing batch (nil when every call succeeded).
// A non-positive count or initialBatchSize results in no calls at all.
func slowStartBatch(count int, initialBatchSize int, fn func() error) (int, error) {
	remaining := count
	successes := 0
	batchSize := initialBatchSize
	for remaining > 0 && batchSize > 0 {
		// Never launch more calls than are still outstanding.
		if batchSize > remaining {
			batchSize = remaining
		}

		// Buffered to batchSize so no goroutine can block on send.
		errCh := make(chan error, batchSize)
		var wg sync.WaitGroup
		wg.Add(batchSize)
		for i := 0; i < batchSize; i++ {
			go func() {
				defer wg.Done()
				if err := fn(); err != nil {
					errCh <- err
				}
			}()
		}
		// The error count is only meaningful once the whole batch finished.
		wg.Wait()

		failures := len(errCh)
		successes += batchSize - failures
		if failures > 0 {
			return successes, <-errCh
		}
		remaining -= batchSize
		batchSize *= 2
	}
	return successes, nil
}


relatedPods, err := rsc.getIndirectlyRelatedPods(rs)

		// 选择要删除的 Pod,优先选择启动早期阶段的 Pod。
		podsToDelete := getPodsToDelete(filteredPods, relatedPods, diff)

		// 为我们期望看到被删除pod 的 UID (ns/name)进行快照,因此当删除时间戳的更新或删除时,要准确记录一次他们的期望。
		// 请注意,如果 pod/rs 上的标签发生变化导致该 pod 成为孤立的,则即使其他 pod 被删除,rs 也只会在期望过期后唤醒。
		rsc.expectations.ExpectDeletions(rsKey, getPodKeys(podsToDelete))

		errCh := make(chan error, diff)
		var wg sync.WaitGroup
		for _, pod := range podsToDelete {
			go func(targetPod *v1.Pod) {
				defer wg.Done()
				if err := rsc.podControl.DeletePod(rs.Namespace, targetPod.Name, rs); err != nil {
					// 减少预期的删除次数,因为informer不会观察到此删除
					podKey := controller.PodKey(targetPod)
					rsc.expectations.DeletionObserved(rsKey, podKey)
					if !apierrors.IsNotFound(err) {
						klog.V(2).Infof("Failed to delete %v, decremented expectations for %v %s/%s", podKey, rsc.Kind, rs.Namespace, rs.Name)
						errCh <- err

		select {
		case err := <-errCh:
			// all errors have been reported before and they're likely to be the same, so we'll only return the first one we hit.
			if err != nil {
				return err


  1. 未分配node节点的 Pod 会优先删除。
  2. Pending会优先删除,Running最后删除,PodUnknown 介于两者之间
  3. Not ready的 Pod 优先删除
  4. 如果启用了 PodDeletionCost 特性并设置了 pod-deletion-cost 注解,则注解值较小的 Pod 优先删除
  5. 如果 Pod 的 Rank 不同(Rank 反映同一节点上属于同一 ReplicaSet 的就绪 Pod 的多少),则 Rank 较高的 Pod 优先删除
  6. 如果两个 Pod 均已ready,但准备时间不同,则准备时间较短的 Pod 优先删除
  7. 如果一个 Pod 的container重启次数多于另一个 Pod 中任何container的重启次数,则包含重启次数较多container的 Pod 优先删除
  8. 如果 Pod 的创建时间不同,则最近创建的 Pod 会优先删除
// Less compares two pods with corresponding ranks and returns true if the first
// one should be preferred for deletion.
func (s ActivePodsWithRanks) Less(i, j int) bool {
	// 1. Unassigned < assigned
	// If only one of the pods is unassigned, the unassigned one is smaller
	if s.Pods[i].Spec.NodeName != s.Pods[j].Spec.NodeName && (len(s.Pods[i].Spec.NodeName) == 0 || len(s.Pods[j].Spec.NodeName) == 0) {
		return len(s.Pods[i].Spec.NodeName) == 0
	// 2. PodPending < PodUnknown < PodRunning
	if podPhaseToOrdinal[s.Pods[i].Status.Phase] != podPhaseToOrdinal[s.Pods[j].Status.Phase] {
		return podPhaseToOrdinal[s.Pods[i].Status.Phase] < podPhaseToOrdinal[s.Pods[j].Status.Phase]
	// 3. Not ready < ready
	// If only one of the pods is not ready, the not ready one is smaller
	if podutil.IsPodReady(s.Pods[i]) != podutil.IsPodReady(s.Pods[j]) {
		return !podutil.IsPodReady(s.Pods[i])

	// 4. higher pod-deletion-cost < lower pod-deletion cost
	if utilfeature.DefaultFeatureGate.Enabled(features.PodDeletionCost) {
		pi, _ := helper.GetDeletionCostFromPodAnnotations(s.Pods[i].Annotations)
		pj, _ := helper.GetDeletionCostFromPodAnnotations(s.Pods[j].Annotations)
		if pi != pj {
			return pi < pj

	// 5. Doubled up < not doubled up
	// If one of the two pods is on the same node as one or more additional
	// ready pods that belong to the same replicaset, whichever pod has more
	// colocated ready pods is less
	if s.Rank[i] != s.Rank[j] {
		return s.Rank[i] > s.Rank[j]
	// TODO: take availability into account when we push minReadySeconds information from deployment into pods,
	//       see
	// 6. Been ready for empty time < less time < more time
	// If both pods are ready, the latest ready one is smaller
	if podutil.IsPodReady(s.Pods[i]) && podutil.IsPodReady(s.Pods[j]) {
		readyTime1 := podReadyTime(s.Pods[i])
		readyTime2 := podReadyTime(s.Pods[j])
		if !readyTime1.Equal(readyTime2) {
			if !utilfeature.DefaultFeatureGate.Enabled(features.LogarithmicScaleDown) {
				return afterOrZero(readyTime1, readyTime2)
			} else {
				if s.Now.IsZero() || readyTime1.IsZero() || readyTime2.IsZero() {
					return afterOrZero(readyTime1, readyTime2)
				rankDiff := logarithmicRankDiff(*readyTime1, *readyTime2, s.Now)
				if rankDiff == 0 {
					return s.Pods[i].UID < s.Pods[j].UID
				return rankDiff < 0
	// 7. Pods with containers with higher restart counts < lower restart counts
	if maxContainerRestarts(s.Pods[i]) != maxContainerRestarts(s.Pods[j]) {
		return maxContainerRestarts(s.Pods[i]) > maxContainerRestarts(s.Pods[j])
	// 8. Empty creation time pods < newer pods < older pods
	if !s.Pods[i].CreationTimestamp.Equal(&s.Pods[j].CreationTimestamp) {
		if !utilfeature.DefaultFeatureGate.Enabled(features.LogarithmicScaleDown) {
			return afterOrZero(&s.Pods[i].CreationTimestamp, &s.Pods[j].CreationTimestamp)
		} else {
			if s.Now.IsZero() || s.Pods[i].CreationTimestamp.IsZero() || s.Pods[j].CreationTimestamp.IsZero() {
				return afterOrZero(&s.Pods[i].CreationTimestamp, &s.Pods[j].CreationTimestamp)
			rankDiff := logarithmicRankDiff(s.Pods[i].CreationTimestamp, s.Pods[j].CreationTimestamp, s.Now)
			if rankDiff == 0 {
				return s.Pods[i].UID < s.Pods[j].UID
			return rankDiff < 0
	return false



本次传入的 Pod(filteredPods)与上一步管理副本数时使用的是同一批 Pod。这里统计的是标签与 ReplicaSet 的 Pod 模板(podTemplateSpec)标签相匹配的 Pod 数量;匹配的 Pod 自身携带的标签可以多于模板中的标签。由于 podTemplateSpec 的标签是 ReplicaSet selector 的超集,所以能与模板匹配的 Pod 一定包含在 filteredPods 之中。

func calculateStatus(rs *apps.ReplicaSet, filteredPods []*v1.Pod, manageReplicasErr error) apps.ReplicaSetStatus {
	newStatus := rs.Status
	fullyLabeledReplicasCount := 0
	readyReplicasCount := 0
	availableReplicasCount := 0
	templateLabel := labels.Set(rs.Spec.Template.Labels).AsSelectorPreValidated()
	for _, pod := range filteredPods {
		if templateLabel.Matches(labels.Set(pod.Labels)) {
		if podutil.IsPodReady(pod) {
			if podutil.IsPodAvailable(pod, rs.Spec.MinReadySeconds, metav1.Now()) {

	failureCond := GetCondition(rs.Status, apps.ReplicaSetReplicaFailure)
	if manageReplicasErr != nil && failureCond == nil {
		var reason string
		if diff := len(filteredPods) - int(*(rs.Spec.Replicas)); diff < 0 {
			reason = "FailedCreate"
		} else if diff > 0 {
			reason = "FailedDelete"
		cond := NewReplicaSetCondition(apps.ReplicaSetReplicaFailure, v1.ConditionTrue, reason, manageReplicasErr.Error())
		SetCondition(&newStatus, cond)
	} else if manageReplicasErr == nil && failureCond != nil {
		RemoveCondition(&newStatus, apps.ReplicaSetReplicaFailure)

	newStatus.Replicas = int32(len(filteredPods))
	newStatus.FullyLabeledReplicas = int32(fullyLabeledReplicasCount)
	newStatus.ReadyReplicas = int32(readyReplicasCount)
	newStatus.AvailableReplicas = int32(availableReplicasCount)
	return newStatus


// ReplicaSetStatus represents the current status of a ReplicaSet.
type ReplicaSetStatus struct {
	// Replicas is the most recently observed number of replicas.
	// More info: see the Kubernetes ReplicaSet concept documentation.
	Replicas int32 `json:"replicas" protobuf:"varint,1,opt,name=replicas"`

	// The number of pods that have labels matching the labels of the pod template of the replicaset.
	// +optional
	FullyLabeledReplicas int32 `json:"fullyLabeledReplicas,omitempty" protobuf:"varint,2,opt,name=fullyLabeledReplicas"`

	// The number of ready replicas for this replica set.
	// +optional
	ReadyReplicas int32 `json:"readyReplicas,omitempty" protobuf:"varint,4,opt,name=readyReplicas"`

	// The number of available replicas (ready for at least minReadySeconds) for this replica set.
	// +optional
	AvailableReplicas int32 `json:"availableReplicas,omitempty" protobuf:"varint,5,opt,name=availableReplicas"`
	// ... remaining fields are omitted in this excerpt.
}
  • Replicas 是最近观察到的副本数量
  • FullyLabeledReplicas labels与ReplicaSet中podTemplate labels相匹配的 pod 数量
  • ReadyReplicas 这个ReplicaSet中已经ready的副本数
  • AvailableReplicas 这个ReplicaSet中可用的副本数(ready,且至少持续了minReadySeconds)

这里的 ready 指的是 Pod Condition 中类型为 Ready 的条件。Pod Condition 一共有四种类型:

// These are valid conditions of pod.
const (
	// ContainersReady indicates whether all containers in the pod are ready.
	ContainersReady PodConditionType = "ContainersReady"
	// PodInitialized means that all init containers in the pod have started successfully.
	PodInitialized PodConditionType = "Initialized"
	// PodReady means the pod is able to service requests and should be added to the
	// load balancing pools of all matching services.
	PodReady PodConditionType = "Ready"
	// PodScheduled represents status of the scheduling process for this pod.
	PodScheduled PodConditionType = "PodScheduled"

到了最后一步,在 MinReadySeconds 之后重新同步 ReplicaSet,作为防止时钟偏差(clock-skew)的最后一道防线。

	// Resync the ReplicaSet after MinReadySeconds as a last line of defense to guard against clock-skew.
	if manageReplicasErr == nil && updatedRS.Spec.MinReadySeconds > 0 &&
		updatedRS.Status.ReadyReplicas == *(updatedRS.Spec.Replicas) &&
		updatedRS.Status.AvailableReplicas != *(updatedRS.Spec.Replicas) {
		rsc.queue.AddAfter(key, time.Duration(updatedRS.Spec.MinReadySeconds)*time.Second)



  1. 通过labels认领Pods,并过滤
  2. 用过滤后的Pods和ReplicaSet的副本数,来管理当前实际运行的Pods,多了缩,少了扩
  3. 用过滤后的Pods来更新一次ReplicaSet的状态
  4. 在 MinReadySeconds 之后重新入队同步一次,以防止时钟偏差带来的问题
