Kafka Server-Side Source Code Analysis: ReplicaManager

ReplicaManager

ReplicaManager is responsible for managing and operating the replicas hosted on a Broker in the cluster, and it also takes on part of the partition management work.

class ReplicaManager(val config: KafkaConfig, // Kafka configuration
                     metrics: Metrics, // metrics
                     time: Time, // time (clock) utility
                     val zkClient: KafkaZkClient, // ZooKeeper client
                     scheduler: Scheduler, // Kafka scheduler
                     val logManager: LogManager, // log manager
                     val isShuttingDown: AtomicBoolean, // whether the broker is shutting down
                     quotaManagers: QuotaManagers, // quota managers
                     val brokerTopicStats: BrokerTopicStats, // broker/topic metrics
                     val metadataCache: MetadataCache,  // broker metadata cache
                     logDirFailureChannel: LogDirFailureChannel,
                     // purgatory for delayed PRODUCE requests
                     val delayedProducePurgatory: DelayedOperationPurgatory[DelayedProduce],
                     // purgatory for delayed FETCH requests
                     val delayedFetchPurgatory: DelayedOperationPurgatory[DelayedFetch],
                     // purgatory for delayed DELETE_RECORDS requests
                     val delayedDeleteRecordsPurgatory: DelayedOperationPurgatory[DelayedDeleteRecords],
                     // purgatory for delayed ELECT_LEADERS requests
                     val delayedElectPreferredLeaderPurgatory: DelayedOperationPurgatory[DelayedElectPreferredLeader],
                     threadNamePrefix: Option[String]) extends Logging with KafkaMetricsGroup {
   ...
   // used to fence off requests sent by a stale Controller
  @volatile var controllerEpoch: Int = KafkaController.InitialControllerEpoch
  // responsible for fetching data from leaders on behalf of the follower replicas on this broker
  val replicaFetcherManager = createReplicaFetcherManager(metrics, time, threadNamePrefix, quotaManagers.follower)
   ...
}

ReplicaManager implements three main functions:

  1. Replica reads and writes
  2. Partition and replica management
  3. ISR management

1. Replica reads and writes

appendRecords is responsible for writing messages to replicas. The flow is as follows:

def appendRecords(timeout: Long,  // request processing timeout
                    requiredAcks: Short, // the acks setting of the request
                    internalTopicsAllowed: Boolean, // whether writes to internal topics are allowed
                    isFromClient: Boolean, // whether the writer is a client; it may also be another broker
                    entriesPerPartition: Map[TopicPartition, MemoryRecords], // the messages to be written
                    responseCallback: Map[TopicPartition, PartitionResponse] => Unit, // response callback
                    delayedProduceLock: Option[Lock] = None,
                    recordConversionStatsCallback: Map[TopicPartition, RecordConversionStats] => Unit = _ => ()) {
    // the only valid values of requiredAcks are -1, 0 and 1; anything else is rejected
    if (isValidRequiredAcks(requiredAcks)) {
      val sTime = time.milliseconds
      // append the message sets to the local log via appendToLocalLog
      val localProduceResults = appendToLocalLog(internalTopicsAllowed = internalTopicsAllowed,
        isFromClient = isFromClient, entriesPerPartition, requiredAcks)
      debug("Produce to local log in %d ms".format(time.milliseconds - sTime))

      val produceStatus = localProduceResults.map { case (topicPartition, result) =>
        topicPartition ->
                ProducePartitionStatus(
                  result.info.lastOffset + 1, // the offset of the next message to be written
                  // build a PartitionResponse to wrap the append result
                  new PartitionResponse(result.error, result.info.firstOffset.getOrElse(-1), result.info.logAppendTime, result.info.logStartOffset)) // response status
      }

      recordConversionStatsCallback(localProduceResults.mapValues(_.info.recordConversionStats))
      // the request needs to wait for the other replicas to finish the write
      if (delayedProduceRequestRequired(requiredAcks, entriesPerPartition, localProduceResults)) {
        val produceMetadata = ProduceMetadata(requiredAcks, produceStatus)
        // create a DelayedProduce delayed-operation object
        val delayedProduce = new DelayedProduce(timeout, produceMetadata, this, responseCallback, delayedProduceLock)
        val producerRequestKeys = entriesPerPartition.keys.map(new TopicPartitionOperationKey(_)).toSeq
        delayedProducePurgatory.tryCompleteElseWatch(delayedProduce, producerRequestKeys)

      } else {
        // no need to wait for other replicas; the response can be sent immediately
        val produceResponseStatus = produceStatus.mapValues(status => status.responseStatus)
        // invoke the callback and return
        responseCallback(produceResponseStatus)
      }
    } else {
      val responseStatus = entriesPerPartition.map { case (topicPartition, _) =>
        topicPartition -> new PartitionResponse(Errors.INVALID_REQUIRED_ACKS,
          LogAppendInfo.UnknownLogAppendInfo.firstOffset.getOrElse(-1), RecordBatch.NO_TIMESTAMP, LogAppendInfo.UnknownLogAppendInfo.logStartOffset)
      }
      responseCallback(responseStatus)
    }
  }
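
For reference, whether a DelayedProduce is needed is determined by the delayedProduceRequestRequired helper in the same class. A simplified sketch of that check (details may differ slightly across Kafka versions): the request must use acks = -1, carry at least one partition, and at least one partition must have been appended locally without error; otherwise the response (possibly all errors) can be sent back immediately.

// Simplified sketch of ReplicaManager.delayedProduceRequestRequired
private def delayedProduceRequestRequired(requiredAcks: Short,
                                          entriesPerPartition: Map[TopicPartition, MemoryRecords],
                                          localProduceResults: Map[TopicPartition, LogAppendResult]): Boolean = {
  // create a DelayedProduce only when acks = -1, there is data to write,
  // and at least one partition succeeded locally
  requiredAcks == -1 &&
  entriesPerPartition.nonEmpty &&
  localProduceResults.values.count(_.exception.isDefined) < entriesPerPartition.size
}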

fetchMessages is responsible for reading data from replicas. The flow is as follows:

def fetchMessages(timeout: Long,  // request timeout
                    replicaId: Int, // replica id
                    fetchMinBytes: Int, // minimum number of bytes to fetch
                    fetchMaxBytes: Int, // maximum number of bytes to fetch
                    hardMaxBytesLimit: Boolean,
                    fetchInfos: Seq[(TopicPartition, PartitionData)], // the partitions to read and the read positions
                    quota: ReplicaQuota = UnboundedQuota,
                    // response callback
                    responseCallback: Seq[(TopicPartition, FetchPartitionData)] => Unit,
                    isolationLevel: IsolationLevel) {
    // determine whether the fetch request comes from a follower replica or from a consumer
    val isFromFollower = Request.isValidBrokerId(replicaId)
    val fetchOnlyFromLeader = replicaId != Request.DebuggingConsumerId && replicaId != Request.FutureLocalReplicaId
    // determine how far the requester may read, based on who sent the request:
    // - an ordinary consumer can read up to the high watermark
    // - a consumer configured with READ_COMMITTED can read up to the Log Stable Offset
    // - a follower replica can read up to the LEO
    val fetchIsolation = if (isFromFollower || replicaId == Request.FutureLocalReplicaId)
      FetchLogEnd
    else if (isolationLevel == IsolationLevel.READ_COMMITTED)
      FetchTxnCommitted
    else
      FetchHighWatermark


    def readFromLog(): Seq[(TopicPartition, LogReadResult)] = {
      val result = readFromLocalLog(
        replicaId = replicaId,
        fetchOnlyFromLeader = fetchOnlyFromLeader,
        fetchIsolation = fetchIsolation,
        fetchMaxBytes = fetchMaxBytes,
        hardMaxBytesLimit = hardMaxBytesLimit,
        readPartitionInfo = fetchInfos,
        quota = quota)
      if (isFromFollower) updateFollowerLogReadResults(replicaId, result)
      else result
    }
    // read the log data from the local log
    val logReadResults = readFromLog()

    // check if this fetch request can be satisfied right away
    var bytesReadable: Long = 0
    var errorReadingData = false
    val logReadResultMap = new mutable.HashMap[TopicPartition, LogReadResult]
    // add up the total number of readable bytes
    logReadResults.foreach { case (topicPartition, logReadResult) =>
      if (logReadResult.error != Errors.NONE)
        errorReadingData = true
      bytesReadable = bytesReadable + logReadResult.info.records.sizeInBytes
      logReadResultMap.put(topicPartition, logReadResult)
    }
    // decide whether the response can be returned immediately; any one of the following 4 conditions is enough:
    // 1. the request did not set a timeout, meaning the requester wants a response right away
    // 2. the request does not ask for any partition data
    // 3. enough data has already been accumulated
    // 4. an error occurred while reading the data
    if (timeout <= 0 || fetchInfos.isEmpty || bytesReadable >= fetchMinBytes || errorReadingData) {
      // build the result
      val fetchPartitionData = logReadResults.map { case (tp, result) =>
        tp -> FetchPartitionData(result.error, result.highWatermark, result.leaderLogStartOffset, result.info.records,
          result.lastStableOffset, result.info.abortedTransactions)
      }
      // invoke the callback
      responseCallback(fetchPartitionData)
    } else {
      // the request cannot be completed immediately
      val fetchPartitionStatus = new mutable.ArrayBuffer[(TopicPartition, FetchPartitionStatus)]
      fetchInfos.foreach { case (topicPartition, partitionData) =>
        logReadResultMap.get(topicPartition).foreach(logReadResult => {
          val logOffsetMetadata = logReadResult.info.fetchOffsetMetadata
          fetchPartitionStatus += (topicPartition -> FetchPartitionStatus(logOffsetMetadata, partitionData))
        })
      }
      val fetchMetadata = FetchMetadata(fetchMinBytes, fetchMaxBytes, hardMaxBytesLimit, fetchOnlyFromLeader,
        fetchIsolation, isFromFollower, replicaId, fetchPartitionStatus)
      // create a DelayedFetch delayed-operation object
      val delayedFetch = new DelayedFetch(timeout, fetchMetadata, this, quota, responseCallback)
      val delayedFetchKeys = fetchPartitionStatus.map { case (tp, _) => new TopicPartitionOperationKey(tp) }
      // try once more to complete the request; if it still cannot be completed, hand it to the purgatory
      delayedFetchPurgatory.tryCompleteElseWatch(delayedFetch, delayedFetchKeys)
    }
  }
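
To relate these parameters to the client side, here is a minimal consumer sketch (assuming the standard kafka-clients API; the broker address, group id and topic name are placeholders). Setting isolation.level=read_committed makes the broker take the FetchTxnCommitted branch above, while fetch.min.bytes and fetch.max.wait.ms become fetchMinBytes and the DelayedFetch timeout respectively.

import java.util.{Collections, Properties}
import org.apache.kafka.clients.consumer.{ConsumerConfig, KafkaConsumer}

val props = new Properties()
props.put(ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG, "localhost:9092")  // placeholder address
props.put(ConsumerConfig.GROUP_ID_CONFIG, "demo-group")               // placeholder group id
props.put(ConsumerConfig.ISOLATION_LEVEL_CONFIG, "read_committed")    // broker reads up to the Log Stable Offset
props.put(ConsumerConfig.FETCH_MIN_BYTES_CONFIG, "1024")              // becomes fetchMinBytes on the broker
props.put(ConsumerConfig.FETCH_MAX_WAIT_MS_CONFIG, "500")             // becomes the DelayedFetch timeout
props.put(ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG,
  "org.apache.kafka.common.serialization.StringDeserializer")
props.put(ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG,
  "org.apache.kafka.common.serialization.StringDeserializer")

val consumer = new KafkaConsumer[String, String](props)
consumer.subscribe(Collections.singletonList("test-topic"))           // placeholder topic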

2. Partition and replica management
Partition changes are driven by the Controller sending LeaderAndIsrRequest requests to Brokers. When a Broker receives such a request, it calls the replica manager's becomeLeaderOrFollower method, which executes the "become leader replica" and "become follower replica" logic in turn, switching the roles of the local replicas of the affected partitions accordingly. The flow is as follows:


The becomeLeaderOrFollower method

def becomeLeaderOrFollower(correlationId: Int,
                             leaderAndIsrRequest: LeaderAndIsrRequest,
                             onLeadershipChange: (Iterable[Partition], Iterable[Partition]) => Unit): LeaderAndIsrResponse = {
    leaderAndIsrRequest.partitionStates.asScala.foreach { case (topicPartition, stateInfo) =>
      stateChangeLogger.trace(s"Received LeaderAndIsr request $stateInfo " +
        s"correlation id $correlationId from controller ${leaderAndIsrRequest.controllerId} " +
        s"epoch ${leaderAndIsrRequest.controllerEpoch} for partition $topicPartition")
    }
    replicaStateChangeLock synchronized {
      // if the controller epoch carried by the LeaderAndIsrRequest
      // is smaller than the broker's current controller epoch
      if (leaderAndIsrRequest.controllerEpoch < controllerEpoch) {
        stateChangeLogger.warn(s"Ignoring LeaderAndIsr request from controller ${leaderAndIsrRequest.controllerId} with " +
          s"correlation id $correlationId since its controller epoch ${leaderAndIsrRequest.controllerEpoch} is old. " +
          s"Latest known controller epoch is $controllerEpoch")
        // the Controller has changed; return a STALE_CONTROLLER_EPOCH error response
        leaderAndIsrRequest.getErrorResponse(0, Errors.STALE_CONTROLLER_EPOCH.exception)
      } else {
        val responseMap = new mutable.HashMap[TopicPartition, Errors]
        val controllerId = leaderAndIsrRequest.controllerId
        // update the locally cached controller epoch
        controllerEpoch = leaderAndIsrRequest.controllerEpoch

        // First check partition's leader epoch
        val partitionState = new mutable.HashMap[Partition, LeaderAndIsrRequest.PartitionState]()
        val newPartitions = new mutable.HashSet[Partition]
        // process the topic partitions in the leaderAndIsrRequest one by one
        leaderAndIsrRequest.partitionStates.asScala.foreach { case (topicPartition, stateInfo) =>
          val partition = getPartition(topicPartition).getOrElse {
            val createdPartition = getOrCreatePartition(topicPartition)
            newPartitions.add(createdPartition)
            createdPartition
          }
          val currentLeaderEpoch = partition.getLeaderEpoch
          val requestLeaderEpoch = stateInfo.basePartitionState.leaderEpoch
          // if the partition is in the Offline state
          if (partition eq ReplicaManager.OfflinePartition) {
            stateChangeLogger.warn(s"Ignoring LeaderAndIsr request from " +
              s"controller $controllerId with correlation id $correlationId " +
              s"epoch $controllerEpoch for partition $topicPartition as the local replica for the " +
              "partition is in an offline log directory")
            responseMap.put(topicPartition, Errors.KAFKA_STORAGE_ERROR)
          } else if (requestLeaderEpoch > currentLeaderEpoch) {
            if(stateInfo.basePartitionState.replicas.contains(localBrokerId))
              partitionState.put(partition, stateInfo)
            else {
              stateChangeLogger.warn(s"Ignoring LeaderAndIsr request from controller $controllerId with " +
                s"correlation id $correlationId epoch $controllerEpoch for partition $topicPartition as itself is not " +
                s"in assigned replica list ${stateInfo.basePartitionState.replicas.asScala.mkString(",")}")
              responseMap.put(topicPartition, Errors.UNKNOWN_TOPIC_OR_PARTITION)
            }
          } else {      
            stateChangeLogger.warn(s"Ignoring LeaderAndIsr request from " +
              s"controller $controllerId with correlation id $correlationId " +
              s"epoch $controllerEpoch for partition $topicPartition since its associated " +
              s"leader epoch $requestLeaderEpoch is not higher than the current " +
              s"leader epoch $currentLeaderEpoch")
            responseMap.put(topicPartition, Errors.STALE_CONTROLLER_EPOCH)
          }
        }

        // determine the partitions for which the local replica becomes the leader
        val partitionsTobeLeader = partitionState.filter { case (_, stateInfo) =>
          stateInfo.basePartitionState.leader == localBrokerId
        }
        // determine the partitions for which the local replica becomes a follower
        val partitionsToBeFollower = partitionState -- partitionsTobeLeader.keys

        // call makeLeaders to execute the "become leader replica" logic
        // for all partitions in partitionsTobeLeader
        val partitionsBecomeLeader = if (partitionsTobeLeader.nonEmpty)
          makeLeaders(controllerId, controllerEpoch, partitionsTobeLeader, correlationId, responseMap)
        else
          Set.empty[Partition]
        // call makeFollowers to execute the "become follower replica" logic
        // for all partitions in partitionsToBeFollower
        val partitionsBecomeFollower = if (partitionsToBeFollower.nonEmpty)
          makeFollowers(controllerId, controllerEpoch, partitionsToBeFollower, correlationId, responseMap)
        else
          Set.empty[Partition]

        leaderAndIsrRequest.partitionStates.asScala.keys.foreach { topicPartition =>
          if (localReplica(topicPartition).isEmpty && (allPartitions.get(topicPartition) ne ReplicaManager.OfflinePartition))
            allPartitions.put(topicPartition, ReplicaManager.OfflinePartition)
        }

        if (!hwThreadInitialized) {
          startHighWaterMarksCheckPointThread()
          hwThreadInitialized = true
        }

        val futureReplicasAndInitialOffset = new mutable.HashMap[TopicPartition, InitialFetchState]
        for (partition <- newPartitions) {
          val topicPartition = partition.topicPartition
          if (logManager.getLog(topicPartition, isFuture = true).isDefined) {
            partition.localReplica.foreach { replica =>
              val leader = BrokerEndPoint(config.brokerId, "localhost", -1)

              // Add future replica to partition's map
              partition.getOrCreateReplica(Request.FutureLocalReplicaId, isNew = false)

              // pause cleaning for partitions that are being moved and start ReplicaAlterDirThread to move
              // replica from source dir to destination dir
              logManager.abortAndPauseCleaning(topicPartition)

              futureReplicasAndInitialOffset.put(topicPartition, InitialFetchState(leader,
                partition.getLeaderEpoch, replica.highWatermark.messageOffset))
            }
          }
        }
        replicaAlterLogDirsManager.addFetcherForPartitions(futureReplicasAndInitialOffset)

        replicaFetcherManager.shutdownIdleFetcherThreads()
        replicaAlterLogDirsManager.shutdownIdleFetcherThreads()
        onLeadershipChange(partitionsBecomeLeader, partitionsBecomeFollower)
        new LeaderAndIsrResponse(Errors.NONE, responseMap.asJava)
      }
    }
  }
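
The onLeadershipChange callback passed into becomeLeaderOrFollower is built in KafkaApis. A simplified sketch of KafkaApis.handleLeaderAndIsrRequest (identifiers follow the 2.x source; details vary across versions): it forwards leadership changes of the internal __consumer_offsets and __transaction_state partitions to the group and transaction coordinators, and delegates the rest of the work to the ReplicaManager.

// Simplified sketch of KafkaApis.handleLeaderAndIsrRequest
def handleLeaderAndIsrRequest(request: RequestChannel.Request) {
  val correlationId = request.header.correlationId
  val leaderAndIsrRequest = request.body[LeaderAndIsrRequest]

  // migrate coordinator state when leadership of the internal topics changes
  def onLeadershipChange(updatedLeaders: Iterable[Partition], updatedFollowers: Iterable[Partition]) {
    updatedLeaders.foreach { partition =>
      if (partition.topic == GROUP_METADATA_TOPIC_NAME)
        groupCoordinator.handleGroupImmigration(partition.partitionId)
      else if (partition.topic == TRANSACTION_STATE_TOPIC_NAME)
        txnCoordinator.handleTxnImmigration(partition.partitionId, partition.getLeaderEpoch)
    }
    updatedFollowers.foreach { partition =>
      if (partition.topic == GROUP_METADATA_TOPIC_NAME)
        groupCoordinator.handleGroupEmigration(partition.partitionId)
      else if (partition.topic == TRANSACTION_STATE_TOPIC_NAME)
        txnCoordinator.handleTxnEmigration(partition.partitionId, partition.getLeaderEpoch)
    }
  }

  authorizeClusterAction(request)
  // hand the request over to the ReplicaManager method analyzed above
  val response = replicaManager.becomeLeaderOrFollower(correlationId, leaderAndIsrRequest, onLeadershipChange)
  sendResponseExemptThrottle(request, response)
}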

The makeLeaders method

private def makeLeaders(controllerId: Int, // broker id of the controller
                          epoch: Int, // controller epoch
                          partitionState: Map[Partition, LeaderAndIsrRequest.PartitionState], // partition info carried in the LeaderAndIsrRequest
                          correlationId: Int, // correlation id of the request, used only for debug logging
                          // error results keyed by topic partition
                          responseMap: mutable.Map[TopicPartition, Errors]): Set[Partition] = {
    partitionState.keys.foreach { partition =>
      stateChangeLogger.trace(s"Handling LeaderAndIsr request correlationId $correlationId from " +
        s"controller $controllerId epoch $epoch starting the become-leader transition for " +
        s"partition ${partition.topicPartition}")
    }
    // initialize responseMap with Errors.NONE
    for (partition <- partitionState.keys)
      responseMap.put(partition.topicPartition, Errors.NONE)

    val partitionsToMakeLeaders = mutable.Set[Partition]()

    try {
      // First stop fetchers for all the partitions
      // stop the fetcher threads for these partitions
      replicaFetcherManager.removeFetcherForPartitions(partitionState.keySet.map(_.topicPartition))
      // Update the partition information to be the leader
      partitionState.foreach{ case (partition, partitionStateInfo) =>
        try {
          // call makeLeader() on each partition so that the local replica becomes its leader
          if (partition.makeLeader(controllerId, partitionStateInfo, correlationId)) {
            partitionsToMakeLeaders += partition
            stateChangeLogger.trace(s"Stopped fetchers as part of become-leader request from " +
              s"controller $controllerId epoch $epoch with correlation id $correlationId for partition ${partition.topicPartition} " +
              s"(last update controller epoch ${partitionStateInfo.basePartitionState.controllerEpoch})")
          } else
            stateChangeLogger.info(s"Skipped the become-leader state change after marking its " +
              s"partition as leader with correlation id $correlationId from controller $controllerId epoch $epoch for " +
              s"partition ${partition.topicPartition} (last update controller epoch ${partitionStateInfo.basePartitionState.controllerEpoch}) " +
              s"since it is already the leader for the partition.")
        } catch {
          case e: KafkaStorageException =>
            stateChangeLogger.error(s"Skipped the become-leader state change with " +
              s"correlation id $correlationId from controller $controllerId epoch $epoch for partition ${partition.topicPartition} " +
              s"(last update controller epoch ${partitionStateInfo.basePartitionState.controllerEpoch}) since " +
              s"the replica for the partition is offline due to disk error $e")
            val dirOpt = getLogDir(partition.topicPartition)
            error(s"Error while making broker the leader for partition $partition in dir $dirOpt", e)
            responseMap.put(partition.topicPartition, Errors.KAFKA_STORAGE_ERROR)
        }
      }

    } catch {
      case e: Throwable =>
        partitionState.keys.foreach { partition =>
          stateChangeLogger.error(s"Error while processing LeaderAndIsr request correlationId $correlationId received " +
            s"from controller $controllerId epoch $epoch for partition ${partition.topicPartition}", e)
        }
        // Re-throw the exception for it to be caught in KafkaApis
        throw e
    }

    partitionState.keys.foreach { partition =>
      stateChangeLogger.trace(s"Completed LeaderAndIsr request correlationId $correlationId from controller $controllerId " +
        s"epoch $epoch for the become-leader transition for partition ${partition.topicPartition}")
    }

    partitionsToMakeLeaders
  }

The makeFollowers method

private def makeFollowers(controllerId: Int, // broker id of the controller
                            epoch: Int, // controller epoch
                            partitionStates: Map[Partition, LeaderAndIsrRequest.PartitionState], // detailed info of all partitions for which this broker becomes a follower
                            correlationId: Int, // correlation field linking request and response
                            responseMap: mutable.Map[TopicPartition, Errors] // holds the processing result of the LeaderAndIsrRequest
                           ) : Set[Partition] = {
    partitionStates.foreach { case (partition, partitionState) =>
      stateChangeLogger.trace(s"Handling LeaderAndIsr request correlationId $correlationId from controller $controllerId " +
        s"epoch $epoch starting the become-follower transition for partition ${partition.topicPartition} with leader " +
        s"${partitionState.basePartitionState.leader}")
    }
    // initialize the processing result of every partition to Errors.NONE
    for (partition <- partitionStates.keys)
      responseMap.put(partition.topicPartition, Errors.NONE)

    val partitionsToMakeFollower: mutable.Set[Partition] = mutable.Set()

    try {
      // iterate over all partitions in partitionStates
      partitionStates.foreach { case (partition, partitionStateInfo) =>
        val newLeaderBrokerId = partitionStateInfo.basePartitionState.leader
        try {
          // look up the new leader Broker in the metadata cache
          metadataCache.getAliveBrokers.find(_.id == newLeaderBrokerId) match {
            case Some(_) =>
              // when the new leader is available, partition.makeFollower() sets the leader and clears the locally cached ISR
              if (partition.makeFollower(controllerId, partitionStateInfo, correlationId))
                partitionsToMakeFollower += partition
              else
                stateChangeLogger.info(s"Skipped the become-follower state change after marking its partition as " +
                  s"follower with correlation id $correlationId from controller $controllerId epoch $epoch " +
                  s"for partition ${partition.topicPartition} (last update " +
                  s"controller epoch ${partitionStateInfo.basePartitionState.controllerEpoch}) " +
                  s"since the new leader $newLeaderBrokerId is the same as the old leader")
            case None =>
              stateChangeLogger.error(s"Received LeaderAndIsrRequest with correlation id $correlationId from " +
                s"controller $controllerId epoch $epoch for partition ${partition.topicPartition} " +
                s"(last update controller epoch ${partitionStateInfo.basePartitionState.controllerEpoch}) " +
                s"but cannot become follower since the new leader $newLeaderBrokerId is unavailable.")
              // still create the log object for the partition's follower replica
              partition.getOrCreateReplica(localBrokerId, isNew = partitionStateInfo.isNew)
          }
        } catch {
          case e: KafkaStorageException =>
            stateChangeLogger.error(s"Skipped the become-follower state change with correlation id $correlationId from " +
              s"controller $controllerId epoch $epoch for partition ${partition.topicPartition} " +
              s"(last update controller epoch ${partitionStateInfo.basePartitionState.controllerEpoch}) with leader " +
              s"$newLeaderBrokerId since the replica for the partition is offline due to disk error $e")
            val dirOpt = getLogDir(partition.topicPartition)
            error(s"Error while making broker the follower for partition $partition with leader " +
              s"$newLeaderBrokerId in dir $dirOpt", e)
            responseMap.put(partition.topicPartition, Errors.KAFKA_STORAGE_ERROR)
        }
      }
      // remove the existing fetcher threads for these partitions
      replicaFetcherManager.removeFetcherForPartitions(partitionsToMakeFollower.map(_.topicPartition))
      partitionsToMakeFollower.foreach { partition =>
        stateChangeLogger.trace(s"Stopped fetchers as part of become-follower request from controller $controllerId " +
          s"epoch $epoch with correlation id $correlationId for partition ${partition.topicPartition} with leader " +
          s"${partitionStates(partition).basePartitionState.leader}")
      }

      // try to complete pending delayed produce and fetch requests
      partitionsToMakeFollower.foreach { partition =>
        val topicPartitionOperationKey = new TopicPartitionOperationKey(partition.topicPartition)
        tryCompleteDelayedProduce(topicPartitionOperationKey)
        tryCompleteDelayedFetch(topicPartitionOperationKey)
      }

      partitionsToMakeFollower.foreach { partition =>
        stateChangeLogger.trace(s"Truncated logs and checkpointed recovery boundaries for partition " +
          s"${partition.topicPartition} as part of become-follower request with correlation id $correlationId from " +
          s"controller $controllerId epoch $epoch with leader ${partitionStates(partition).basePartitionState.leader}")
      }

      if (isShuttingDown.get()) {
        partitionsToMakeFollower.foreach { partition =>
          stateChangeLogger.trace(s"Skipped the adding-fetcher step of the become-follower state " +
            s"change with correlation id $correlationId from controller $controllerId epoch $epoch for " +
            s"partition ${partition.topicPartition} with leader ${partitionStates(partition).basePartitionState.leader} " +
            "since it is shutting down")
        }
      }
      else {
        // for the partitions whose local replica becomes a follower,
        // determine the leader Broker and the initial fetch offset (fetchOffset)
        // we do not need to check if the leader exists again since this has been done at the beginning of this process
        val partitionsToMakeFollowerWithLeaderAndOffset = partitionsToMakeFollower.map { partition =>
          val leader = metadataCache.getAliveBrokers.find(_.id == partition.leaderReplicaIdOpt.get).get
            .brokerEndPoint(config.interBrokerListenerName)
          val fetchOffset = partition.localReplicaOrException.highWatermark.messageOffset
          partition.topicPartition -> InitialFetchState(leader, partition.getLeaderEpoch, fetchOffset)
        }.toMap
        // add new fetcher threads using the leader Broker and fetchOffset determined in the previous step
        replicaFetcherManager.addFetcherForPartitions(partitionsToMakeFollowerWithLeaderAndOffset)

        partitionsToMakeFollower.foreach { partition =>
          stateChangeLogger.trace(s"Started fetcher to new leader as part of become-follower " +
            s"request from controller $controllerId epoch $epoch with correlation id $correlationId for " +
            s"partition ${partition.topicPartition} with leader ${partitionStates(partition).basePartitionState.leader}")
        }
      }
    } catch {
      case e: Throwable =>
        stateChangeLogger.error(s"Error while processing LeaderAndIsr request with correlationId $correlationId " +
          s"received from controller $controllerId epoch $epoch", e)
        throw e
    }

    partitionStates.keys.foreach { partition =>
      stateChangeLogger.trace(s"Completed LeaderAndIsr request correlationId $correlationId from controller $controllerId " +
        s"epoch $epoch for the become-follower transition for partition ${partition.topicPartition} with leader " +
        s"${partitionStates(partition).basePartitionState.leader}")
    }

    partitionsToMakeFollower
  }

3. ISR management
Besides the two functions above, the replica manager has one more important job: managing the ISR.
This is reflected mainly in two methods:

  • maybeShrinkIsr: periodically checks whether the set of replicas in the ISR needs to shrink.
  • maybePropagateIsrChanges: periodically propagates ISR changes to the Brokers in the cluster.

The ReplicaManager startup method schedules two periodic tasks, isr-expiration and isr-change-propagation:

def startup() {
    scheduler.schedule("isr-expiration", maybeShrinkIsr _, period = config.replicaLagTimeMaxMs / 2, unit = TimeUnit.MILLISECONDS)
    scheduler.schedule("isr-change-propagation", maybePropagateIsrChanges _, period = 2500L, unit = TimeUnit.MILLISECONDS)
    ...
}
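
The isr-expiration task therefore runs at half of replica.lag.time.max.ms. A broker configuration sketch for reference (10 seconds is the default in the version analyzed here; newer releases raised the default to 30 seconds):

# server.properties (sketch)
# A follower that has not caught up with the leader for longer than this is removed from the ISR.
replica.lag.time.max.ms=10000
# With this value, ReplicaManager schedules the "isr-expiration" check every 10000 / 2 = 5000 ms.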

ReplicaManager.maybeShrinkIsr calls the Partition class's maybeShrinkIsr method to shrink the ISR:

private def maybeShrinkIsr(): Unit = {
    trace("Evaluating ISR list of partitions to see which replicas can be removed from the ISR")

    // Shrink ISRs for non offline partitions
    allPartitions.keys.foreach { topicPartition =>
      nonOfflinePartition(topicPartition).foreach(_.maybeShrinkIsr(config.replicaLagTimeMaxMs))
    }
  }

The Partition class's maybeShrinkIsr method calls getOutOfSyncReplicas to obtain the replicas that need to be removed from the ISR.

def maybeShrinkIsr(replicaMaxLagTimeMs: Long) {
    val leaderHWIncremented = inWriteLock(leaderIsrUpdateLock) {
      leaderReplicaIfLocal match {
        case Some(leaderReplica) =>
          // get the replicas that need to be removed from the ISR
          val outOfSyncReplicas = getOutOfSyncReplicas(leaderReplica, replicaMaxLagTimeMs)
          if (outOfSyncReplicas.nonEmpty) {
            // remove these replicas from the ISR
            val newInSyncReplicas = inSyncReplicas -- outOfSyncReplicas
            assert(newInSyncReplicas.nonEmpty)
            info("Shrinking ISR from %s to %s. Leader: (highWatermark: %d, endOffset: %d). Out of sync replicas: %s."
              .format(inSyncReplicas.map(_.brokerId).mkString(","),
                newInSyncReplicas.map(_.brokerId).mkString(","),
                leaderReplica.highWatermark.messageOffset,
                leaderReplica.logEndOffset,
                outOfSyncReplicas.map { replica =>
                  s"(brokerId: ${replica.brokerId}, endOffset: ${replica.logEndOffset})"
                }.mkString(" ")
              )
            )
            // update the cached ISR list
            updateIsr(newInSyncReplicas)
            replicaManager.isrShrinkRate.mark()
            maybeIncrementLeaderHW(leaderReplica)
          } else {
            false
          }

        case None => false // do nothing if no longer leader
      }
    }
    
    if (leaderHWIncremented)
      tryCompleteDelayedRequests()
  }
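
On success, updateIsr persists the new ISR to the partition's state znode in ZooKeeper and then registers the change with the ReplicaManager so that it can later be propagated to the Controller. The recording method is small; a reference sketch (simplified, version-dependent):

// Sketch of ReplicaManager.recordIsrChange, called from Partition.updateIsr
def recordIsrChange(topicPartition: TopicPartition) {
  isrChangeSet synchronized {
    // remember which partition changed and when, for maybePropagateIsrChanges below
    isrChangeSet += topicPartition
    lastIsrChangeMs.set(System.currentTimeMillis())
  }
}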

getOutOfSyncReplicas considers a replica out of sync when its LEO differs from the leader's and the time since it last caught up (lastCaughtUpTimeMs) exceeds the replica.lag.time.max.ms setting.

def getOutOfSyncReplicas(leaderReplica: Replica, maxLagMs: Long): Set[Replica] = {
    val candidateReplicas = inSyncReplicas - leaderReplica
    // filter out lagging replicas: a replica is lagging if now minus its last caught-up time is greater than maxLagMs
    // a replica that has not caught up with the leader for too long is kicked out of the ISR
    val laggingReplicas = candidateReplicas.filter(r =>
      r.logEndOffset != leaderReplica.logEndOffset && (time.milliseconds - r.lastCaughtUpTimeMs) > maxLagMs)
    if (laggingReplicas.nonEmpty)
      debug("Lagging replicas are %s".format(laggingReplicas.map(_.brokerId).mkString(",")))

    laggingReplicas
  }
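
As a concrete illustration of the criterion (toy code, not Kafka source): with replica.lag.time.max.ms = 10000, a follower whose LEO is still behind the leader's and that last caught up 15 seconds ago is considered out of sync and dropped from the ISR.

// Toy illustration of the lag criterion used by getOutOfSyncReplicas
case class FollowerState(logEndOffset: Long, lastCaughtUpTimeMs: Long)

def isOutOfSync(f: FollowerState, leaderLeo: Long, nowMs: Long, maxLagMs: Long): Boolean =
  f.logEndOffset != leaderLeo && (nowMs - f.lastCaughtUpTimeMs) > maxLagMs

val follower = FollowerState(logEndOffset = 95L, lastCaughtUpTimeMs = 0L)
println(isOutOfSync(follower, leaderLeo = 100L, nowMs = 15000L, maxLagMs = 10000L)) // true -> shrink the ISR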

After the ISR shrinks, the ReplicaManager still needs to pass the result of this operation on to the other Brokers in the cluster so that they stay in sync. This is done by maybePropagateIsrChanges: it writes a notification znode under /isr_change_notification in ZooKeeper, which the Controller watches and then pushes updated metadata to the other Brokers.

def maybePropagateIsrChanges() {
    val now = System.currentTimeMillis()
    isrChangeSet synchronized {
      // ISR changes are propagated only when both of the following hold:
      // 1. there are ISR changes that have not yet been propagated
      // 2. no ISR change has happened in the last 5 seconds, or more than 60 seconds have passed since the last propagation
      if (isrChangeSet.nonEmpty &&
        (lastIsrChangeMs.get() + ReplicaManager.IsrChangePropagationBlackOut < now ||
          lastIsrPropagationMs.get() + ReplicaManager.IsrChangePropagationInterval < now)) {
        // create the corresponding znode in ZooKeeper
        zkClient.propagateIsrChanges(isrChangeSet)
        // clear the isrChangeSet collection
        isrChangeSet.clear()
        // update the timestamp of the last ISR propagation
        lastIsrPropagationMs.set(now)
      }
    }
  }
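
For reference, the two thresholds used above are constants defined on the ReplicaManager companion object (values as found in this version of the source):

object ReplicaManager {
  val IsrChangePropagationBlackOut = 5000L   // propagate only if no ISR change occurred in the last 5 seconds ...
  val IsrChangePropagationInterval = 60000L  // ... or if the last propagation was more than 60 seconds ago
}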