ReplicaManager manages the partition information within the scope of a single broker. Its main fields are as follows:
class ReplicaManager(val config: KafkaConfig,
metrics: Metrics,
time: Time,
jTime: JTime,
val zkUtils: ZkUtils,
//KafkaScheduler instance that runs three periodic tasks: highwatermark-checkpoint, isr-expiration, and isr-change-propagation
scheduler: Scheduler,
//LogManager instance; reads and writes on partitions are delegated to the underlying log storage subsystem
val logManager: LogManager,
val isShuttingDown: AtomicBoolean,
threadNamePrefix: Option[String] = None) extends Logging with KafkaMetricsGroup {
/* epoch of the controller that last changed the leader */
//Epoch of the KafkaController; incremented each time a new controller leader is elected. When ReplicaManager handles a request
//from the KafkaController, it first checks the epoch carried in the request against controllerEpoch, so that requests from a stale controller leader are rejected.
@volatile var controllerEpoch: Int = KafkaController.InitialControllerEpoch - 1
// ID of the local broker
private val localBrokerId = config.brokerId
//allPartitions holds all partitions assigned to the current broker. Note the pool's valueFactory: when a lookup misses, the valueFactory creates a default value, which is inserted into the Pool and returned.
private val allPartitions = new Pool[(String, Int), Partition](valueFactory = Some { case (t, p) =>
new Partition(t, p, time, this)
})
private val replicaStateChangeLock = new Object
// replicaFetcherManager manages multiple ReplicaFetcherThreads; each thread sends FetchRequests to the leader replica to pull messages, keeping follower replicas in sync with their leaders.
val replicaFetcherManager = new ReplicaFetcherManager(config, this, metrics, jTime, threadNamePrefix)
private val highWatermarkCheckPointThreadStarted = new AtomicBoolean(false)
// Maps each log directory to an OffsetCheckpoint that wraps the replication-offset-checkpoint file in that directory; the file records the HW of every partition. ReplicaManager's highwatermark-checkpoint task periodically rewrites it.
val highWatermarkCheckpoints = config.logDirs.map(dir => (new File(dir).getAbsolutePath, new OffsetCheckpoint(new File(dir, ReplicaManager.HighWatermarkFilename)))).toMap
private var hwThreadInitialized = false
this.logIdent = "[Replica Manager on Broker " + localBrokerId + "]: "
val stateChangeLogger = KafkaController.stateChangeLogger
// Partitions whose ISR set has changed
private val isrChangeSet: mutable.Set[TopicAndPartition] = new mutable.HashSet[TopicAndPartition]()
private val lastIsrChangeMs = new AtomicLong(System.currentTimeMillis())
private val lastIsrPropagationMs = new AtomicLong(System.currentTimeMillis())
// Purgatories used to manage DelayedProduce and DelayedFetch operations
val delayedProducePurgatory = DelayedOperationPurgatory[DelayedProduce](
purgatoryName = "Produce", config.brokerId, config.producerPurgatoryPurgeIntervalRequests)
val delayedFetchPurgatory = DelayedOperationPurgatory[DelayedFetch](
purgatoryName = "Fetch", config.brokerId, config.fetchPurgatoryPurgeIntervalRequests)
}
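The valueFactory mechanism of the Pool behind allPartitions is worth a closer look. Below is a minimal, self-contained sketch of that create-on-miss semantics; SimplePool is a hypothetical stand-in for kafka.utils.Pool (assuming Scala 2.12+ for the lambda-to-java.util.function conversion), not the real class:
import java.util.concurrent.ConcurrentHashMap

// Hypothetical, simplified stand-in for kafka.utils.Pool: on a lookup miss,
// getAndMaybePut uses the valueFactory to create a default value, inserts it
// into the pool, and returns it.
class SimplePool[K, V](valueFactory: Option[K => V] = None) {
  private val pool = new ConcurrentHashMap[K, V]()

  def getAndMaybePut(key: K): V = valueFactory match {
    case Some(factory) => pool.computeIfAbsent(key, k => factory(k)) // atomic create-on-miss
    case None => throw new IllegalStateException("Empty value factory in pool")
  }

  def get(key: K): Option[V] = Option(pool.get(key))
}

object PoolDemo extends App {
  // Mirrors allPartitions: a miss on (topic, partition) creates a placeholder value
  val partitions = new SimplePool[(String, Int), String](
    valueFactory = Some { case (t, p) => s"Partition($t-$p)" })
  println(partitions.get(("demo", 0)))            // None: nothing created yet
  println(partitions.getAndMaybePut(("demo", 0))) // Partition(demo-0), created on demand
}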
Replica Role Switching
In a Kafka cluster, one broker is elected as the KafkaController leader, which manages the entire cluster. Based on the state of each partition's leader and follower replicas, the controller leader sends a LeaderAndIsrRequest to the relevant brokers. This request drives replica role switching: it tells each broker which of its partition replicas should become leaders and which should become followers.
The main logic of ReplicaManager.becomeLeaderOrFollower is: obtain the specified Partition objects, classify them as to-be-leader or to-be-follower according to partitionStates, and then call makeLeaders and makeFollowers respectively, which in turn invoke the Partition.makeLeader and Partition.makeFollower methods introduced in the previous section.
def becomeLeaderOrFollower(correlationId: Int,
leaderAndISRRequest: LeaderAndIsrRequest,
metadataCache: MetadataCache,
onLeadershipChange: (Iterable[Partition], Iterable[Partition]) => Unit): BecomeLeaderOrFollowerResult = {
//Log each partition state carried in the request
leaderAndISRRequest.partitionStates.asScala.foreach { case (topicPartition, stateInfo) =>
stateChangeLogger.trace("Broker %d received LeaderAndIsr request %s correlation id %d from controller %d epoch %d for partition [%s,%d]"
.format(localBrokerId, stateInfo, correlationId,
leaderAndISRRequest.controllerId, leaderAndISRRequest.controllerEpoch, topicPartition.topic, topicPartition.partition))
}
replicaStateChangeLock synchronized {
//Collects the per-partition error codes for the response
val responseMap = new mutable.HashMap[TopicPartition, Short]
//Check the controller epoch: if the epoch carried in this LeaderAndIsrRequest is older than this broker's controllerEpoch, ignore the request
if (leaderAndISRRequest.controllerEpoch < controllerEpoch) {
leaderAndISRRequest.partitionStates.asScala.foreach { case (topicPartition, stateInfo) =>
stateChangeLogger.warn(("Broker %d ignoring LeaderAndIsr request from controller %d with correlation id %d since " +
"its controller epoch %d is old. Latest known controller epoch is %d").format(localBrokerId, leaderAndISRRequest.controllerId,
correlationId, leaderAndISRRequest.controllerEpoch, controllerEpoch))
}
BecomeLeaderOrFollowerResult(responseMap, Errors.STALE_CONTROLLER_EPOCH.code)
} else {
//Update controllerEpoch
val controllerId = leaderAndISRRequest.controllerId
controllerEpoch = leaderAndISRRequest.controllerEpoch
// First check partition's leader epoch
//Collect into partitionState the information needed to perform the switch
val partitionState = new mutable.HashMap[Partition, PartitionState]()
leaderAndISRRequest.partitionStates.asScala.foreach {
case (topicPartition, stateInfo) =>
//Get the Partition object from allPartitions, creating a new one if absent
val partition = getOrCreatePartition(topicPartition.topic, topicPartition.partition)
val partitionLeaderEpoch = partition.getLeaderEpoch()
// If the leader epoch is valid record the epoch of the controller that made the leadership decision.
// This is useful while updating the isr to maintain the decision maker controller's epoch in the zookeeper path
// Check the leader epoch
if (partitionLeaderEpoch < stateInfo.leaderEpoch) {
// Check whether a replica of this partition is assigned to the current broker
if(stateInfo.replicas.contains(config.brokerId))
//Keep the partition and its state, since it is relevant to this broker
partitionState.put(partition, stateInfo)
else {
stateChangeLogger.warn(("Broker %d ignoring LeaderAndIsr request from controller %d with correlation id %d " +
"epoch %d for partition [%s,%d] as itself is not in assigned replica list %s")
.format(localBrokerId, controllerId, correlationId, leaderAndISRRequest.controllerEpoch,
topicPartition.topic, topicPartition.partition, stateInfo.replicas.asScala.mkString(",")))
responseMap.put(topicPartition, Errors.UNKNOWN_TOPIC_OR_PARTITION.code)
}
} else {
// Otherwise record the error code in response
stateChangeLogger.warn(("Broker %d ignoring LeaderAndIsr request from controller %d with correlation id %d " +
"epoch %d for partition [%s,%d] since its associated leader epoch %d is old. Current leader epoch is %d")
.format(localBrokerId, controllerId, correlationId, leaderAndISRRequest.controllerEpoch,
topicPartition.topic, topicPartition.partition, stateInfo.leaderEpoch, partitionLeaderEpoch))
responseMap.put(topicPartition, Errors.STALE_CONTROLLER_EPOCH.code)
}
}
//Classify partitions by the role specified in partitionStates into partitionsTobeLeader and partitionsToBeFollower
val partitionsTobeLeader = partitionState.filter { case (partition, stateInfo) =>
stateInfo.leader == config.brokerId
}
val partitionsToBeFollower = (partitionState -- partitionsTobeLeader.keys)
//Switch the specified partitions' local replicas to leaders
val partitionsBecomeLeader = if (!partitionsTobeLeader.isEmpty)
makeLeaders(controllerId, controllerEpoch, partitionsTobeLeader, correlationId, responseMap)
else
Set.empty[Partition]
//Switch the specified partitions' local replicas to followers
val partitionsBecomeFollower = if (!partitionsToBeFollower.isEmpty)
makeFollowers(controllerId, controllerEpoch, partitionsToBeFollower, correlationId, responseMap, metadataCache)
else
Set.empty[Partition]
// we initialize highwatermark thread after the first leaderisrrequest. This ensures that all the partitions
// have been completely populated before starting the checkpointing there by avoiding weird race conditions
// Start the highwatermark-checkpoint task
if (!hwThreadInitialized) {
startHighWaterMarksCheckPointThread()
hwThreadInitialized = true
}
//Shut down idle fetcher threads in replicaFetcherManager
replicaFetcherManager.shutdownIdleFetcherThreads()
onLeadershipChange(partitionsBecomeLeader, partitionsBecomeFollower)
BecomeLeaderOrFollowerResult(responseMap, Errors.NONE.code)
}
}
}
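Before moving on to the two switch methods, the two-level fencing in becomeLeaderOrFollower can be summarized in isolation: a request-level controller-epoch check, followed by a per-partition leader-epoch check. The sketch below models just that decision with hypothetical simplified types (PartitionStateLite stands in for the real PartitionState), not the actual Kafka classes:
// Hypothetical, simplified model of the two fencing checks in becomeLeaderOrFollower
case class PartitionStateLite(leader: Int, leaderEpoch: Int, replicas: Set[Int])

object EpochFencingDemo extends App {
  val localBrokerId = 1
  var controllerEpoch = 5          // broker's last known controller epoch
  val partitionLeaderEpoch = 3     // current leader epoch of one local partition

  def handle(requestControllerEpoch: Int, state: PartitionStateLite): String =
    if (requestControllerEpoch < controllerEpoch)
      "STALE_CONTROLLER_EPOCH: ignore the whole request"      // request-level fence
    else {
      controllerEpoch = requestControllerEpoch                // adopt the newer epoch
      if (state.leaderEpoch <= partitionLeaderEpoch)
        "stale leader epoch: ignore this partition"           // per-partition fence
      else if (!state.replicas.contains(localBrokerId))
        "UNKNOWN_TOPIC_OR_PARTITION: replica not assigned to this broker"
      else if (state.leader == localBrokerId) "make leader" else "make follower"
    }

  println(handle(4, PartitionStateLite(leader = 1, leaderEpoch = 9, replicas = Set(1, 2)))) // stale controller epoch
  println(handle(6, PartitionStateLite(leader = 1, leaderEpoch = 3, replicas = Set(1, 2)))) // stale leader epoch
  println(handle(6, PartitionStateLite(leader = 1, leaderEpoch = 4, replicas = Set(1, 2)))) // make leader
  println(handle(6, PartitionStateLite(leader = 2, leaderEpoch = 5, replicas = Set(1, 2)))) // make follower
}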
ReplicaManager.makeLeaders switches the local replicas of the specified partitions to leaders. If a replica is switching from follower to leader, the related fetcher threads must be stopped first, and then Partition.makeLeader is called.
private def makeLeaders(controllerId: Int,
epoch: Int,
partitionState: Map[Partition, PartitionState],
correlationId: Int,
responseMap: mutable.Map[TopicPartition, Short]): Set[Partition] = {
partitionState.foreach(state =>
stateChangeLogger.trace(("Broker %d handling LeaderAndIsr request correlationId %d from controller %d epoch %d " +
"starting the become-leader transition for partition %s")
.format(localBrokerId, correlationId, controllerId, epoch, TopicAndPartition(state._1.topic, state._1.partitionId))))
// Initialize the error code of each partition to Errors.NONE.code
for (partition <- partitionState.keys)
responseMap.put(new TopicPartition(partition.topic, partition.partitionId), Errors.NONE.code)
val partitionsToMakeLeaders: mutable.Set[Partition] = mutable.Set()
try {
// First stop fetchers for all the partitions
// The replicas on this broker may previously have been followers, so stop their fetch operations first
replicaFetcherManager.removeFetcherForPartitions(partitionState.keySet.map(new TopicAndPartition(_)))
// Update the partition information to be the leader
// Call Partition.makeLeader to switch the partition's local replica to leader
partitionState.foreach{ case (partition, partitionStateInfo) =>
if (partition.makeLeader(controllerId, partitionStateInfo, correlationId))
// Record the partitions that successfully switched to leader
partitionsToMakeLeaders += partition
else
stateChangeLogger.info(("Broker %d skipped the become-leader state change after marking its partition as leader with correlation id %d from " +
"controller %d epoch %d for partition %s since it is already the leader for the partition.")
.format(localBrokerId, correlationId, controllerId, epoch, TopicAndPartition(partition.topic, partition.partitionId)));
}
partitionsToMakeLeaders.foreach { partition =>
stateChangeLogger.trace(("Broker %d stopped fetchers as part of become-leader request from controller " +
"%d epoch %d with correlation id %d for partition %s")
.format(localBrokerId, controllerId, epoch, correlationId, TopicAndPartition(partition.topic, partition.partitionId)))
}
} catch {
case e: Throwable =>
partitionState.foreach { state =>
val errorMsg = ("Error on broker %d while processing LeaderAndIsr request correlationId %d received from controller %d" +
" epoch %d for partition %s").format(localBrokerId, correlationId, controllerId, epoch,
TopicAndPartition(state._1.topic, state._1.partitionId))
stateChangeLogger.error(errorMsg, e)
}
// Re-throw the exception for it to be caught in KafkaApis
throw e
}
partitionState.foreach { state =>
stateChangeLogger.trace(("Broker %d completed LeaderAndIsr request correlationId %d from controller %d epoch %d " +
"for the become-leader transition for partition %s")
.format(localBrokerId, correlationId, controllerId, epoch, TopicAndPartition(state._1.topic, state._1.partitionId)))
}
partitionsToMakeLeaders
}
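Note that the ordering inside makeLeaders matters: the fetcher for a partition is removed before the role switch, so that no stale fetcher keeps appending follower-style writes once the replica starts serving as leader. The following sketch illustrates this stop-then-switch contract with a hypothetical FetcherManagerLite; it is not the real ReplicaFetcherManager API:
import scala.collection.mutable

// Hypothetical stand-in for ReplicaFetcherManager: tracks which partitions
// currently have an active fetcher thread pulling from a leader.
class FetcherManagerLite {
  private val fetching = mutable.Set[(String, Int)]()
  def addFetcher(tp: (String, Int)): Unit = fetching += tp
  def removeFetcher(tp: (String, Int)): Unit = fetching -= tp
  def isFetching(tp: (String, Int)): Boolean = fetching.contains(tp)
}

object MakeLeaderOrderDemo extends App {
  val fetchers = new FetcherManagerLite
  val tp = ("demo", 0)
  fetchers.addFetcher(tp)                  // the partition was previously a follower

  // Become-leader transition, mirroring the order above:
  // 1. removeFetcherForPartitions  2. Partition.makeLeader
  fetchers.removeFetcher(tp)               // step 1: no more follower-style appends
  val nowLeader = !fetchers.isFetching(tp) // step 2 is only safe after step 1
  println(s"$tp fetching=${fetchers.isFetching(tp)}, can become leader=$nowLeader")
}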
The makeFollowers method switches the local replicas of the specified partitions to followers. When a replica is switching from leader to follower, the method first checks that the broker hosting the new leader replica is alive, and only then performs the switch.
After the switch, the fetcher threads syncing from the old leader are stopped, the log is truncated accordingly, and fetcher threads syncing from the new leader are started.
private def makeFollowers(controllerId: Int,
epoch: Int,
partitionState: Map[Partition, PartitionState],
correlationId: Int,
responseMap: mutable.Map[TopicPartition, Short],
metadataCache: MetadataCache) : Set[Partition] = {
partitionState.foreach { state =>
stateChangeLogger.trace(("Broker %d handling LeaderAndIsr request correlationId %d from controller %d epoch %d " +
"starting the become-follower transition for partition %s")
.format(localBrokerId, correlationId, controllerId, epoch, TopicAndPartition(state._1.topic, state._1.partitionId)))
}
for (partition <- partitionState.keys)
responseMap.put(new TopicPartition(partition.topic, partition.partitionId), Errors.NONE.code)
val partitionsToMakeFollower: mutable.Set[Partition] = mutable.Set()
try {
// TODO: Delete leaders from LeaderAndIsrRequest
partitionState.foreach{ case (partition, partitionStateInfo) =>
// Check whether the broker hosting the new leader is alive
val newLeaderBrokerId = partitionStateInfo.leader
metadataCache.getAliveBrokers.find(_.id == newLeaderBrokerId) match {
// Only change partition state when the leader is available
case Some(leaderBroker) =>
// Call Partition.makeFollower to switch the partition's local replica to follower
if (partition.makeFollower(controllerId, partitionStateInfo, correlationId))
partitionsToMakeFollower += partition
else
stateChangeLogger.info(("Broker %d skipped the become-follower state change after marking its partition as follower with correlation id %d from " +
"controller %d epoch %d for partition [%s,%d] since the new leader %d is the same as the old leader")
.format(localBrokerId, correlationId, controllerId, partitionStateInfo.controllerEpoch,
partition.topic, partition.partitionId, newLeaderBrokerId))
case None =>
// The leader broker should always be present in the metadata cache.
// If not, we should record the error message and abort the transition process for this partition
stateChangeLogger.error(("Broker %d received LeaderAndIsrRequest with correlation id %d from controller" +
" %d epoch %d for partition [%s,%d] but cannot become follower since the new leader %d is unavailable.")
.format(localBrokerId, correlationId, controllerId, partitionStateInfo.controllerEpoch,
partition.topic, partition.partitionId, newLeaderBrokerId))
// Create the local replica even if the leader is unavailable. This is required to ensure that we include
// the partition's high watermark in the checkpoint file (see KAFKA-1647)
// Create the local replica even if the leader's broker is unavailable, mainly so that the partition's HW is recorded in the checkpoint file
partition.getOrCreateReplica()
}
}
// Stop the fetcher threads that were syncing from the old leader
replicaFetcherManager.removeFetcherForPartitions(partitionsToMakeFollower.map(new TopicAndPartition(_)))
partitionsToMakeFollower.foreach { partition =>
stateChangeLogger.trace(("Broker %d stopped fetchers as part of become-follower request from controller " +
"%d epoch %d with correlation id %d for partition %s")
.format(localBrokerId, controllerId, epoch, correlationId, TopicAndPartition(partition.topic, partition.partitionId)))
}
// Since the leader has changed, messages between HW and LEO may differ between the old and new leaders, while messages below the HW are consistent, so truncate the log to the HW
logManager.truncateTo(partitionsToMakeFollower.map(partition => (new TopicAndPartition(partition), partition.getOrCreateReplica().highWatermark.messageOffset)).toMap)
// Try to complete the DelayedOperations related to these partitions
partitionsToMakeFollower.foreach { partition =>
val topicPartitionOperationKey = new TopicPartitionOperationKey(partition.topic, partition.partitionId)
tryCompleteDelayedProduce(topicPartitionOperationKey)
tryCompleteDelayedFetch(topicPartitionOperationKey)
}
partitionsToMakeFollower.foreach { partition =>
stateChangeLogger.trace(("Broker %d truncated logs and checkpointed recovery boundaries for partition [%s,%d] as part of " +
"become-follower request with correlation id %d from controller %d epoch %d").format(localBrokerId,
partition.topic, partition.partitionId, correlationId, controllerId, epoch))
}
if (isShuttingDown.get()) { // Check whether ReplicaManager is shutting down
partitionsToMakeFollower.foreach { partition =>
stateChangeLogger.trace(("Broker %d skipped the adding-fetcher step of the become-follower state change with correlation id %d from " +
"controller %d epoch %d for partition [%s,%d] since it is shutting down").format(localBrokerId, correlationId,
controllerId, epoch, partition.topic, partition.partitionId))
}
}
else {
// we do not need to check if the leader exists again since this has been done at the beginning of this process
// Start fetcher threads to sync from the new leader replicas
val partitionsToMakeFollowerWithLeaderAndOffset = partitionsToMakeFollower.map(partition =>
new TopicAndPartition(partition) -> BrokerAndInitialOffset(
metadataCache.getAliveBrokers.find(_.id == partition.leaderReplicaIdOpt.get).get.getBrokerEndPoint(config.interBrokerSecurityProtocol),
partition.getReplica().get.logEndOffset.messageOffset)).toMap
replicaFetcherManager.addFetcherForPartitions(partitionsToMakeFollowerWithLeaderAndOffset)
partitionsToMakeFollower.foreach { partition =>
stateChangeLogger.trace(("Broker %d started fetcher to new leader as part of become-follower request from controller " +
"%d epoch %d with correlation id %d for partition [%s,%d]")
.format(localBrokerId, controllerId, epoch, correlationId, partition.topic, partition.partitionId))
}
}
} catch {
case e: Throwable =>
val errorMsg = ("Error on broker %d while processing LeaderAndIsr request with correlationId %d received from controller %d " +
"epoch %d").format(localBrokerId, correlationId, controllerId, epoch)
stateChangeLogger.error(errorMsg, e)
// Re-throw the exception for it to be caught in KafkaApis
throw e
}
partitionState.foreach { state =>
stateChangeLogger.trace(("Broker %d completed LeaderAndIsr request correlationId %d from controller %d epoch %d " +
"for the become-follower transition for partition %s")
.format(localBrokerId, correlationId, controllerId, epoch, TopicAndPartition(state._1.topic, state._1.partitionId)))
}
partitionsToMakeFollower
}
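The truncation step deserves a concrete example. Offsets below the HW are known to be committed and therefore agree across replicas, while offsets in [HW, LEO) may differ from the new leader's log. The sketch below uses a hypothetical in-memory log, not the real Log class, to show why a new follower truncates to its HW before fetching from the new leader:
// Hypothetical, simplified in-memory log: why a new follower truncates to its HW
object TruncateToHwDemo extends App {
  // The old leader (this broker) appended up to LEO = 8, but only offsets
  // below HW = 5 are known committed (replicated to the whole ISR).
  val localLog = Vector("m0", "m1", "m2", "m3", "m4", "u5", "u6", "u7") // LEO = 8
  val highWatermark = 5

  // Offsets in [HW, LEO) may disagree with the new leader, so drop them;
  // this mirrors the logManager.truncateTo(partition -> HW) call above.
  val truncated = localLog.take(highWatermark)

  println(s"LEO before=${localLog.size}, LEO after truncate=${truncated.size}")
  // The follower then fetches from the new leader starting at offset 5 and
  // re-replicates whatever the new leader actually has in that range.
}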