追加/读取消息
当local Replica切换成leader副本后,就可以处理生产者发送的ProducerRequest,把消息写到Log中。
/**
 * Appends each partition's message set to the local log and reports a per-partition result.
 *
 * @param internalTopicsAllowed whether appends to topics for which Topic.isInternal is true are permitted
 * @param messagesPerPartition  the message set to append, keyed by target partition
 * @param requiredAcks          acknowledgement setting, passed through to Partition.appendMessagesToLeader
 * @return one LogAppendResult per input partition; failures are carried inside the result instead of
 *         being thrown (a KafkaStorageException halts the JVM instead)
 */
private def appendToLocalLog(internalTopicsAllowed: Boolean,
messagesPerPartition: Map[TopicPartition, MessageSet],
requiredAcks: Short): Map[TopicPartition, LogAppendResult] = {
trace("Append [%s] to local log ".format(messagesPerPartition))
messagesPerPartition.map { // iterate over every (partition, message set) entry
case (topicPartition, messages) =>
// Count the produce request on both the per-topic and the all-topics meters,
// before any validation, so rejected requests are counted too.
BrokerTopicStats.getBrokerTopicStats(topicPartition.topic).totalProduceRequestRate.mark()
BrokerTopicStats.getBrokerAllTopicsStats().totalProduceRequestRate.mark()
// reject appending to internal topics if it is not allowed
// (the original note names __consumer_offsets as an example of an internal topic)
if (Topic.isInternal(topicPartition.topic) && !internalTopicsAllowed) {
(topicPartition, LogAppendResult(
LogAppendInfo.UnknownLogAppendInfo,
Some(new InvalidTopicException("Cannot append to internal topic %s".format(topicPartition.topic)))))
} else {
try {
// Look up the Partition object for this topic/partition.
val partitionOpt = getPartition(topicPartition.topic, topicPartition.partition)
val info = partitionOpt match {
case Some(partition) =>
// Delegate the actual write to Partition.appendMessagesToLeader.
partition.appendMessagesToLeader(messages.asInstanceOf[ByteBufferMessageSet], requiredAcks)
case None => throw new UnknownTopicOrPartitionException("Partition %s doesn't exist on %d"
.format(topicPartition, localBrokerId))
}
// An offset of -1 on either end is treated as "nothing appended";
// otherwise the count is the inclusive offset range.
val numAppendedMessages =
if (info.firstOffset == -1L || info.lastOffset == -1L)
0
else
info.lastOffset - info.firstOffset + 1
// update stats for successfully appended bytes and messages as bytesInRate and messageInRate
BrokerTopicStats.getBrokerTopicStats(topicPartition.topic).bytesInRate.mark(messages.sizeInBytes)
BrokerTopicStats.getBrokerAllTopicsStats.bytesInRate.mark(messages.sizeInBytes)
BrokerTopicStats.getBrokerTopicStats(topicPartition.topic).messagesInRate.mark(numAppendedMessages)
BrokerTopicStats.getBrokerAllTopicsStats.messagesInRate.mark(numAppendedMessages)
trace("%d bytes written to log %s-%d beginning at offset %d and ending at offset %d"
.format(messages.sizeInBytes, topicPartition.topic, topicPartition.partition, info.firstOffset, info.lastOffset))
(topicPartition, LogAppendResult(info))
} catch {
// NOTE: Failed produce requests metric is not incremented for known exceptions
// it is supposed to indicate un-expected failures of a broker in handling a produce request
case e: KafkaStorageException =>
// Storage errors are treated as unrecoverable: log and halt the JVM.
fatal("Halting due to unrecoverable I/O error while handling produce request: ", e)
Runtime.getRuntime.halt(1)
// Only here to give the branch a value of the expected type; halt(1) precedes it.
(topicPartition, null)
// Known, expected failures: returned to the caller inside the result, no failure metric.
case e@ (_: UnknownTopicOrPartitionException |
_: NotLeaderForPartitionException |
_: RecordTooLargeException |
_: RecordBatchTooLargeException |
_: CorruptRecordException |
_: InvalidMessageException |
_: InvalidTimestampException) =>
(topicPartition, LogAppendResult(LogAppendInfo.UnknownLogAppendInfo, Some(e)))
// Anything else is unexpected: mark the failure meters, log, and still return a result.
case t: Throwable =>
BrokerTopicStats.getBrokerTopicStats(topicPartition.topic).failedProduceRequestRate.mark()
BrokerTopicStats.getBrokerAllTopicsStats.failedProduceRequestRate.mark()
error("Error processing append operation on partition %s".format(topicPartition), t)
(topicPartition, LogAppendResult(LogAppendInfo.UnknownLogAppendInfo, Some(t)))
}
}
}
}
Leader副本的另外一个功能是处理Fetch请求。
/**
 * Reads messages from the local log for every requested partition.
 *
 * @param fetchOnlyFromLeader if true the replica is obtained via getLeaderReplicaIfLocal, otherwise via
 *                            getReplicaOrException
 * @param readOnlyCommitted   if true the read is capped at the replica's high watermark
 * @param readPartitionInfo   fetch offset and fetch size per partition
 * @return one LogReadResult per requested partition; errors are carried inside the result
 */
def readFromLocalLog(fetchOnlyFromLeader: Boolean, // read only from the leader replica; per the original note, false only when debugging
readOnlyCommitted: Boolean, // read only messages below the high watermark (HW)
readPartitionInfo: Map[TopicAndPartition, PartitionFetchInfo]): Map[TopicAndPartition, LogReadResult] = {
readPartitionInfo.map { case (TopicAndPartition(topic, partition), PartitionFetchInfo(offset, fetchSize)) =>
// Count the fetch request on both the per-topic and the all-topics meters.
BrokerTopicStats.getBrokerTopicStats(topic).totalFetchRequestRate.mark()
BrokerTopicStats.getBrokerAllTopicsStats().totalFetchRequestRate.mark()
val partitionDataAndOffsetInfo =
try {
trace("Fetching log segment for topic %s, partition %d, offset %d, size %d".format(topic, partition, offset, fetchSize))
// decide whether to only fetch from leader
val localReplica = if (fetchOnlyFromLeader)
getLeaderReplicaIfLocal(topic, partition)
else
getReplicaOrException(topic, partition)
// decide whether to only fetch committed data (i.e. messages below high watermark)
// Some(hw) caps the read at the high watermark; None leaves it uncapped.
val maxOffsetOpt = if (readOnlyCommitted)
Some(localReplica.highWatermark.messageOffset)
else
None
/* Read the LogOffsetMetadata prior to performing the read from the log.
* We use the LogOffsetMetadata to determine if a particular replica is in-sync or not.
* Using the log end offset after performing the read can lead to a race condition
* where data gets appended to the log immediately after the replica has consumed from it
* This can cause a replica to always be out of sync.
*/
val initialLogEndOffset = localReplica.logEndOffset
// logReadInfo is a FetchDataInfo: the fetch offset metadata plus the message set that was read.
val logReadInfo = localReplica.log match {
case Some(log) =>
// Read from the log, starting at `offset`, up to `fetchSize`, capped by maxOffsetOpt.
log.read(offset, fetchSize, maxOffsetOpt)
case None =>
// No local log: log the problem and return an empty read rather than failing.
error("Leader for partition [%s,%d] does not have a local log".format(topic, partition))
FetchDataInfo(LogOffsetMetadata.UnknownOffsetMetadata, MessageSet.Empty)
}
// True when the fetch position is at or beyond the log end offset captured before the read.
val readToEndOfLog = initialLogEndOffset.messageOffset - logReadInfo.fetchOffsetMetadata.messageOffset <= 0
// Package everything into a LogReadResult with no error.
LogReadResult(logReadInfo, localReplica.highWatermark.messageOffset, fetchSize, readToEndOfLog, None)
} catch {
// NOTE: Failed fetch requests metric is not incremented for known exceptions since it
// is supposed to indicate un-expected failure of a broker in handling a fetch request
case utpe: UnknownTopicOrPartitionException =>
LogReadResult(FetchDataInfo(LogOffsetMetadata.UnknownOffsetMetadata, MessageSet.Empty), -1L, fetchSize, false, Some(utpe))
case nle: NotLeaderForPartitionException =>
LogReadResult(FetchDataInfo(LogOffsetMetadata.UnknownOffsetMetadata, MessageSet.Empty), -1L, fetchSize, false, Some(nle))
case rnae: ReplicaNotAvailableException =>
LogReadResult(FetchDataInfo(LogOffsetMetadata.UnknownOffsetMetadata, MessageSet.Empty), -1L, fetchSize, false, Some(rnae))
case oor : OffsetOutOfRangeException =>
LogReadResult(FetchDataInfo(LogOffsetMetadata.UnknownOffsetMetadata, MessageSet.Empty), -1L, fetchSize, false, Some(oor))
// Unexpected failure: mark the failure meters and log before returning the error result.
case e: Throwable =>
BrokerTopicStats.getBrokerTopicStats(topic).failedFetchRequestRate.mark()
BrokerTopicStats.getBrokerAllTopicsStats().failedFetchRequestRate.mark()
error("Error processing fetch operation on partition [%s,%d] offset %d".format(topic, partition, offset), e)
LogReadResult(FetchDataInfo(LogOffsetMetadata.UnknownOffsetMetadata, MessageSet.Empty), -1L, fetchSize, false, Some(e))
}
(TopicAndPartition(topic, partition), partitionDataAndOffsetInfo)
}
}
当ISR集合中所有副本都已经同步消息后,可以把HW后移。针对来自follower的fetch请求多了一步处理:
/**
 * Records, for one follower, the outcome of the log reads that served its fetch request.
 *
 * @param replicaId   broker id of the follower that issued the fetch
 * @param readResults the read result per partition
 */
private def updateFollowerLogReadResults(replicaId: Int, readResults: Map[TopicAndPartition, LogReadResult]) {
  debug("Recording follower broker %d log read results: %s ".format(replicaId, readResults))
  for ((tp, followerRead) <- readResults) {
    getPartition(tp.topic, tp.partition) match {
      case None =>
        warn("While recording the replica LEO, the partition %s hasn't been created.".format(tp))
      case Some(leaderPartition) =>
        // Let the partition record the follower's new state.
        leaderPartition.updateReplicaLogReadResult(replicaId, followerRead)
        // for producer requests with ack > 1, we need to check
        // if they can be unblocked after some follower's log end offsets have moved
        tryCompleteDelayedProduce(new TopicPartitionOperationKey(tp))
    }
  }
}
/**
 * Stores a follower's latest log read result on its replica object and then gives the
 * partition a chance to add that follower to the ISR.
 *
 * @param replicaId     broker id of the follower
 * @param logReadResult result of the read that served the follower's fetch
 * @throws NotAssignedReplicaException if getReplica does not know the given replica id
 */
def updateReplicaLogReadResult(replicaId: Int, logReadResult: LogReadResult) {
  // Local defs keep these evaluated only where a message is actually built.
  def reportedOffset = logReadResult.info.fetchOffsetMetadata.messageOffset
  def thisPartition = TopicAndPartition(topic, partitionId)

  getReplica(replicaId) match {
    case None =>
      throw new NotAssignedReplicaException(("Leader %d failed to record follower %d's position %d since the replica" +
        " is not recognized to be one of the assigned replicas %s for partition %s.")
        .format(localBrokerId,
          replicaId,
          reportedOffset,
          assignedReplicas().map(_.brokerId).mkString(","),
          thisPartition))
    case Some(follower) =>
      follower.updateLogReadResult(logReadResult)
      // check if we need to expand ISR to include this replica
      // if it is not in the ISR yet
      maybeExpandIsr(replicaId)
      debug("Recorded replica %d log end offset (LEO) position %d for partition %s."
        .format(replicaId, reportedOffset, thisPartition))
  }
}
消息同步
Follower副本与leader副本同步的功能由ReplicaFetcherManager组件实现,ReplicaFetcherManager继承了AbstractFetcherManager,AbstractFetcherManager.addFetcherForPartitions()方法让follower副本从指定的offset开始与leader副本进行同步。
/**
 * Assigns each given partition to a fetcher thread, creating and starting threads as needed.
 *
 * Partitions are grouped by (source broker, fetcher id). Each group is handed to the fetcher
 * thread registered under that key, so one thread talks to one broker but may serve many partitions.
 *
 * @param partitionAndOffsets source broker and initial fetch offset for each partition
 */
def addFetcherForPartitions(partitionAndOffsets: Map[TopicAndPartition, BrokerAndInitialOffset]) {
  mapLock synchronized {
    // The fetcher id is derived from topic and partition; together with the broker it forms the key.
    val partitionsPerFetcher = partitionAndOffsets.groupBy { case (topicAndPartition, brokerAndInitialOffset) =>
      BrokerAndFetcherId(brokerAndInitialOffset.broker, getFetcherId(topicAndPartition.topic, topicAndPartition.partition))
    }
    // The loop variable is deliberately NOT named like the method parameter, so the log
    // statement after the lock cannot be confused with a per-group value.
    for ((brokerAndFetcherId, assignedPartitions) <- partitionsPerFetcher) {
      // Reuse the registered thread, or create, register and start a new one.
      val fetcherThread = fetcherThreadMap.get(brokerAndFetcherId) match {
        case Some(existing) => existing
        case None =>
          val created = createFetcherThread(brokerAndFetcherId.fetcherId, brokerAndFetcherId.broker)
          fetcherThreadMap.put(brokerAndFetcherId, created)
          created.start()
          created
      }
      // Hand over the partitions with their starting offsets.
      fetcherThread.addPartitions(assignedPartitions.map { case (topicAndPartition, brokerAndInitOffset) =>
        topicAndPartition -> brokerAndInitOffset.initOffset
      })
    }
  }
  info("Added fetcher for partitions %s".format(partitionAndOffsets.map{ case (topicAndPartition, brokerAndInitialOffset) =>
    "[" + topicAndPartition + ", initOffset " + brokerAndInitialOffset.initOffset + " to broker " + brokerAndInitialOffset.broker + "] "}))
}
以上是AbstractFetcherManager如何管理Fetcher线程,下面来分析Fetcher线程发送FetchRequest、处理FetchResponse的相关实现。
AbstractFetcherThread.addPartitions()方法和removePartitions()方法对partitionMap进行增删,其中addPartitions()在添加完成后会唤醒Fetcher线程进行同步。
/**
 * Registers new partitions with this fetcher thread and signals the partition-map condition.
 * Partitions already present in partitionMap keep their current fetch state.
 *
 * @param partitionAndOffsets initial fetch offset per partition
 */
def addPartitions(partitionAndOffsets: Map[TopicAndPartition, Long]) {
  partitionMapLock.lockInterruptibly()
  try {
    partitionAndOffsets.foreach { case (tp, requestedOffset) =>
      // If the partitionMap already has the topic/partition, then do not update the map with the old offset
      if (!partitionMap.contains(tp)) {
        // An invalid offset is replaced by whatever handleOffsetOutOfRange yields.
        val initialState =
          if (PartitionTopicInfo.isOffsetInvalid(requestedOffset))
            new PartitionFetchState(handleOffsetOutOfRange(tp))
          else
            new PartitionFetchState(requestedOffset)
        partitionMap.put(tp, initialState)
      }
    }
    // Signal every waiter on the condition (doWork waits on it when there is nothing to fetch).
    partitionMapCond.signalAll()
  } finally {
    partitionMapLock.unlock()
  }
}
/**
 * Drops the given partitions from partitionMap and unregisters their lag metrics.
 *
 * @param topicAndPartitions partitions to remove
 */
def removePartitions(topicAndPartitions: Set[TopicAndPartition]) {
  partitionMapLock.lockInterruptibly()
  try {
    for (tp <- topicAndPartitions) {
      partitionMap.remove(tp)
      fetcherLagStats.unregister(tp.topic, tp.partition)
    }
  } finally {
    partitionMapLock.unlock()
  }
}
doWork方法创建请求
/**
 * One iteration of the fetcher loop: build a fetch request from the current partition map
 * and process it, or wait for up to fetchBackOffMs when the request is empty.
 */
override def doWork() {
  val pendingRequest = inLock(partitionMapLock) {
    val built = buildFetchRequest(partitionMap)
    // Empty request: wait on the condition (bounded by the back-off) instead of spinning.
    if (built.isEmpty) {
      trace("There are no active partitions. Back off for %d ms before sending a fetch request".format(fetchBackOffMs))
      partitionMapCond.await(fetchBackOffMs, TimeUnit.MILLISECONDS)
    }
    built
  }
  // Sending and response handling happen outside the lock taken above.
  if (!pendingRequest.isEmpty)
    processFetchRequest(pendingRequest)
}
processFetchRequest()方法中定义了发送请求以及处理响应的步骤。
/**
 * Sends one fetch request, applies the response to every partition whose fetch offset is still
 * the one that was requested, and hands partitions that failed to handlePartitionsWithErrors.
 *
 * @param fetchRequest the request built from the current partition map
 */
private def processFetchRequest(fetchRequest: REQ) {
val partitionsWithError = new mutable.HashSet[TopicAndPartition]
var responseData: Map[TopicAndPartition, PD] = Map.empty
try {
trace("Issuing to broker %d of fetch request %s".format(sourceBroker.id, fetchRequest))
// Send the request and obtain the per-partition response data; fetch() is implemented by subclasses.
responseData = fetch(fetchRequest)
} catch {
case t: Throwable =>
// Failures are only handled while the thread is still running.
if (isRunning.get) {
warn(s"Error in fetch $fetchRequest", t)
inLock(partitionMapLock) {
// The whole request failed, so every partition currently in the map is marked as errored.
partitionsWithError ++= partitionMap.keys
// there is an error occurred while fetching partitions, sleep a while
partitionMapCond.await(fetchBackOffMs, TimeUnit.MILLISECONDS)
}
}
}
fetcherStats.requestRate.mark()
// Handle the response, if there is one.
if (responseData.nonEmpty) {
// process fetched data
// Under the lock, walk the response data of every partition.
inLock(partitionMapLock) {
responseData.foreach { case (topicAndPartition, partitionData) =>
val TopicAndPartition(topic, partitionId) = topicAndPartition
// Partitions no longer present in partitionMap are skipped.
partitionMap.get(topicAndPartition).foreach(currentPartitionFetchState =>
// we append to the log if the current offset is defined and it is the same as the offset requested during fetch
// i.e. the fetch offset did not change between sending the request and handling the response
if (fetchRequest.offset(topicAndPartition) == currentPartitionFetchState.offset) {
Errors.forCode(partitionData.errorCode) match {
case Errors.NONE =>
try { // convert the response payload into a message set
val messages = partitionData.toByteBufferMessageSet
val validBytes = messages.validBytes // fed to the byte-rate meter below
// Next fetch offset: nextOffset of the last shallow message, or unchanged if nothing came back.
val newOffset = messages.shallowIterator.toSeq.lastOption match {
case Some(m: MessageAndOffset) => m.nextOffset
case None => currentPartitionFetchState.offset
}
// Advance the fetch state and refresh the lag metric (never below zero).
partitionMap.put(topicAndPartition, new PartitionFetchState(newOffset))
fetcherLagStats.getAndMaybePut(topic, partitionId).lag = Math.max(0L, partitionData.highWatermark - newOffset)
fetcherStats.byteRate.mark(validBytes)
// Once we hand off the partition data to the subclass, we can't mess with it any more in this thread
// The subclass receives the offset that was fetched from, not the new offset.
processPartitionData(topicAndPartition, currentPartitionFetchState.offset, partitionData)
} catch {
case ime: CorruptRecordException =>
// we log the error and continue. This ensures two things
// 1. If there is a corrupt message in a topic partition, it does not bring the fetcher thread down and cause other topic partition to also lag
// 2. If the message is corrupt due to a transient state in the log (truncation, partial writes can cause this), we simply continue and
// should get fixed in the subsequent fetches
logger.error("Found invalid messages during fetch for partition [" + topic + "," + partitionId + "] offset " + currentPartitionFetchState.offset + " error " + ime.getMessage)
case e: Throwable =>
// Any other failure is wrapped and rethrown, so it propagates out of this method.
throw new KafkaException("error processing data for partition [%s,%d] offset %d"
.format(topic, partitionId, currentPartitionFetchState.offset), e)
}
case Errors.OFFSET_OUT_OF_RANGE =>
// The requested offset was reported as out of range: ask handleOffsetOutOfRange
// for a new offset and restart fetching from there.
try {
val newOffset = handleOffsetOutOfRange(topicAndPartition)
partitionMap.put(topicAndPartition, new PartitionFetchState(newOffset))
error("Current offset %d for partition [%s,%d] out of range; reset offset to %d"
.format(currentPartitionFetchState.offset, topic, partitionId, newOffset))
} catch {
case e: Throwable =>
error("Error getting offset for partition [%s,%d] to broker %d".format(topic, partitionId, sourceBroker.id), e)
partitionsWithError += topicAndPartition
}
case _ =>
// Any other error code: collect the partition so handlePartitionsWithErrors() deals with it
// (only while the thread is still running).
if (isRunning.get) {
error("Error for partition [%s,%d] to broker %d:%s".format(topic, partitionId, sourceBroker.id,
partitionData.exception.get))
partitionsWithError += topicAndPartition
}
}
})
}
}
}
if (partitionsWithError.nonEmpty) {
debug("handling partitions with error for %s".format(partitionsWithError))
// Implemented by subclasses.
handlePartitionsWithErrors(partitionsWithError)
}
}
ReplicaFetcherThread通过NetworkClientBlockingOps这个辅助类对NetworkClient进行了封装,为NetworkClient提供了同步阻塞的使用方式。NetworkClientBlockingOps提供blockingReady和blockingSendAndReceive两个方法:前者阻塞等待指定node处于ready状态,后者发送请求后阻塞等待对应的响应。
/**
 * Blocks until the client reports `node` as ready or `timeout` milliseconds have elapsed.
 *
 * @param node    the node to wait for
 * @param timeout maximum time to wait in milliseconds; must be >= 0
 * @return true if the node became ready, false if the timeout expired first
 * @throws IOException if the client reports that the connection to `node` failed
 */
def blockingReady(node: Node, timeout: Long)(implicit time: JTime): Boolean = {
  require(timeout >=0, "timeout should be >= 0")
  if (client.ready(node, time.milliseconds())) true
  else pollUntil(timeout) { (_, now) =>
    val nodeReady = client.isReady(node, now)
    // A failed connection is only checked (and reported) when the node is not ready.
    if (!nodeReady && client.connectionFailed(node))
      throw new IOException(s"Connection to $node failed")
    nodeReady
  }
}
/**
 * Sends `request` and blocks until a response with the same correlation id is polled.
 *
 * @param request the request to send
 * @return the response whose correlation id matches the request
 * @throws IOException if the matching response reports that the connection was disconnected
 */
def blockingSendAndReceive(request: ClientRequest)(implicit time: JTime): ClientResponse = {
  client.send(request, time.milliseconds())
  pollContinuously { polled =>
    // Pick out the response that belongs to the request sent above, if it has arrived.
    val matching = polled.find { candidate =>
      candidate.request.request.header.correlationId == request.request.header.correlationId
    }
    matching.map { reply =>
      if (reply.wasDisconnected) {
        val destination = request.request.destination
        throw new IOException(s"Connection to $destination was disconnected before the response was read")
      }
      reply
    }
  }
}
NetworkClientBlockingOps中的pollUntil()方法和pollContinuously()方法都是通过递归调用recursivePoll方法实现阻塞的,每次递归都会调用NetworkClient.poll()。
/**
 * Polls the client repeatedly until `predicate` holds or `timeout` milliseconds have passed.
 *
 * @param timeout   maximum total time to keep polling, in milliseconds
 * @param predicate receives the responses of the latest poll and the time that poll started
 * @return true as soon as the predicate holds, false once the deadline is reached
 */
private def pollUntil(timeout: Long)(predicate: (Seq[ClientResponse], Long) => Boolean)(implicit time: JTime): Boolean = {
  val startMs = time.milliseconds()
  val deadlineMs = startMs + timeout // absolute expiry time

  @tailrec
  def recursivePoll(nowMs: Long): Boolean = {
    // Each poll may wait for whatever is left until the deadline.
    val polled = client.poll(deadlineMs - nowMs, nowMs).asScala
    if (!predicate(polled, nowMs)) {
      val afterPollMs = time.milliseconds()
      // Stop once the deadline has been reached, otherwise poll again.
      if (afterPollMs >= deadlineMs) false
      else recursivePoll(afterPollMs)
    } else true
  }

  recursivePoll(startMs)
}
/**
 * Polls the client until `collect` yields a value for one batch of responses.
 *
 * @param collect inspects the responses of one poll; returning Some ends the loop
 * @return the value produced by `collect`
 */
private def pollContinuously[T](collect: Seq[ClientResponse] => Option[T])(implicit time: JTime): T = {
  @tailrec
  def recursivePoll: T = {
    // rely on request timeout to ensure we don't block forever
    // (the poll itself is given Long.MaxValue as its timeout)
    val polled = client.poll(Long.MaxValue, time.milliseconds()).asScala
    val outcome = collect(polled)
    outcome match {
      case None => recursivePoll // nothing collected yet: poll again
      case Some(found) => found
    }
  }

  recursivePoll
}
ReplicaFetcherThread的fetch()方法完全依赖于NetworkClientBlockingOps.blockingReady和blockingSendAndReceive方法。
/**
 * Sends the fetch request through sendRequest and converts the response into a map keyed by
 * TopicAndPartition.
 *
 * @param fetchRequest the request to send
 * @return the response data per partition, wrapped in PartitionData
 */
protected def fetch(fetchRequest: FetchRequest): Map[TopicAndPartition, PartitionData] = {
  val rawResponse = sendRequest(ApiKeys.FETCH, Some(fetchRequestVersion), fetchRequest.underlying)
  val perPartition = new FetchResponse(rawResponse.responseBody).responseData.asScala
  // Re-key the entries with TopicAndPartition and wrap each value.
  perPartition.map { case (tp, data) =>
    (TopicAndPartition(tp.topic, tp.partition), new PartitionData(data))
  }
}
/**
 * Sends one request to the source broker and blocks for its response.
 *
 * @param apiKey     API of the request
 * @param apiVersion explicit request version; when None the header is built without a version
 * @param request    the request body
 * @return the response received for the request
 * @throws SocketTimeoutException if the source node is not ready within socketTimeout ms
 */
private def sendRequest(apiKey: ApiKeys, apiVersion: Option[Short], request: AbstractRequest): ClientResponse = {
  import kafka.utils.NetworkClientBlockingOps._
  val header = apiVersion match {
    case Some(version) => networkClient.nextRequestHeader(apiKey, version)
    case None => networkClient.nextRequestHeader(apiKey)
  }
  try {
    // Block until the node is ready; treat a timeout as a failure.
    if (!networkClient.blockingReady(sourceNode, socketTimeout)(time))
      throw new SocketTimeoutException(s"Failed to connect within $socketTimeout ms")
    val send = new RequestSend(sourceBroker.id.toString, header, request.toStruct)
    val clientRequest = new ClientRequest(time.milliseconds(), true, send, null)
    networkClient.blockingSendAndReceive(clientRequest)(time)
  } catch {
    // On any failure, close the connection to the source broker and rethrow unchanged.
    case e: Throwable =>
      networkClient.close(sourceBroker.id.toString)
      throw e
  }
}
processPartitionData方法把fetch()返回的消息追加到Follower副本的Log中,并且更新follower副本的HW
/**
 * Appends the messages fetched for one partition to the local replica's log and then updates
 * that replica's high watermark.
 *
 * @param topicAndPartition the partition the data belongs to
 * @param fetchOffset       the offset the data was fetched from; must equal the replica's log end offset
 * @param partitionData     the fetched data, including the high watermark carried in the response
 */
def processPartitionData(topicAndPartition: TopicAndPartition, fetchOffset: Long, partitionData: PartitionData) {
try {
val TopicAndPartition(topic, partitionId) = topicAndPartition
// NOTE(review): .get assumes the replica exists locally — confirm callers guarantee this.
val replica = replicaMgr.getReplica(topic, partitionId).get
val messageSet = partitionData.toByteBufferMessageSet
warnIfMessageOversized(messageSet, topicAndPartition)
// The fetched data must start exactly at the local log end offset.
if (fetchOffset != replica.logEndOffset.messageOffset)
throw new RuntimeException("Offset mismatch for partition %s: fetched offset = %d, log end offset = %d.".format(topicAndPartition, fetchOffset, replica.logEndOffset.messageOffset))
if (logger.isTraceEnabled)
trace("Follower %d has replica log end offset %d for partition %s. Received %d messages and leader hw %d"
.format(replica.brokerId, replica.logEndOffset.messageOffset, topicAndPartition, messageSet.sizeInBytes, partitionData.highWatermark))
// Append without assigning offsets; per the original note the leader has already assigned them.
replica.log.get.append(messageSet, assignOffsets = false)
if (logger.isTraceEnabled)
trace("Follower %d has replica log end offset %d after appending %d bytes of messages for partition %s"
.format(replica.brokerId, replica.logEndOffset.messageOffset, messageSet.sizeInBytes, topicAndPartition))
// New HW is the smaller of the local log end offset and the high watermark in the response.
val followerHighWatermark = replica.logEndOffset.messageOffset.min(partitionData.highWatermark)
// for the follower replica, we do not need to keep
// its segment base offset the physical position,
// these values will be computed upon making the leader
replica.highWatermark = new LogOffsetMetadata(followerHighWatermark)
if (logger.isTraceEnabled)
trace("Follower %d set replica high watermark for partition [%s,%d] to %s"
.format(replica.brokerId, topic, partitionId, followerHighWatermark))
} catch {
// A storage error while replicating is treated as fatal: log and halt the JVM.
case e: KafkaStorageException =>
fatal(s"Disk error while replicating data for $topicAndPartition", e)
Runtime.getRuntime.halt(1)
}
}
关闭副本。
/**
 * Handles a StopReplicaRequest: stops fetching for the listed partitions and then stops each replica.
 *
 * @param stopReplicaRequest the request to handle
 * @return the error code per partition plus an overall error code; when the request's controller
 *         epoch is older than the known one, the map is empty and the code is STALE_CONTROLLER_EPOCH
 */
def stopReplicas(stopReplicaRequest: StopReplicaRequest): (mutable.Map[TopicPartition, Short], Short) = {
  replicaStateChangeLock synchronized {
    val responseMap = new collection.mutable.HashMap[TopicPartition, Short]
    // A request carrying an epoch older than the known one is rejected.
    val requestIsStale = stopReplicaRequest.controllerEpoch() < controllerEpoch
    if (!requestIsStale) {
      val partitions = stopReplicaRequest.partitions.asScala
      controllerEpoch = stopReplicaRequest.controllerEpoch
      // First stop fetchers for all partitions, then stop the corresponding replicas
      replicaFetcherManager.removeFetcherForPartitions(partitions.map(r => TopicAndPartition(r.topic, r.partition)))
      partitions.foreach { tp =>
        val partitionError = stopReplica(tp.topic, tp.partition, stopReplicaRequest.deletePartitions)
        responseMap.put(tp, partitionError)
      }
      (responseMap, Errors.NONE.code)
    } else {
      stateChangeLogger.warn("Broker %d received stop replica request from an old controller epoch %d. Latest known controller epoch is %d"
        .format(localBrokerId, stopReplicaRequest.controllerEpoch, controllerEpoch))
      (responseMap, Errors.STALE_CONTROLLER_EPOCH.code)
    }
  }
}
/**
 * Stops the replica of one partition. The partition and its log are only removed when
 * `deletePartition` is true.
 *
 * @param topic           topic name
 * @param partitionId     partition number
 * @param deletePartition whether the partition and its local log should be deleted
 * @return always Errors.NONE.code
 */
def stopReplica(topic: String, partitionId: Int, deletePartition: Boolean): Short = {
  stateChangeLogger.trace("Broker %d handling stop replica (delete=%s) for partition [%s,%d]".format(localBrokerId,
    deletePartition.toString, topic, partitionId))
  val errorCode = Errors.NONE.code
  val knownPartition = getPartition(topic, partitionId)
  if (knownPartition.isDefined) {
    if (deletePartition) {
      // remove() may yield null when the entry is already gone; Option(...) turns that into None.
      Option(allPartitions.remove((topic, partitionId))).foreach { removedPartition =>
        removedPartition.delete() // this will delete the local log
        // Drop the topic's metrics once no partition of that topic is left.
        val topicStillPresent = allPartitions.keys.exists { case (t, _) => topic == t }
        if (!topicStillPresent)
          BrokerTopicStats.removeMetrics(topic)
      }
    }
  } else {
    // Delete log and corresponding folders in case replica manager doesn't hold them anymore.
    // This could happen when topic is being deleted while broker is down and recovers.
    if (deletePartition) {
      val topicAndPartition = TopicAndPartition(topic, partitionId)
      if (logManager.getLog(topicAndPartition).isDefined)
        logManager.deleteLog(topicAndPartition)
    }
    stateChangeLogger.trace("Broker %d ignoring stop replica (delete=%s) for partition [%s,%d] as replica doesn't exist on broker"
      .format(localBrokerId, deletePartition, topic, partitionId))
  }
  stateChangeLogger.trace("Broker %d finished handling stop replica (delete=%s) for partition [%s,%d]"
    .format(localBrokerId, deletePartition, topic, partitionId))
  errorCode
}