Author | Wu Xie. Four years of big data experience; currently working at an internet company in Guangzhou, responsible for building the in-house big data platform and for offline and real-time computing.
Editor | auroral-L
Full text: about 6,710 characters; estimated reading time: 35 minutes.
Chapter 4: Inside the Kafka Server (Part 2)
1. Data Storage
2. Replica Data Synchronization
3. Summary
Earlier we analyzed how the server initializes and how it receives and handles producer requests. Let's now move on to the remaining core topics: how data is stored and how replica data is synchronized.
1. Data Storage
In the previous part, while analyzing the handle() method of KafkaApis, we already touched on data storage. As shown below, the appendMessages() method appends the message data of a produce request to the log subsystem.
replicaManager.appendMessages(
produceRequest.timeout.toLong,
produceRequest.acks,
internalTopicsAllowed,
authorizedMessagesPerPartition,
sendResponseCallback)
Next, let's look at the source of appendMessages().
/**
* Append messages to leader replicas of the partition, and wait for them to be replicated to other replicas;
* the callback function will be triggered either when timeout or the required acks are satisfied
*/
def appendMessages(timeout: Long,
requiredAcks: Short,
internalTopicsAllowed: Boolean,
messagesPerPartition: Map[TopicPartition, MessageSet],
responseCallback: Map[TopicPartition, PartitionResponse] => Unit) {
//Check that the acks value passed in is valid: -1, 0, or 1
if (isValidRequiredAcks(requiredAcks)) {
val sTime = SystemTime.milliseconds
//Append the data to the local log
//localProduceResults holds the per-partition result of the local write
val localProduceResults = appendToLocalLog(internalTopicsAllowed, messagesPerPartition, requiredAcks)
debug("Produce to local log in %d ms".format(SystemTime.milliseconds - sTime))
//Build the per-partition response to the client from the local append results
//At this point the data has been written to the leader partition
val produceStatus = localProduceResults.map { case (topicPartition, result) =>
topicPartition ->
ProducePartitionStatus(
result.info.lastOffset + 1, // required offset
new PartitionResponse(result.errorCode, result.info.firstOffset, result.info.logAppendTime)) // response status
}
//acks = -1: the response must wait for replication (DelayedProduce)
if (delayedRequestRequired(requiredAcks, messagesPerPartition, localProduceResults)) {
// create delayed produce operation
val produceMetadata = ProduceMetadata(requiredAcks, produceStatus)
val delayedProduce = new DelayedProduce(timeout, produceMetadata, this, responseCallback)
// create a list of (topic, partition) pairs to use as keys for this delayed produce operation
val producerRequestKeys = messagesPerPartition.keys.map(new TopicPartitionOperationKey(_)).toSeq
// try to complete the request immediately, otherwise put it into the purgatory
// this is because while the delayed produce operation is being created, new
// requests may arrive and hence make this operation completable.
delayedProducePurgatory.tryCompleteElseWatch(delayedProduce, producerRequestKeys)
} else {
// we can respond immediately
val produceResponseStatus = produceStatus.mapValues(status => status.responseStatus)
responseCallback(produceResponseStatus)
}
} else {
// If required.acks is outside accepted range, something is wrong with the client
// Just return an error and don't handle the request at all
val responseStatus = messagesPerPartition.map {
case (topicAndPartition, messageSet) =>
// The acks value is invalid; return an error with error code INVALID_REQUIRED_ACKS
topicAndPartition -> new PartitionResponse(Errors.INVALID_REQUIRED_ACKS.code,
LogAppendInfo.UnknownLogAppendInfo.firstOffset, Message.NoTimestamp)
}
//Finally invoke the callback
//It is this callback that sends the response back to the client
responseCallback(responseStatus)
}
}
As shown above, the server first appends the data to the local log (writing to the leader partition first), and then, depending on the acks setting, either invokes the callback immediately to return the response to the client or creates a DelayedProduce and waits for replication before responding.
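The acks = -1 branch above hinges on delayedRequestRequired(). As a hedged sketch, paraphrased from the 0.10.x source rather than quoted verbatim (it reuses the TopicPartition, MessageSet and LogAppendResult types from the listings above, assuming LogAppendResult carries an error: Option[Throwable]): a DelayedProduce is only created when the producer asked for acks = -1, the request actually contains partitions, and at least one local append succeeded; otherwise an error response can go straight back.
// Hedged paraphrase of delayedRequestRequired(), not verbatim source:
// wait for replication only if acks == -1, there is data to append,
// and at least one partition was appended locally without error.
private def delayedRequestRequired(requiredAcks: Short,
                                   messagesPerPartition: Map[TopicPartition, MessageSet],
                                   localProduceResults: Map[TopicPartition, LogAppendResult]): Boolean = {
  requiredAcks == -1 &&
    messagesPerPartition.nonEmpty &&
    localProduceResults.values.count(_.error.isDefined) < messagesPerPartition.size
}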
Appending the messages to the Log:
/**
* Append the messages to the local replica logs
*/
private def appendToLocalLog(internalTopicsAllowed: Boolean,
messagesPerPartition: Map[TopicPartition, MessageSet],
requiredAcks: Short): Map[TopicPartition, LogAppendResult] = {
trace("Append [%s] to local log ".format(messagesPerPartition))
//Iterate over the partitions
messagesPerPartition.map { case (topicPartition, messages) =>
BrokerTopicStats.getBrokerTopicStats(topicPartition.topic).totalProduceRequestRate.mark()
BrokerTopicStats.getBrokerAllTopicsStats().totalProduceRequestRate.mark()
// reject appending to internal topics if it is not allowed
//Reject appends to internal topics when that is not allowed
if (Topic.isInternal(topicPartition.topic) && !internalTopicsAllowed) {
(topicPartition, LogAppendResult(
LogAppendInfo.UnknownLogAppendInfo,
Some(new InvalidTopicException("Cannot append to internal topic %s".format(topicPartition.topic)))))
} else {
try {
//Look up the Partition to write to, by topic and partition
val partitionOpt = getPartition(topicPartition.topic, topicPartition.partition)
val info = partitionOpt match {
case Some(partition) =>
//Append the data to the leader partition
partition.appendMessagesToLeader(messages.asInstanceOf[ByteBufferMessageSet], requiredAcks)
case None => throw new UnknownTopicOrPartitionException("Partition %s doesn't exist on %d"
.format(topicPartition, localBrokerId))
}
val numAppendedMessages =
if (info.firstOffset == -1L || info.lastOffset == -1L)
0
else
info.lastOffset - info.firstOffset + 1
// update stats for successfully appended bytes and messages as bytesInRate and messageInRate
BrokerTopicStats.getBrokerTopicStats(topicPartition.topic).bytesInRate.mark(messages.sizeInBytes)
BrokerTopicStats.getBrokerAllTopicsStats.bytesInRate.mark(messages.sizeInBytes)
BrokerTopicStats.getBrokerTopicStats(topicPartition.topic).messagesInRate.mark(numAppendedMessages)
BrokerTopicStats.getBrokerAllTopicsStats.messagesInRate.mark(numAppendedMessages)
trace("%d bytes written to log %s-%d beginning at offset %d and ending at offset %d"
.format(messages.sizeInBytes, topicPartition.topic, topicPartition.partition, info.firstOffset, info.lastOffset))
(topicPartition, LogAppendResult(info))
} catch {
// NOTE: Failed produce requests metric is not incremented for known exceptions
// it is supposed to indicate un-expected failures of a broker in handling a produce request
case e: KafkaStorageException =>
fatal("Halting due to unrecoverable I/O error while handling produce request: ", e)
Runtime.getRuntime.halt(1)
(topicPartition, null)
case e@ (_: UnknownTopicOrPartitionException |
_: NotLeaderForPartitionException |
_: RecordTooLargeException |
_: RecordBatchTooLargeException |
_: CorruptRecordException |
_: InvalidMessageException |
_: InvalidTimestampException) =>
(topicPartition, LogAppendResult(LogAppendInfo.UnknownLogAppendInfo, Some(e)))
case t: Throwable =>
BrokerTopicStats.getBrokerTopicStats(topicPartition.topic).failedProduceRequestRate.mark()
BrokerTopicStats.getBrokerAllTopicsStats.failedProduceRequestRate.mark()
error("Error processing append operation on partition %s".format(topicPartition), t)
(topicPartition, LogAppendResult(LogAppendInfo.UnknownLogAppendInfo, Some(t)))
}
}
}
}
Writing the data to the leader partition:
def appendMessagesToLeader(messages: ByteBufferMessageSet, requiredAcks: Int = 0) = {
val (info, leaderHWIncremented) = inReadLock(leaderIsrUpdateLock) {
// Get the leader replica, if it is on this broker
val leaderReplicaOpt = leaderReplicaIfLocal()
leaderReplicaOpt match {
//The leader replica exists locally
case Some(leaderReplica) =>
//Get the Log object
val log = leaderReplica.log.get
//Read min.insync.replicas from the log config and take the current ISR size
val minIsr = log.config.minInSyncReplicas
val inSyncSize = inSyncReplicas.size
// Avoid writing to leader if there are not enough insync replicas to make it safe
if (inSyncSize < minIsr && requiredAcks == -1) {
throw new NotEnoughReplicasException("Number of insync replicas for partition [%s,%d] is [%d], below required minimum [%d]"
.format(topic, partitionId, inSyncSize, minIsr))
}
//Append the records to the Log. If you are interested, dig into the append() method: it validates the data,
//assigns offsets, keeps the valid records, finds the active segment and writes the data into it,
//updates the LEO, and flushes to disk once the flush criteria are met
val info = log.append(messages, assignOffsets = true)
// probably unblock some follower fetch requests since log end offset has been updated
replicaManager.tryCompleteDelayedFetch(new TopicPartitionOperationKey(this.topic, this.partitionId))
// we may need to increment high watermark since ISR could be down to 1
(info, maybeIncrementLeaderHW(leaderReplica))
case None =>
throw new NotLeaderForPartitionException("Leader not local for partition [%s,%d] on broker %d"
.format(topic, partitionId, localBrokerId))
}
}
// some delayed operations may be unblocked after HW changed
if (leaderHWIncremented)
tryCompleteDelayedRequests()
info
}
Finally, the responseCallback() callback is invoked to return the response to the client.
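For completeness, the requiredAcks value validated at the top of appendMessages() comes from the producer's acks setting. Here is a small client-side sketch showing the three accepted values; the broker address and topic name are made up for illustration:
import java.util.Properties
import org.apache.kafka.clients.producer.{KafkaProducer, ProducerConfig, ProducerRecord}

object AcksExample {
  def main(args: Array[String]): Unit = {
    val props = new Properties()
    props.put(ProducerConfig.BOOTSTRAP_SERVERS_CONFIG, "broker1:9092") // hypothetical broker address
    props.put(ProducerConfig.KEY_SERIALIZER_CLASS_CONFIG, "org.apache.kafka.common.serialization.StringSerializer")
    props.put(ProducerConfig.VALUE_SERIALIZER_CLASS_CONFIG, "org.apache.kafka.common.serialization.StringSerializer")
    // acks = "0": don't wait; "1": leader append only; "-1"/"all": wait for the ISR (the DelayedProduce path above)
    props.put(ProducerConfig.ACKS_CONFIG, "-1")
    val producer = new KafkaProducer[String, String](props)
    producer.send(new ProducerRecord[String, String]("demo-topic", "key", "value")) // hypothetical topic
    producer.close()
  }
}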
2. Replica Data Synchronization
Let's first go over the terms involved in Kafka replica synchronization, to make the rest of the discussion easier to follow (a toy sketch contrasting LEO and HW follows the list).
• ISR: in-sync replicas, the set of replicas of a partition that are currently in sync with the leader.
• High Watermark (HW): the replication watermark, i.e. the offset marking the latest committed message of the partition.
• LEO: Log End Offset, the offset marking the latest message in a replica's log.
• Committed Message: a message that has already been replicated to every replica in the ISR.
• Lagging Message: a message that has not yet been replicated to every replica in the ISR.
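To make LEO and HW concrete, here is a toy illustration in plain Scala; the numbers and broker ids are made up and this is not Kafka code. The leader advances its HW to the smallest LEO across the ISR, which is exactly what maybeIncrementLeaderHW() does later in this article.
object HighWatermarkToy {
  def main(args: Array[String]): Unit = {
    // LEO of each replica in the ISR, keyed by broker id (broker 1 is the leader)
    val isrLogEndOffsets = Map(1 -> 105L, 2 -> 103L, 3 -> 104L)
    // The high watermark is the minimum LEO across the ISR
    val highWatermark = isrLogEndOffsets.values.min
    // Messages below this offset have been replicated to the whole ISR, i.e. they are committed
    println(s"HW = $highWatermark") // prints: HW = 103
  }
}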
After the server has written the data to the leader partition as above, the data next needs to be synchronized to the followers. A follower does this by sending fetch requests to the leader, which the leader dispatches via ApiKeys.FETCH:
case ApiKeys.FETCH => handleFetchRequest(request)
/**
* Handle a fetch request
*/
def handleFetchRequest(request: RequestChannel.Request) {
//Extract the FetchRequest
val fetchRequest = request.requestObj.asInstanceOf[FetchRequest]
val (existingAndAuthorizedForDescribeTopics, nonExistingOrUnauthorizedForDescribeTopics) = fetchRequest.requestInfo.partition {
case (topicAndPartition, _) => authorize(request.session, Describe, new Resource(auth.Topic, topicAndPartition.topic)) && metadataCache.contains(topicAndPartition.topic)
}
val (authorizedRequestInfo, unauthorizedForReadRequestInfo) = existingAndAuthorizedForDescribeTopics.partition {
case (topicAndPartition, _) => authorize(request.session, Read, new Resource(auth.Topic, topicAndPartition.topic))
}
val nonExistingOrUnauthorizedForDescribePartitionData = nonExistingOrUnauthorizedForDescribeTopics.map { case (tp, _) =>
(tp, FetchResponsePartitionData(Errors.UNKNOWN_TOPIC_OR_PARTITION.code, -1, MessageSet.Empty))
}
val unauthorizedForReadPartitionData = unauthorizedForReadRequestInfo.map { case (tp, _) =>
(tp, FetchResponsePartitionData(Errors.TOPIC_AUTHORIZATION_FAILED.code, -1, MessageSet.Empty))
}
// the callback for sending a fetch response
def sendResponseCallback(responsePartitionData: Seq[(TopicAndPartition, FetchResponsePartitionData)]) {
val convertedPartitionData =
// Need to down-convert message when consumer only takes magic value 0.
if (fetchRequest.versionId <= 1) {
responsePartitionData.map { case (tp, data) =>
// We only do down-conversion when:
// 1. The message format version configured for the topic is using magic value > 0, and
// 2. The message set contains message whose magic > 0
// This is to reduce the message format conversion as much as possible. The conversion will only occur
// when new message format is used for the topic and we see an old request.
// Please note that if the message format is changed from a higher version back to lower version this
// test might break because some messages in new message format can be delivered to consumers before 0.10.0.0
// without format down conversion.
val convertedData = if (replicaManager.getMessageFormatVersion(tp).exists(_ > Message.MagicValue_V0) &&
!data.messages.isMagicValueInAllWrapperMessages(Message.MagicValue_V0)) {
trace(s"Down converting message to V0 for fetch request from ${fetchRequest.clientId}")
new FetchResponsePartitionData(data.error, data.hw, data.messages.asInstanceOf[FileMessageSet].toMessageFormat(Message.MagicValue_V0))
} else data
tp -> convertedData
}
} else responsePartitionData
val mergedPartitionData = convertedPartitionData ++ unauthorizedForReadPartitionData ++ nonExistingOrUnauthorizedForDescribePartitionData
mergedPartitionData.foreach { case (topicAndPartition, data) =>
if (data.error != Errors.NONE.code)
debug(s"Fetch request with correlation id ${fetchRequest.correlationId} from client ${fetchRequest.clientId} " +
s"on partition $topicAndPartition failed due to ${Errors.forCode(data.error).exceptionName}")
// record the bytes out metrics only when the response is being sent
BrokerTopicStats.getBrokerTopicStats(topicAndPartition.topic).bytesOutRate.mark(data.messages.sizeInBytes)
BrokerTopicStats.getBrokerAllTopicsStats().bytesOutRate.mark(data.messages.sizeInBytes)
}
def fetchResponseCallback(delayTimeMs: Int) {
trace(s"Sending fetch response to client ${fetchRequest.clientId} of " +
s"${convertedPartitionData.map { case (_, v) => v.messages.sizeInBytes }.sum} bytes")
//Build the fetch response
val response = FetchResponse(fetchRequest.correlationId, mergedPartitionData.toSeq, fetchRequest.versionId, delayTimeMs)
//Send the response
requestChannel.sendResponse(new RequestChannel.Response(request, new FetchResponseSend(request.connectionId, response)))
}
// When this callback is triggered, the remote API call has completed
request.apiRemoteCompleteTimeMs = SystemTime.milliseconds
if (fetchRequest.isFromFollower) {
//We've already evaluated against the quota and are good to go. Just need to record it now.
val responseSize = sizeOfThrottledPartitions(fetchRequest, mergedPartitionData, quotas.leader)
quotas.leader.record(responseSize)
fetchResponseCallback(0)
} else {
val responseSize = FetchResponse.responseSize(FetchResponse.batchByTopic(mergedPartitionData),
fetchRequest.versionId)
quotas.fetch.recordAndMaybeThrottle(request.session.sanitizedUser, fetchRequest.clientId, responseSize, fetchResponseCallback)
}
}
//No authorized, existing partitions to read from: respond immediately
if (authorizedRequestInfo.isEmpty)
sendResponseCallback(Seq.empty)
else {
// call the replica manager to fetch messages from the local replica
//Fetch the messages
replicaManager.fetchMessages(
//Maximum wait time
fetchRequest.maxWait.toLong,
//Replica id of the requester
fetchRequest.replicaId,
//Minimum number of bytes to fetch
fetchRequest.minBytes,
//Maximum number of bytes to fetch
fetchRequest.maxBytes,
fetchRequest.versionId <= 2,
authorizedRequestInfo,
replicationQuota(fetchRequest),
sendResponseCallback)
}
}
The actual data fetch for an ApiKeys.FETCH request happens in replicaManager.fetchMessages(); let's step into that method.
/**
* Fetch messages from the leader replica, and wait until enough data can be fetched and return;
* the callback function will be triggered either when timeout or required fetch info is satisfied
*/
def fetchMessages(timeout: Long,
replicaId: Int,
fetchMinBytes: Int,
fetchMaxBytes: Int,
hardMaxBytesLimit: Boolean,
fetchInfos: Seq[(TopicAndPartition, PartitionFetchInfo)],
quota: ReplicaQuota = UnboundedQuota,
responseCallback: Seq[(TopicAndPartition, FetchResponsePartitionData)] => Unit) {
val isFromFollower = replicaId >= 0
// If replicaId != -2 (the debugging consumer id), this is a normal fetch request and may only read from the leader replica
val fetchOnlyFromLeader: Boolean = replicaId != Request.DebuggingConsumerId
// If the request does not come from a broker (i.e. not a follower), only data below the high watermark may be read
val fetchOnlyCommitted: Boolean = ! Request.isValidBrokerId(replicaId)
// Read the log data from the local disk
val logReadResults = readFromLocalLog(fetchOnlyFromLeader, fetchOnlyCommitted, fetchMaxBytes, hardMaxBytesLimit,
fetchInfos, quota)
// if the fetch comes from the follower,
// update its corresponding log end offset
//Check whether the fetch request comes from a follower
if(Request.isValidBrokerId(replicaId))
//The leader partition keeps track of the LEO of every replica of this partition
/**
* updateFollowerLogReadResults() handles FetchRequests coming from follower replicas and mainly does four things:
* 1. The leader maintains state for each follower replica; update it here, e.g. the follower's LEO and lastCaughtUpTimeMsUnderlying;
* 2. Check whether the ISR needs to be expanded; if the ISR changes, persist the new ISR to ZooKeeper;
* 3. Check whether the leader's HighWatermark can be advanced;
* 4. Check the DelayedProduce operations watching the related keys in delayedProducePurgatory and complete those whose conditions are now satisfied.
*/
updateFollowerLogReadResults(replicaId, logReadResults)
// check if this fetch request can be satisfied right away
val logReadResultValues = logReadResults.map { case (_, v) => v }
val bytesReadable = logReadResultValues.map(_.info.messageSet.sizeInBytes).sum
val errorReadingData = logReadResultValues.foldLeft(false) ((errorIncurred, readResult) =>
errorIncurred || (readResult.errorCode != Errors.NONE.code))
// respond immediately if 1) fetch request does not want to wait
// 2) fetch request does not require any data
// 3) has enough data to respond
// 4) some error happens while reading data
/**
* Decide whether a FetchResponse can be returned immediately; any one of the following four conditions is enough:
* 1. The FetchRequest's timeout <= 0, i.e. the consumer or follower replica does not want to wait;
* 2. The FetchRequest specifies no partitions to read, i.e. fetchInfos.isEmpty;
* 3. Enough data has already been read, i.e. bytesReadable >= fetchMinBytes;
* 4. An error occurred while reading the data, i.e. errorReadingData is true.
*/
if (timeout <= 0 || fetchInfos.isEmpty || bytesReadable >= fetchMinBytes || errorReadingData) {
val fetchPartitionData = logReadResults.map { case (tp, result) =>
tp -> FetchResponsePartitionData(result.errorCode, result.hw, result.info.messageSet)
}
//Invoke the callback to send the response
responseCallback(fetchPartitionData)
} else {
// construct the fetch results from the read results
// Convert the log read results into FetchPartitionStatus
val fetchPartitionStatus = logReadResults.map { case (topicAndPartition, result) =>
val fetchInfo = fetchInfos.collectFirst {
case (tp, v) if tp == topicAndPartition => v
}.getOrElse(sys.error(s"Partition $topicAndPartition not found in fetchInfos"))
(topicAndPartition, FetchPartitionStatus(result.info.fetchOffsetMetadata, fetchInfo))
}
val fetchMetadata = FetchMetadata(fetchMinBytes, fetchMaxBytes, hardMaxBytesLimit, fetchOnlyFromLeader,
fetchOnlyCommitted, isFromFollower, fetchPartitionStatus)
//Create a DelayedFetch operation
val delayedFetch = new DelayedFetch(timeout, fetchMetadata, this, quota, responseCallback)
// create a list of (topic, partition) pairs to use as keys for this delayed fetch operation
val delayedFetchKeys = fetchPartitionStatus.map { case (tp, _) => new TopicPartitionOperationKey(tp) }
// try to complete the request immediately, otherwise put it into the purgatory;
// this is because while the delayed fetch operation is being created, new requests
// may arrive and hence make this operation completable.
// Try to complete the DelayedFetch immediately; otherwise hand it to delayedFetchPurgatory to manage
delayedFetchPurgatory.tryCompleteElseWatch(delayedFetch, delayedFetchKeys)
}
}
First, the method uses the requester's replica id to decide whether the fetch comes from a follower and which data may be read, and then reads the log data from the local disk.
/**
* Read from multiple topic partitions at the given offset up to maxSize bytes
*/
def readFromLocalLog(fetchOnlyFromLeader: Boolean,
readOnlyCommitted: Boolean,
fetchMaxBytes: Int,
hardMaxBytesLimit: Boolean,
readPartitionInfo: Seq[(TopicAndPartition, PartitionFetchInfo)],
quota: ReplicaQuota): Seq[(TopicAndPartition, LogReadResult)] = {
def read(tp: TopicAndPartition, fetchInfo: PartitionFetchInfo, limitBytes: Int, minOneMessage: Boolean): LogReadResult = {
val TopicAndPartition(topic, partition) = tp
val PartitionFetchInfo(offset, fetchSize) = fetchInfo
BrokerTopicStats.getBrokerTopicStats(topic).totalFetchRequestRate.mark()
BrokerTopicStats.getBrokerAllTopicsStats().totalFetchRequestRate.mark()
try {
trace(s"Fetching log segment for partition $tp, offset ${offset}, partition fetch size ${fetchSize}, " +
s"remaining response limit ${limitBytes}" +
(if (minOneMessage) s", ignoring response/partition size limits" else ""))
// decide whether to only fetch from leader
//Get the local replica to read from (the leader replica for a normal fetch)
val localReplica = if (fetchOnlyFromLeader)
getLeaderReplicaIfLocal(topic, partition)
else
getReplicaOrException(topic, partition)
// decide whether to only fetch committed data (i.e. messages below high watermark)
val maxOffsetOpt = if (readOnlyCommitted)
Some(localReplica.highWatermark.messageOffset)
else
None
/* Read the LogOffsetMetadata prior to performing the read from the log.
* We use the LogOffsetMetadata to determine if a particular replica is in-sync or not.
* Using the log end offset after performing the read can lead to a race condition
* where data gets appended to the log immediately after the replica has consumed from it
* This can cause a replica to always be out of sync.
*/
// Record the replica's logEndOffset before performing the read
val initialLogEndOffset = localReplica.logEndOffset
val logReadInfo = localReplica.log match {
case Some(log) =>
val adjustedFetchSize = math.min(fetchSize, limitBytes)
// Try the read first, this tells us whether we need all of adjustedFetchSize for this partition
// Read from the replica's Log; note that the maximum offset readable is now bounded by maxOffsetOpt
val fetch = log.read(offset, adjustedFetchSize, maxOffsetOpt, minOneMessage)
// If the partition is marked as throttled, and we are over-quota then exclude it
if (quota.isThrottled(tp) && quota.isQuotaExceeded)
FetchDataInfo(fetch.fetchOffsetMetadata, MessageSet.Empty)
// For FetchRequest version 3, we replace incomplete message sets with an empty one as consumers can make
// progress in such cases and don't need to report a `RecordTooLargeException`
else if (!hardMaxBytesLimit && fetch.firstMessageSetIncomplete)
FetchDataInfo(fetch.fetchOffsetMetadata, MessageSet.Empty)
else fetch
case None =>
error(s"Leader for partition $tp does not have a local log")
FetchDataInfo(LogOffsetMetadata.UnknownOffsetMetadata, MessageSet.Empty)
}
// Check whether the read reached the end of the log
val readToEndOfLog = initialLogEndOffset.messageOffset - logReadInfo.fetchOffsetMetadata.messageOffset <= 0
// Wrap the data that was read into a LogReadResult
LogReadResult(logReadInfo, localReplica.highWatermark.messageOffset, fetchSize, readToEndOfLog, None)
} catch {
// NOTE: Failed fetch requests metric is not incremented for known exceptions since it
// is supposed to indicate un-expected failure of a broker in handling a fetch request
case e@ (_: UnknownTopicOrPartitionException |
_: NotLeaderForPartitionException |
_: ReplicaNotAvailableException |
_: OffsetOutOfRangeException) =>
LogReadResult(FetchDataInfo(LogOffsetMetadata.UnknownOffsetMetadata, MessageSet.Empty), -1L, fetchSize, false, Some(e))
case e: Throwable =>
BrokerTopicStats.getBrokerTopicStats(topic).failedFetchRequestRate.mark()
BrokerTopicStats.getBrokerAllTopicsStats().failedFetchRequestRate.mark()
error(s"Error processing fetch operation on partition ${tp}, offset $offset", e)
LogReadResult(FetchDataInfo(LogOffsetMetadata.UnknownOffsetMetadata, MessageSet.Empty), -1L, fetchSize, false, Some(e))
}
}
var limitBytes = fetchMaxBytes
val result = new mutable.ArrayBuffer[(TopicAndPartition, LogReadResult)]
var minOneMessage = !hardMaxBytesLimit
//Read partition by partition
readPartitionInfo.foreach { case (tp, fetchInfo) =>
//Call the read() helper defined above
val readResult = read(tp, fetchInfo, limitBytes, minOneMessage)
val messageSetSize = readResult.info.messageSet.sizeInBytes
// Once we read from a non-empty partition, we stop ignoring request and partition level size limits
if (messageSetSize > 0)
minOneMessage = false
limitBytes = math.max(0, limitBytes - messageSetSize)
result += (tp -> readResult)
}
result
}
readFromLocalLog() first uses the fetchOnlyFromLeader and readOnlyCommitted parameters passed in to decide which replica to read from and the maximum offset that may be read, and then reads the data through the partition Log's read(startOffset: Long, maxLength: Int, maxOffset: Option[Long] = None, minOneMessage: Boolean = false): FetchDataInfo method; the result is wrapped in a LogReadResult and returned. The rest is handled by the caller, fetchMessages(): if the fetch request came from a follower replica, updateFollowerLogReadResults() is invoked, which for each partition calls the updateReplicaLogReadResult() method shown below. There is quite a lot going on here, so here is the source:
/**
* Update the log end offset of a certain replica of this partition
*/
def updateReplicaLogReadResult(replicaId: Int, logReadResult: LogReadResult) {
getReplica(replicaId) match {
case Some(replica) =>
//Update this replica's LEO
replica.updateLogReadResult(logReadResult)
// check if we need to expand ISR to include this replica
// if it is not in the ISR yet
/**
* Check whether this replica should be added to the in-sync set; the replica may have been excluded
* from the in-sync set earlier because it lagged behind, so re-check here and add it back if the conditions are now satisfied
*/
maybeExpandIsr(replicaId)
debug("Recorded replica %d log end offset (LEO) position %d for partition %s."
.format(replicaId,
logReadResult.info.fetchOffsetMetadata.messageOffset,
TopicAndPartition(topic, partitionId)))
case None =>
throw new NotAssignedReplicaException(("Leader %d failed to record follower %d's position %d since the replica" +
" is not recognized to be one of the assigned replicas %s for partition %s.")
.format(localBrokerId,
replicaId,
logReadResult.info.fetchOffsetMetadata.messageOffset,
assignedReplicas().map(_.brokerId).mkString(","),
TopicAndPartition(topic, partitionId)))
}
}
After the replica's LEO has been updated, the leader decides, based on the replica's state, whether to put the replica back into the ISR; a replica may lag behind, and if it lags beyond the threshold it is excluded from the ISR. Let's see how maybeExpandIsr() handles this.
/**
* Check and maybe expand the ISR of the partition.
*
* This function can be triggered when a replica's LEO has incremented
*/
def maybeExpandIsr(replicaId: Int) {
val leaderHWIncremented = inWriteLock(leaderIsrUpdateLock) {
// check if this replica needs to be added to the ISR
leaderReplicaIfLocal() match {
case Some(leaderReplica) =>
//Get the replica that issued the fetch
val replica = getReplica(replicaId).get
//Get the leader partition's HW
val leaderHW = leaderReplica.highWatermark
//Decide whether the ISR needs to be updated
/**
* The replica can be added to the in-sync set only if all three of the following hold:
* 1. The current in-sync set does not already contain the replica;
* 2. The replica is one of the partition's assigned replicas (AR);
* 3. The replica's LEO is greater than or equal to the leader's HighWatermark.
*/
if(!inSyncReplicas.contains(replica) &&
assignedReplicas.map(_.brokerId).contains(replicaId) &&
replica.logEndOffset.offsetDiff(leaderHW) >= 0) {
val newInSyncReplicas = inSyncReplicas + replica
info("Expanding ISR for partition [%s,%d] from %s to %s"
.format(topic, partitionId, inSyncReplicas.map(_.brokerId).mkString(","),
newInSyncReplicas.map(_.brokerId).mkString(",")))
// update ISR in ZK and cache
//Update the ISR in ZooKeeper and in the local cache
updateIsr(newInSyncReplicas)
replicaManager.isrExpandRate.mark()
}
// check if the HW of the partition can now be incremented
// since the replica maybe now be in the ISR and its LEO has just incremented
// Check whether the HW can now be advanced
maybeIncrementLeaderHW(leaderReplica)
case None => false // nothing to do if no longer leader
}
}
// some delayed operations may be unblocked after HW changed
if (leaderHWIncremented)
//If the HW was advanced
tryCompleteDelayedRequests()
}
The implementation of maybeIncrementLeaderHW() is as follows:
private def maybeIncrementLeaderHW(leaderReplica: Replica): Boolean = {
//Collect the LEO of every in-sync replica of this partition
val allLogEndOffsets = inSyncReplicas.map(_.logEndOffset)
//The minimum of those LEOs becomes the candidate HW
val newHighWatermark = allLogEndOffsets.min(new LogOffsetMetadata.OffsetOrdering)
//The current (old) HW
val oldHighWatermark = leaderReplica.highWatermark
//If the new HW is larger than the old one (or sits on a newer segment), adopt it as the leader partition's HW
if (oldHighWatermark.messageOffset < newHighWatermark.messageOffset || oldHighWatermark.onOlderSegment(newHighWatermark)) {
leaderReplica.highWatermark = newHighWatermark
debug("High watermark for partition [%s,%d] updated to %s".format(topic, partitionId, newHighWatermark))
true
} else {
debug("Skipping update high watermark since Old hw %s is larger than new hw %s for partition [%s,%d]. All leo's are %s"
.format(oldHighWatermark, newHighWatermark, topic, partitionId, allLogEndOffsets.mkString(",")))
false
}
}
If the HW was advanced, the partition tries to complete any pending DelayedProduce and DelayedFetch operations.
/**
* Try to complete any pending requests. This should be called without holding the leaderIsrUpdateLock.
*/
private def tryCompleteDelayedRequests() {
val requestKey = new TopicPartitionOperationKey(this.topic, this.partitionId)
// Try to complete pending fetch requests
replicaManager.tryCompleteDelayedFetch(requestKey)
// Try to complete pending produce requests
replicaManager.tryCompleteDelayedProduce(requestKey)
}
This code tries to complete the related delayed operations sitting in the purgatory (backed by a timing wheel); we won't walk through that machinery here, but the simplified toy sketch below gives a feel for the pattern.
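Here is a deliberately simplified, hypothetical sketch of the purgatory pattern used by DelayedProduce and DelayedFetch; the class and method names below are made up for illustration (timeouts are omitted) and are not Kafka's actual API:
import scala.collection.mutable

// A delayed operation knows how to test its own completion condition and how to respond.
abstract class ToyDelayedOp {
  private var completed = false
  def tryComplete(): Boolean // e.g. "are the required acks satisfied?" or "are fetchMinBytes available?"
  def onComplete(): Unit     // build and send the response back to the client
  def forceComplete(): Unit = if (!completed) { completed = true; onComplete() }
}

class ToyPurgatory[K] {
  private val watchers = mutable.Map.empty[K, mutable.ListBuffer[ToyDelayedOp]]

  // Like tryCompleteElseWatch: finish immediately if possible, otherwise park the operation under each key.
  def tryCompleteElseWatch(op: ToyDelayedOp, keys: Seq[K]): Boolean = synchronized {
    if (op.tryComplete()) { op.forceComplete(); true }
    else {
      keys.foreach(k => watchers.getOrElseUpdate(k, mutable.ListBuffer.empty) += op)
      false
    }
  }

  // Like checkAndComplete: called when the state behind a key changes, e.g. after the HW advances.
  def checkAndComplete(key: K): Unit = synchronized {
    watchers.get(key).foreach(_.foreach(op => if (op.tryComplete()) op.forceComplete()))
  }
}
In this picture, tryCompleteDelayedRequests() above corresponds to the checkAndComplete step: when the high watermark moves, the partition pokes the purgatory so that any DelayedProduce or DelayedFetch watching that partition key can re-check its condition.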
At this point we have only covered part of the replica data fetch inside fetchMessages(); there is a lot of material and it is fairly scattered, so it is best to pull the source down and analyze it in depth yourself. In fetchMessages(), as soon as any one of the four conditions listed earlier is satisfied, a FetchResponse can be returned immediately.
3. Summary
Data storage and replica synchronization in Kafka are each very large topics that cover far more than the space above allows, and much could not be shown in this article. Future posts may dig into specific pieces in detail, such as LogSegment, Log, offsets, LogManager, and the key methods of ReplicaManager. Stay tuned.