副本写入,是指向副本底层日志写入消息。在 ReplicaManager 类中,实现副本写入的方法叫 appendRecords。放眼整个 Kafka 源码世界,需要副本写入的场景有 4 个。
- 生产者向 Leader 副本写入消息
- Follower 副本拉取消息后写入副本
- 消费者组写入组信息
- 事务管理器写入事务信息(包括事务标记、事务元数据等)
下面直接看ReplicaManager 类中实现副本写入的方法 appendRecords。
/**
 * Appends the given records to the local logs of the partitions they target and
 * reports the per-partition outcome through `responseCallback`.
 *
 * The callback is either invoked before this method returns, or it is handed to a
 * DelayedProduce operation that is registered with the produce purgatory.
 *
 * @param timeout                       timeout forwarded to the DelayedProduce operation
 * @param requiredAcks                  acks setting of the request, validated by isValidRequiredAcks
 *                                      (the original notes list -1, 0 and 1 as the legal values)
 * @param internalTopicsAllowed         whether appends to internal topics are permitted
 * @param origin                        origin of the append, forwarded to the local append
 * @param entriesPerPartition           records to append, keyed by target partition
 * @param responseCallback              receives the per-partition responses
 * @param delayedProduceLock            optional lock forwarded to the DelayedProduce operation
 *                                      (per the original notes it guards consumer-group operations - TODO confirm)
 * @param recordConversionStatsCallback receives the per-partition record conversion stats of the local append
 */
def appendRecords(timeout: Long,
                  requiredAcks: Short,
                  internalTopicsAllowed: Boolean,
                  origin: AppendOrigin,
                  entriesPerPartition: Map[TopicPartition, MemoryRecords],
                  responseCallback: Map[TopicPartition, PartitionResponse] => Unit,
                  delayedProduceLock: Option[Lock] = None,
                  recordConversionStatsCallback: Map[TopicPartition, RecordConversionStats] => Unit = _ => ()): Unit = {
  if (!isValidRequiredAcks(requiredAcks)) {
    // The acks value is outside the accepted range, so something is wrong with the client:
    // answer every partition with INVALID_REQUIRED_ACKS and do not touch the log at all.
    val invalidAcksResponses = entriesPerPartition.map { case (tp, _) =>
      tp -> new PartitionResponse(Errors.INVALID_REQUIRED_ACKS,
        LogAppendInfo.UnknownLogAppendInfo.firstOffset.getOrElse(-1), RecordBatch.NO_TIMESTAMP,
        LogAppendInfo.UnknownLogAppendInfo.logStartOffset)
    }
    responseCallback(invalidAcksResponses)
  } else {
    // Step 1: write the records to the local log and time the operation.
    val appendStartMs = time.milliseconds
    val localResults = appendToLocalLog(internalTopicsAllowed = internalTopicsAllowed,
      origin = origin, entriesPerPartition = entriesPerPartition, requiredAcks = requiredAcks)
    debug("Produce to local log in %d ms".format(time.milliseconds - appendStartMs))

    // Step 2: derive, for each partition, the offset that must be reached (last appended
    // offset + 1) together with the response that describes the local append.
    val statusByPartition = localResults.map { case (tp, appendResult) =>
      val appendInfo = appendResult.info
      val requiredOffset = appendInfo.lastOffset + 1
      val partitionResponse = new PartitionResponse(appendResult.error, appendInfo.firstOffset.getOrElse(-1),
        appendInfo.logAppendTime, appendInfo.logStartOffset, appendInfo.recordErrors.asJava,
        appendInfo.errorMessage)
      tp -> ProducePartitionStatus(requiredOffset, partitionResponse)
    }

    // Step 3: publish the record conversion statistics gathered during the append.
    recordConversionStatsCallback(localResults.map { case (tp, appendResult) =>
      tp -> appendResult.info.recordConversionStats
    })

    // Step 4: either defer the response through the purgatory or answer right away.
    if (delayedProduceRequestRequired(requiredAcks, entriesPerPartition, localResults)) {
      val delayedProduce = new DelayedProduce(timeout, ProduceMetadata(requiredAcks, statusByPartition),
        this, responseCallback, delayedProduceLock)
      // The (topic, partition) pairs of the request are the keys the delayed operation is watched under.
      val watchKeys = entriesPerPartition.keys.map(TopicPartitionOperationKey(_)).toSeq
      // Try to complete immediately: while the operation was being created new requests may
      // have arrived and made it completable. Otherwise it is parked in the purgatory.
      delayedProducePurgatory.tryCompleteElseWatch(delayedProduce, watchKeys)
    } else {
      // Nothing to wait for: hand the local results straight to the callback.
      responseCallback(statusByPartition.map { case (tp, status) => tp -> status.responseStatus })
    }
  }
}
appendToLocalLog方法
appendRecords 内部真正执行消息写入的方法是 appendToLocalLog。
/**
 * Appends each partition's records to the local leader log and returns one
 * LogAppendResult per partition. Failures are captured inside the result
 * (as its exception) instead of being propagated to the caller.
 *
 * @param internalTopicsAllowed whether appends to internal topics are permitted
 * @param origin                origin of the append, forwarded to the partition
 * @param entriesPerPartition   records to append, keyed by target partition
 * @param requiredAcks          acks setting, forwarded to the partition
 * @return the append outcome for every partition in `entriesPerPartition`
 */
private def appendToLocalLog(internalTopicsAllowed: Boolean,
                             origin: AppendOrigin,
                             entriesPerPartition: Map[TopicPartition, MemoryRecords],
                             requiredAcks: Short): Map[TopicPartition, LogAppendResult] = {

  // Bookkeeping for failures that mark the failed-produce meters: bumps the per-topic and
  // all-topics meters, logs the error and returns the partition's log start offset
  // (-1 when the partition is not hosted online).
  def processFailedRecord(topicPartition: TopicPartition, t: Throwable) = {
    val logStartOffset = getPartition(topicPartition) match {
      case HostedPartition.Online(partition) => partition.logStartOffset
      case HostedPartition.None | HostedPartition.Offline => -1L
    }
    brokerTopicStats.topicStats(topicPartition.topic).failedProduceRequestRate.mark()
    brokerTopicStats.allTopicsStats.failedProduceRequestRate.mark()
    error(s"Error processing append operation on partition $topicPartition", t)
    logStartOffset
  }

  trace(s"Append [$entriesPerPartition] to local log")

  entriesPerPartition.map { case (topicPartition, records) =>
    // Every attempted partition counts towards the total produce request rate.
    brokerTopicStats.topicStats(topicPartition.topic).totalProduceRequestRate.mark()
    brokerTopicStats.allTopicsStats.totalProduceRequestRate.mark()

    // Appends to internal topics are refused unless the caller explicitly allows them.
    val internalTopicRejected = Topic.isInternal(topicPartition.topic) && !internalTopicsAllowed

    val appendResult: LogAppendResult =
      if (internalTopicRejected) {
        LogAppendResult(
          LogAppendInfo.UnknownLogAppendInfo,
          Some(new InvalidTopicException(s"Cannot append to internal topic ${topicPartition.topic}")))
      } else {
        try {
          // Look up the partition (it must be led by this broker) and append to it.
          val partition = getPartitionOrException(topicPartition, expectLeader = true)
          val info = partition.appendRecordsToLeader(records, origin, requiredAcks)
          val appendedCount = info.numMessages

          // Account for the successfully appended bytes and messages.
          brokerTopicStats.topicStats(topicPartition.topic).bytesInRate.mark(records.sizeInBytes)
          brokerTopicStats.allTopicsStats.bytesInRate.mark(records.sizeInBytes)
          brokerTopicStats.topicStats(topicPartition.topic).messagesInRate.mark(appendedCount)
          brokerTopicStats.allTopicsStats.messagesInRate.mark(appendedCount)

          trace(s"${records.sizeInBytes} written to log $topicPartition beginning at offset " +
            s"${info.firstOffset.getOrElse(-1)} and ending at offset ${info.lastOffset}")

          LogAppendResult(info)
        } catch {
          // Known exceptions: the failed-produce meters are NOT marked for these, because
          // that metric is meant to indicate unexpected broker failures while handling a
          // produce request.
          case e @ (_: UnknownTopicOrPartitionException |
                    _: NotLeaderForPartitionException |
                    _: RecordTooLargeException |
                    _: RecordBatchTooLargeException |
                    _: CorruptRecordException |
                    _: KafkaStorageException) =>
            LogAppendResult(LogAppendInfo.UnknownLogAppendInfo, Some(e))

          // Validation failure: report the underlying exception plus the per-record errors.
          case validationFailure: RecordValidationException =>
            val failedLogStartOffset = processFailedRecord(topicPartition, validationFailure.invalidException)
            val perRecordErrors = validationFailure.recordErrors
            LogAppendResult(
              LogAppendInfo.unknownLogAppendInfoWithAdditionalInfo(
                failedLogStartOffset, perRecordErrors, validationFailure.invalidException.getMessage),
              Some(validationFailure.invalidException))

          // Anything else is captured in the result as well, so that one partition's
          // failure does not abort the remaining partitions of the request.
          case t: Throwable =>
            val failedLogStartOffset = processFailedRecord(topicPartition, t)
            LogAppendResult(
              LogAppendInfo.unknownLogAppendInfoWithLogStartOffset(failedLogStartOffset), Some(t))
        }
      }

    topicPartition -> appendResult
  }
}
其中partition.appendRecordsToLeader如下:
/**
 * Appends `records` to this partition's leader log and returns the resulting LogAppendInfo.
 *
 * The append and the high-watermark update run while holding the read lock on
 * `leaderIsrUpdateLock`; the completion of delayed operations happens after the
 * locked section has finished.
 *
 * @param records      the records to append
 * @param origin       origin of the append, forwarded to the log
 * @param requiredAcks acks setting; when it is -1 the ISR size is checked against min.isr first
 * @return the append info returned by the leader log
 * @throws NotEnoughReplicasException when requiredAcks is -1 and the ISR is smaller than min.isr
 * @throws NotLeaderForPartitionException when no leader log is local (`leaderLogIfLocal` is None)
 */
def appendRecordsToLeader(records: MemoryRecords, origin: AppendOrigin, requiredAcks: Int): LogAppendInfo = {
val (info, leaderHWIncremented) = inReadLock(leaderIsrUpdateLock) {
leaderLogIfLocal match {
// Only the leader replica accepts appends: a local leader log must be present
case Some(leaderLog) =>
val minIsr = leaderLog.config.minInSyncReplicas
val inSyncSize = inSyncReplicaIds.size
// Avoid writing to leader if there are not enough insync replicas to make it safe
if (inSyncSize < minIsr && requiredAcks == -1) {
throw new NotEnoughReplicasException(s"The size of the current ISR $inSyncReplicaIds " +
s"is insufficient to satisfy the min.isr requirement of $minIsr for partition $topicPartition")
}
// Append the records to the leader replica's log
val info = leaderLog.appendAsLeader(records, leaderEpoch = this.leaderEpoch, origin,
interBrokerProtocolVersion)
// we may need to increment high watermark since ISR could be down to 1
// Try to advance the leader's high watermark; the Boolean records whether it moved
(info, maybeIncrementLeaderHW(leaderLog))
case None =>
throw new NotLeaderForPartitionException("Leader not local for partition %s on broker %d"
.format(topicPartition, localBrokerId))
}
}
// some delayed operations may be unblocked after HW changed
if (leaderHWIncremented)
// The high watermark advanced: try to complete the delayed requests of this partition
tryCompleteDelayedRequests()
else {
// probably unblock some follower fetch requests since log end offset has been updated
delayedOperations.checkAndCompleteFetch()
}
info
}
其中调用的 leaderLog.appendAsLeader 就是 Log.scala 中定义的 appendAsLeader 方法。
delayedProduceRequestRequired 方法
它用于判断消息集合被写入到日志之后,是否需要等待其他副本也写入成功。
/**
 * Tells whether the response to a produce request has to be deferred after the local append.
 *
 * Returns true only when all of the following hold:
 *  1. `requiredAcks` is -1;
 *  2. the request carries data for at least one partition;
 *  3. fewer partitions failed locally than were requested, i.e. at least one
 *     local append succeeded.
 *
 * If every partition failed locally, waiting is pointless and the errors can be returned
 * right away; a partial failure still leaves successfully appended partitions to wait for.
 */
private def delayedProduceRequestRequired(requiredAcks: Short,
                                          entriesPerPartition: Map[TopicPartition, MemoryRecords],
                                          localProduceResults: Map[TopicPartition, LogAppendResult]): Boolean = {
  if (requiredAcks != -1 || entriesPerPartition.isEmpty) {
    false
  } else {
    // Count the partitions whose local append ended with an exception.
    val failedAppends = localProduceResults.values.count(_.exception.isDefined)
    failedAppends < entriesPerPartition.size
  }
}