Kafka 副本管理模块(四):副本写入 appendRecords 方法

        副本写入,是指向副本底层日志写入消息。在 ReplicaManager 类中,实现副本写入的方法叫 appendRecords。放眼整个 Kafka 源码世界,需要副本写入的场景有 4 个。

  • 生产者向 Leader 副本写入消息
  • Follower 副本拉取消息后写入副本
  • 消费者组写入组信息
  • 事务管理器写入事务信息(包括事务标记、事务元数据等)

下面直接看ReplicaManager 类中实现副本写入的方法 appendRecords。

  /**
   * Appends the given records to the local log and answers through `responseCallback`,
   * either right away or once a delayed produce operation completes.
   *
   * Flow:
   *  1. If `requiredAcks` fails `isValidRequiredAcks`, every partition is answered with
   *     `INVALID_REQUIRED_ACKS` and nothing is written.
   *  2. Otherwise the records are written via `appendToLocalLog`.
   *  3. If `delayedProduceRequestRequired` says so, a `DelayedProduce` is created and handed
   *     to the purgatory; otherwise the callback is invoked immediately.
   *
   * @param timeout                       request timeout handed to the `DelayedProduce`
   * @param requiredAcks                  the acks setting of the request
   * @param internalTopicsAllowed         whether writing to internal topics is permitted
   * @param origin                        origin of the append, forwarded to `appendToLocalLog`
   * @param entriesPerPartition           records to append, keyed by partition
   * @param responseCallback              invoked with the per-partition responses
   * @param delayedProduceLock            optional lock handed to the `DelayedProduce`; per the
   *                                      original author's note it is only needed for
   *                                      consumer-group writes (not verifiable from this method)
   * @param recordConversionStatsCallback invoked with the per-partition record conversion stats
   *                                      after the local append
   */
  def appendRecords(timeout: Long,
                    requiredAcks: Short,
                    internalTopicsAllowed: Boolean,
                    origin: AppendOrigin,
                    entriesPerPartition: Map[TopicPartition, MemoryRecords],
                    responseCallback: Map[TopicPartition, PartitionResponse] => Unit,
                    delayedProduceLock: Option[Lock] = None,
                    recordConversionStatsCallback: Map[TopicPartition, RecordConversionStats] => Unit = _ => ()
                   ): Unit = {
    if (!isValidRequiredAcks(requiredAcks)) {
      // If required.acks is outside accepted range, something is wrong with the client.
      // Just return an error for every partition and don't handle the request at all.
      val rejectedResponses = entriesPerPartition.map { case (tp, _) =>
        tp -> new PartitionResponse(
          Errors.INVALID_REQUIRED_ACKS,
          LogAppendInfo.UnknownLogAppendInfo.firstOffset.getOrElse(-1),
          RecordBatch.NO_TIMESTAMP,
          LogAppendInfo.UnknownLogAppendInfo.logStartOffset)
      }
      responseCallback(rejectedResponses)
    } else {
      // Write to the local log and record how long it took.
      val appendStartMs = time.milliseconds
      val localResults = appendToLocalLog(internalTopicsAllowed = internalTopicsAllowed,
        origin, entriesPerPartition, requiredAcks)
      debug("Produce to local log in %d ms".format(time.milliseconds - appendStartMs))

      // Per partition: the offset that must be reached (last appended offset + 1) together
      // with the response describing the outcome of the local append.
      val statusByPartition = localResults.map { case (tp, appendResult) =>
        val appendInfo = appendResult.info
        val requiredOffset = appendInfo.lastOffset + 1
        val response = new PartitionResponse(
          appendResult.error,
          appendInfo.firstOffset.getOrElse(-1),
          appendInfo.logAppendTime,
          appendInfo.logStartOffset,
          appendInfo.recordErrors.asJava,
          appendInfo.errorMessage)
        tp -> ProducePartitionStatus(requiredOffset, response)
      }

      // Report record conversion statistics for every partition.
      recordConversionStatsCallback(localResults.map { case (tp, appendResult) =>
        tp -> appendResult.info.recordConversionStats
      })

      val mustWaitForReplication =
        delayedProduceRequestRequired(requiredAcks, entriesPerPartition, localResults)

      if (mustWaitForReplication) {
        // Create the delayed produce operation.
        val delayedProduce = new DelayedProduce(
          timeout,
          ProduceMetadata(requiredAcks, statusByPartition),
          this,
          responseCallback,
          delayedProduceLock)

        // (topic, partition) keys under which the delayed operation is watched.
        val watchKeys = entriesPerPartition.keys.map(TopicPartitionOperationKey(_)).toSeq

        // Try to complete the request immediately, otherwise put it into the purgatory:
        // while the delayed operation was being created, new requests may have arrived
        // and made it completable already.
        delayedProducePurgatory.tryCompleteElseWatch(delayedProduce, watchKeys)
      } else {
        // No need to wait: respond immediately with the local append outcome.
        responseCallback(statusByPartition.map { case (tp, status) => tp -> status.responseStatus })
      }
    }
  }

appendToLocalLog方法

        appendRecords 通过调用 appendToLocalLog 方法,将消息集合写入本地日志。

  /**
   * Appends each partition's records to the local log and returns one `LogAppendResult`
   * per partition. Failures are captured in the result rather than thrown.
   *
   * Exceptions caught during the append are split into three groups:
   *  - a fixed list of known exception types: returned as-is, without marking the
   *    failed-produce metrics and without an error log;
   *  - `RecordValidationException`: handled by `processFailedRecord`, and the per-record
   *    errors are attached to the result;
   *  - any other `Throwable`: handled by `processFailedRecord`.
   *
   * @param internalTopicsAllowed whether appending to internal topics is permitted
   * @param origin                origin of the append, forwarded to `appendRecordsToLeader`
   * @param entriesPerPartition   records to append, keyed by partition
   * @param requiredAcks          acks setting, forwarded to `appendRecordsToLeader`
   * @return the append result (or captured exception) for every partition in `entriesPerPartition`
   */
  private def appendToLocalLog(internalTopicsAllowed: Boolean,
                               origin: AppendOrigin,
                               entriesPerPartition: Map[TopicPartition, MemoryRecords],// records to append, keyed by partition
                               requiredAcks: Short): Map[TopicPartition, LogAppendResult] = {

    // Bookkeeping for an unexpected failure: marks the per-topic and all-topics
    // failed-produce metrics, logs the error, and returns the partition's log start
    // offset (-1 when the partition is not hosted online on this broker).
    def processFailedRecord(topicPartition: TopicPartition, t: Throwable) = {
      val logStartOffset = getPartition(topicPartition) match {
        case HostedPartition.Online(partition) => partition.logStartOffset
        case HostedPartition.None | HostedPartition.Offline => -1L
      }
      brokerTopicStats.topicStats(topicPartition.topic).failedProduceRequestRate.mark()
      brokerTopicStats.allTopicsStats.failedProduceRequestRate.mark()
      error(s"Error processing append operation on partition $topicPartition", t)

      logStartOffset
    }

    trace(s"Append [$entriesPerPartition] to local log")
    // Process every partition together with the records to append to it
    entriesPerPartition.map { case (topicPartition, records) =>
      brokerTopicStats.topicStats(topicPartition.topic).totalProduceRequestRate.mark()
      brokerTopicStats.allTopicsStats.totalProduceRequestRate.mark()

      // reject appending to internal topics if it is not allowed
      if (Topic.isInternal(topicPartition.topic) && !internalTopicsAllowed) {
        (topicPartition, LogAppendResult(
          LogAppendInfo.UnknownLogAppendInfo,
          Some(new InvalidTopicException(s"Cannot append to internal topic ${topicPartition.topic}"))))
      } else {
        try {
          // Look up the partition object (called with expectLeader = true)
          val partition = getPartitionOrException(topicPartition, expectLeader = true)
          // Append the records to that partition
          val info = partition.appendRecordsToLeader(records, origin, requiredAcks)
          val numAppendedMessages = info.numMessages

          // update stats for successfully appended bytes and messages as bytesInRate and messageInRate
          brokerTopicStats.topicStats(topicPartition.topic).bytesInRate.mark(records.sizeInBytes)
          brokerTopicStats.allTopicsStats.bytesInRate.mark(records.sizeInBytes)
          brokerTopicStats.topicStats(topicPartition.topic).messagesInRate.mark(numAppendedMessages)
          brokerTopicStats.allTopicsStats.messagesInRate.mark(numAppendedMessages)

          trace(s"${records.sizeInBytes} written to log $topicPartition beginning at offset " +
            s"${info.firstOffset.getOrElse(-1)} and ending at offset ${info.lastOffset}")
          // Successful append: return the append info for this partition
          (topicPartition, LogAppendResult(info))
        } catch {
          // NOTE: Failed produce requests metric is not incremented for known exceptions
          // it is supposed to indicate un-expected failures of a broker in handling a produce request
          // Known exception types: wrapped into the result unchanged, no metric, no error log
          case e@ (_: UnknownTopicOrPartitionException |
                   _: NotLeaderForPartitionException |
                   _: RecordTooLargeException |
                   _: RecordBatchTooLargeException |
                   _: CorruptRecordException |
                   _: KafkaStorageException) =>
            (topicPartition, LogAppendResult(LogAppendInfo.UnknownLogAppendInfo, Some(e)))
          // Validation failure: record the failure using the underlying exception and
          // attach the per-record errors and its message to the result
          case rve: RecordValidationException =>
            val logStartOffset = processFailedRecord(topicPartition, rve.invalidException)
            val recordErrors = rve.recordErrors
            (topicPartition, LogAppendResult(LogAppendInfo.unknownLogAppendInfoWithAdditionalInfo(
              logStartOffset, recordErrors, rve.invalidException.getMessage), Some(rve.invalidException)))
          // Anything else is treated as an unexpected failure
          case t: Throwable =>
            val logStartOffset = processFailedRecord(topicPartition, t)
            (topicPartition, LogAppendResult(LogAppendInfo.unknownLogAppendInfoWithLogStartOffset(logStartOffset), Some(t)))
        }
      }
    }
  }

        其中partition.appendRecordsToLeader如下:

  /**
   * Appends `records` to this partition's local leader log and then tries to complete
   * delayed operations that the append may have unblocked.
   *
   * The append runs while holding the read lock on `leaderIsrUpdateLock`; the delayed
   * operations are checked after the lock has been released.
   *
   * @param records      the records to append
   * @param origin       origin of the append, forwarded to the log
   * @param requiredAcks acks setting; -1 additionally enforces the min in-sync replica check
   * @return the `LogAppendInfo` returned by the leader log's append
   * @throws NotLeaderForPartitionException if no leader log is local (`leaderLogIfLocal` is empty)
   * @throws NotEnoughReplicasException     if `requiredAcks` is -1 and the ISR is smaller than
   *                                        the log's configured `minInSyncReplicas`
   */
  def appendRecordsToLeader(records: MemoryRecords, origin: AppendOrigin, requiredAcks: Int): LogAppendInfo = {
    val (appendInfo, hwAdvanced) = inReadLock(leaderIsrUpdateLock) {
      leaderLogIfLocal match {
        case None =>
          // Only the leader replica accepts appends.
          throw new NotLeaderForPartitionException("Leader not local for partition %s on broker %d"
            .format(topicPartition, localBrokerId))

        case Some(localLeaderLog) =>
          val minIsr = localLeaderLog.config.minInSyncReplicas
          val currentIsrSize = inSyncReplicaIds.size

          // Avoid writing to leader if there are not enough insync replicas to make it safe
          if (requiredAcks == -1 && currentIsrSize < minIsr) {
            throw new NotEnoughReplicasException(s"The size of the current ISR $inSyncReplicaIds " +
              s"is insufficient to satisfy the min.isr requirement of $minIsr for partition $topicPartition")
          }

          // Append to the leader replica's log.
          val appended = localLeaderLog.appendAsLeader(records, leaderEpoch = this.leaderEpoch, origin,
            interBrokerProtocolVersion)

          // we may need to increment high watermark since ISR could be down to 1
          val hwMoved = maybeIncrementLeaderHW(localLeaderLog)
          (appended, hwMoved)
      }
    }

    if (!hwAdvanced) {
      // probably unblock some follower fetch requests since log end offset has been updated
      delayedOperations.checkAndCompleteFetch()
    } else {
      // some delayed operations may be unblocked after HW changed
      tryCompleteDelayedRequests()
    }

    appendInfo
  }

        其中调用的 leaderLog.appendAsLeader 就是 Log.scala 中定义的 appendAsLeader 方法。

delayedProduceRequestRequired 方法

        它用于判断消息集合被写入到日志之后,是否需要等待其他副本也写入成功。

  /**
   * Decides whether the response must be delayed after the local append.
   *
   * A delayed produce operation is required only when all of the following hold:
   *  - `requiredAcks` is -1;
   *  - the request contains at least one partition to write;
   *  - at least one partition was appended locally without an exception.
   *
   * If every partition failed locally there is nothing to wait for, so the caller can
   * return the errors right away.
   *
   * @param requiredAcks        the acks setting of the request
   * @param entriesPerPartition records of the request, keyed by partition
   * @param localProduceResults result of the local append, keyed by partition
   * @return true if the response has to be delayed
   */
  private def delayedProduceRequestRequired(requiredAcks: Short,
                                            entriesPerPartition: Map[TopicPartition, MemoryRecords],
                                            localProduceResults: Map[TopicPartition, LogAppendResult]): Boolean = {
    if (requiredAcks != -1 || entriesPerPartition.isEmpty) {
      false
    } else {
      // Number of partitions whose local append ended with an exception.
      val failedPartitionCount = localProduceResults.values.count(_.exception.isDefined)
      failedPartitionCount < entriesPerPartition.size
    }
  }

 

  • 0
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值