KafkaProducer Source Code Analysis

Architecture Diagram

(architecture diagram image omitted)

doSend method analysis

private Future<RecordMetadata> doSend(ProducerRecord<K, V> record, Callback callback) {
        TopicPartition tp = null;
        
            /**
            What this call does:
            fetch the cluster metadata for the topic. Before sending, the producer checks its local
            metadata cache; if the topic is missing it calls metadata.requestUpdate() to mark the
            metadata as needing a refresh, the calling application thread then blocks, and the sender
            thread issues a topic-metadata request to the brokers. The application thread is woken up
            once the brokers return the metadata.
            **/
            ClusterAndWaitTime clusterAndWaitTime = waitOnMetadata(record.topic(), record.partition(), maxBlockTimeMs);
            long remainingWaitMs = Math.max(0, maxBlockTimeMs - clusterAndWaitTime.waitedOnMetadataMs);
            Cluster cluster = clusterAndWaitTime.cluster;
            byte[] serializedKey;
            try {
                //serialize the key with the configured key serializer
                serializedKey = keySerializer.serialize(record.topic(), record.headers(), record.key());
            } catch (ClassCastException cce) {
                throw new SerializationException("Can't convert key of class " + record.key().getClass().getName() +
                        " to class " + producerConfig.getClass(ProducerConfig.KEY_SERIALIZER_CLASS_CONFIG).getName() +
                        " specified in key.serializer", cce);
            }
            byte[] serializedValue;
            try {
                //serialize the value with the configured value serializer
                serializedValue = valueSerializer.serialize(record.topic(), record.headers(), record.value());
            } catch (ClassCastException cce) {
                throw new SerializationException("Can't convert value of class " + record.value().getClass().getName() +
                        " to class " + producerConfig.getClass(ProducerConfig.VALUE_SERIALIZER_CLASS_CONFIG).getName() +
                        " specified in value.serializer", cce);
            }
            //use the partitioner to compute which partition this record is routed to
            int partition = partition(record, serializedKey, serializedValue, cluster);
            //build the TopicPartition object
            tp = new TopicPartition(record.topic(), partition);

            setReadOnly(record.headers());
            Header[] headers = record.headers().toArray();

            int serializedSize = AbstractRecords.estimateSizeInBytesUpperBound(apiVersions.maxUsableProduceMagic(),
                    compressionType, serializedKey, serializedValue, headers);
            ensureValidRecordSize(serializedSize);
            long timestamp = record.timestamp() == null ? time.milliseconds() : record.timestamp();
            log.trace("Sending record {} with callback {} to topic {} partition {}", record, callback, record.topic(), partition);
            // producer callback will make sure to call both 'callback' and interceptor callback
            Callback interceptCallback = new InterceptorCallback<>(callback, this.interceptors, tp);

            if (transactionManager != null && transactionManager.isTransactional())
                transactionManager.maybeAddPartitionToTransaction(tp);
            
            //append the record to the record accumulator; the actual network send is handled by the sender thread
            RecordAccumulator.RecordAppendResult result = accumulator.append(tp, timestamp, serializedKey,
                    serializedValue, headers, interceptCallback, remainingWaitMs);
            
            //if the batch for this partition is full, or a new batch was just created, wake up the Sender thread so it can push the accumulated records to the brokers
            if (result.batchIsFull || result.newBatchCreated) {
                log.trace("Waking up the sender since topic {} partition {} is either full or getting a new batch", record.topic(), partition);
                this.sender.wakeup();
            }
            return result.future;
       
    }
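
To make the partition() step above concrete, here is a minimal, hedged sketch of key-hash routing: explicit partition wins, otherwise the serialized key is hashed modulo the partition count, and keyless records fall back to a random choice. The class and method names are illustrative, and the real DefaultPartitioner uses murmur2 rather than the plain array hash shown here.

import java.util.concurrent.ThreadLocalRandom;

// Hypothetical helper illustrating key-hash routing; not the actual DefaultPartitioner source.
public class PartitionSketch {
    static int choosePartition(Integer explicitPartition, byte[] serializedKey, int partitionCount) {
        if (explicitPartition != null)
            return explicitPartition;                                  // caller pinned the partition
        if (serializedKey == null)
            return ThreadLocalRandom.current().nextInt(partitionCount); // no key: spread randomly
        // Kafka hashes with murmur2 here; Arrays.hashCode is used purely for illustration.
        int hash = java.util.Arrays.hashCode(serializedKey);
        return (hash & 0x7fffffff) % partitionCount;
    }

    public static void main(String[] args) {
        System.out.println(choosePartition(null, "order-42".getBytes(), 6));
    }
}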

waitOnMetadata method analysis

private ClusterAndWaitTime waitOnMetadata(String topic, Integer partition, long maxWaitMs) throws InterruptedException {
        //add the topic to the metadata; if it was not known before, metadata.needUpdate is set to true
        metadata.add(topic);
        //get the currently cached Cluster object
        Cluster cluster = metadata.fetch();
        //look up the number of partitions for the topic
        Integer partitionsCount = cluster.partitionCountForTopic(topic);
        // Return cached metadata if we have it, and if the record's partition is either undefined
        // or within the known partition range
        //if the partition count is known, and the requested partition is either unspecified or within range, return the cached cluster info immediately
        if (partitionsCount != null && (partition == null || partition < partitionsCount))
            return new ClusterAndWaitTime(cluster, 0);

        long begin = time.milliseconds();
        long remainingWaitMs = maxWaitMs;
        long elapsed;
        
        do {
            log.trace("Requesting metadata update for topic {}.", topic);
            //add the topic again; if it is missing, metadata.needUpdate is set to true
            metadata.add(topic);
            //set metadata.needUpdate to true and capture the current metadata version
            int version = metadata.requestUpdate();
            //wake up the sender thread
            sender.wakeup();
            try {
                /**
                Block the current thread until the sender thread has sent a topic-metadata request
                to the brokers and the response has bumped the metadata version, which wakes this
                thread up again.
                **/
                metadata.awaitUpdate(version, remainingWaitMs);
            } catch (TimeoutException ex) {
                // Rethrow with original maxWaitMs to prevent logging exception with remainingWaitMs
                throw new TimeoutException("Failed to update metadata after " + maxWaitMs + " ms.");
            }
            //fetch the refreshed cluster info
            cluster = metadata.fetch();
            elapsed = time.milliseconds() - begin;
            if (elapsed >= maxWaitMs)
                throw new TimeoutException("Failed to update metadata after " + maxWaitMs + " ms.");
            if (cluster.unauthorizedTopics().contains(topic))
                throw new TopicAuthorizationException(topic);
            remainingWaitMs = maxWaitMs - elapsed;
            partitionsCount = cluster.partitionCountForTopic(topic);
        } while (partitionsCount == null);

        if (partition != null && partition >= partitionsCount) {
            throw new KafkaException(
                    String.format("Invalid partition given with record: %d is not in the range [0...%d).", partition, partitionsCount));
        }
        //at this point the topic metadata is available, so return the cluster info together with the elapsed wait time
        return new ClusterAndWaitTime(cluster, elapsed);
    }
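
The blocking hand-off between the application thread and the sender thread hinges on a monotonically increasing metadata version. Below is a minimal sketch of that version-based wait/notify pattern; the class and field names are invented for illustration and are not Metadata's actual internals.

// Minimal sketch of a version-gated wait, assuming a notifier thread bumps the version.
public class VersionedMetadataSketch {
    private int version = 0;

    // Called by the application thread: wait until the version moves past lastSeen or we time out.
    public synchronized void awaitUpdate(int lastSeen, long maxWaitMs) throws InterruptedException {
        long deadline = System.currentTimeMillis() + maxWaitMs;
        while (version <= lastSeen) {
            long remaining = deadline - System.currentTimeMillis();
            if (remaining <= 0)
                throw new RuntimeException("Failed to update metadata after " + maxWaitMs + " ms.");
            wait(remaining);
        }
    }

    // Called by the sender thread once a metadata response arrives.
    public synchronized void update() {
        version++;
        notifyAll();
    }
}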

RecordAccumulator.append method analysis

public RecordAppendResult append(TopicPartition tp,
                                     long timestamp,
                                     byte[] key,
                                     byte[] value,
                                     Header[] headers,
                                     Callback callback,
                                     long maxTimeToBlock) throws InterruptedException {
        // We keep track of the number of appending thread to make sure we do not miss batches in
        // abortIncompleteBatches().
        //increment the count of threads currently appending to the accumulator
        appendsInProgress.incrementAndGet();
        ByteBuffer buffer = null;
        if (headers == null) headers = Record.EMPTY_HEADERS;
        try {
            // check if we have an in-progress batch
            //get the batch deque for this topic partition, creating it if it does not exist yet
            Deque<ProducerBatch> dq = getOrCreateDeque(tp);
            //synchronize on the deque so appends to the same partition are serialized
            synchronized (dq) {
                if (closed)
                    throw new IllegalStateException("Cannot send after the producer is closed.");
                /**
                    Try to take the last ProducerBatch from the tail of the deque and append the
                    record to it. If the deque is empty, or the batch does not have enough room
                    left for this record, tryAppend returns null.
                **/
                RecordAppendResult appendResult = tryAppend(timestamp, key, value, headers, callback, dq);
                //a non-null result means the record was appended to an existing batch, so return it directly
                if (appendResult != null)
                    return appendResult;
            }
            
            //the append failed (no usable batch in the deque), so a new ProducerBatch will be created below
            // we don't have an in-progress record batch try to allocate a new batch
            byte maxUsableMagic = apiVersions.maxUsableProduceMagic();
            //size of the new batch: the configured batch.size (default 16384), or the estimated record size if the record is larger than that
            int size = Math.max(this.batchSize, AbstractRecords.estimateSizeInBytesUpperBound(maxUsableMagic, compression, key, value, headers));
            log.trace("Allocating a new {} byte message buffer for topic {} partition {}", size, tp.topic(), tp.partition());
            //allocate a ByteBuffer of that size from the buffer pool; if the pool is exhausted this blocks for up to maxTimeToBlock and then throws if memory is still unavailable
            buffer = free.allocate(size, maxTimeToBlock);
            synchronized (dq) {
                // Need to check if producer is closed again after grabbing the dequeue lock.
                if (closed)
                    throw new IllegalStateException("Cannot send after the producer is closed.");
    
                //try appending again inside the lock, in case another thread created a batch for this partition while we were allocating
                RecordAppendResult appendResult = tryAppend(timestamp, key, value, headers, callback, dq);
                if (appendResult != null) {
                    // Somebody else found us a batch, return the one we waited for! Hopefully this doesn't happen often...
                    return appendResult;
                }

                
                /**
                Build the MemoryRecordsBuilder that the records are ultimately written into. Note
                that record data starts at byte offset 61 of the ByteBuffer: the first 61 bytes are
                reserved for the RecordBatch header fields (records count, producer id, and so on),
                and the builder calls ByteBuffer.position(61) internally so records never overwrite them.
                **/
                MemoryRecordsBuilder recordsBuilder = recordsBuilder(buffer, maxUsableMagic);
                //create the ProducerBatch
                ProducerBatch batch = new ProducerBatch(tp, recordsBuilder, time.milliseconds());
                //append the record via batch.tryAppend
                FutureRecordMetadata future = Utils.notNull(batch.tryAppend(timestamp, key, value, headers, callback, time.milliseconds()));
                //add the new batch to the tail of the deque
                dq.addLast(batch);
                incomplete.add(batch);
                
                //the batch now owns the buffer, so clear the local reference to keep the finally block from returning it to the pool
                // Don't deallocate this buffer in the finally block as it's being used in the record batch
                buffer = null;

                return new RecordAppendResult(future, dq.size() > 1 || batch.isFull(), true);
            }
        } finally {
            //if the buffer was never handed to a batch, return it to the buffer pool
            if (buffer != null)
                free.deallocate(buffer);
            appendsInProgress.decrementAndGet();
        }
    }
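
The core pattern here is: try to append under the deque lock, allocate a buffer outside the lock, then re-check under the lock before creating a new batch. A stripped-down sketch of that flow is shown below; the types and the 16-byte "batch size" are invented for illustration, not the real RecordAccumulator.

import java.util.ArrayDeque;
import java.util.Deque;

// Hypothetical miniature of the append flow: per-partition deque + double-checked batch creation.
public class AccumulatorSketch {
    static class Batch {
        final StringBuilder data = new StringBuilder();
        boolean tryAppend(String record) {
            if (data.length() + record.length() > 16) return false; // pretend 16 bytes is batch.size
            data.append(record);
            return true;
        }
    }

    private final Deque<Batch> deque = new ArrayDeque<>();

    public void append(String record) {
        synchronized (deque) {                       // first attempt under the lock
            Batch last = deque.peekLast();
            if (last != null && last.tryAppend(record)) return;
        }
        Batch fresh = new Batch();                   // "allocate" outside the lock
        synchronized (deque) {                       // re-check: another thread may have added a batch
            Batch last = deque.peekLast();
            if (last != null && last.tryAppend(record)) return;
            fresh.tryAppend(record);
            deque.addLast(fresh);
        }
    }
}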
    


batch.tryAppend analysis


public FutureRecordMetadata tryAppend(long timestamp, byte[] key, byte[] value, Header[] headers, Callback callback, long now) {
        //check whether the batch still has room for this record; if not, return null and let the caller (shown above) create a new batch
        if (!recordsBuilder.hasRoomFor(timestamp, key, value, headers)) {
            return null;
        } else {
            //append the record into the ByteBuffer held by recordsBuilder
            Long checksum = this.recordsBuilder.append(timestamp, key, value, headers);
            //track the size of the largest record appended so far
            this.maxRecordSize = Math.max(this.maxRecordSize, AbstractRecords.estimateSizeInBytesUpperBound(magic(),
                    recordsBuilder.compressionType(), key, value, headers));
            this.lastAppendTime = now;
            //create the Future that lets the user thread block waiting for the send result
            FutureRecordMetadata future = new FutureRecordMetadata(this.produceFuture, this.recordCount,
                                                                   timestamp, checksum,
                                                                   key == null ? -1 : key.length,
                                                                   value == null ? -1 : value.length);
            // we have to keep every future returned to the users in case the batch needs to be
            // split to several new batches and resent.
            thunks.add(new Thunk(callback, future));
            this.recordCount++;
            return future;
        }
    }


recordsBuilder.append analysis

This call eventually ends up in MemoryRecordsBuilder.appendDefaultRecord.


private void appendDefaultRecord(long offset, long timestamp, ByteBuffer key, ByteBuffer value,
                                     Header[] headers) throws IOException {
        //make sure the builder is still open for appends
        ensureOpenForRecordAppend();
        //offset of this record relative to the batch's base offset
        int offsetDelta = (int) (offset - baseOffset);
        //timestamp of this record relative to the first record written into the batch
        long timestampDelta = timestamp - firstTimestamp;
        //write the record into the underlying append stream / ByteBuffer
        int sizeInBytes = DefaultRecord.writeTo(appendStream, offsetDelta, timestampDelta, key, value, headers);
        //update the ProducerBatch's record count, last offset, max timestamp, and appended byte size
        recordWritten(offset, timestamp, sizeInBytes);
}
    



DefaultRecord.writeTo analysis

The record format is shown below; the fields are written in the following order.


(record format v2 diagram omitted)


Field meanings:

length: total length of the record.

attributes: deprecated, but it still occupies 1 byte in the format, reserved for future extensions.

timestamp delta: timestamp increment. A full timestamp normally takes 8 bytes; storing the difference
from the RecordBatch's base timestamp, as done here, saves space.

offset delta: offset increment. Stores the difference from the RecordBatch's base offset, again saving bytes.

headers: this field supports application-level extensions without having to embed application-level
attributes into the message body, as the v0 and v1 formats had to. A Header, shown in the rightmost
part of the figure, consists of a key and a value, and a single Record can carry zero or more Headers.
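
The deltas above only save space because they are written as variable-length integers. Below is a minimal sketch of zigzag varint encoding, which, to the best of my understanding, is what ByteUtils.writeVarint implements; the class name, the main method, and the example value are illustrative only.

import java.io.ByteArrayOutputStream;
import java.io.DataOutputStream;
import java.io.IOException;

// Illustrative zigzag varint encoder (the real code lives in org.apache.kafka.common.utils.ByteUtils).
public class VarintSketch {
    // ZigZag-map a signed int to unsigned so small negative deltas also stay short,
    // then emit 7 bits per byte with the high bit meaning "more bytes follow".
    static void writeVarint(int value, DataOutputStream out) throws IOException {
        int v = (value << 1) ^ (value >> 31); // zigzag encoding
        while ((v & 0xffffff80) != 0) {
            out.writeByte((v & 0x7f) | 0x80);
            v >>>= 7;
        }
        out.writeByte(v);
    }

    public static void main(String[] args) throws IOException {
        ByteArrayOutputStream bos = new ByteArrayOutputStream();
        writeVarint(300, new DataOutputStream(bos)); // e.g. an offset delta of 300
        System.out.println("encoded length = " + bos.size() + " bytes"); // 2 bytes instead of 4
    }
}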


public static int writeTo(DataOutputStream out,
                              int offsetDelta,
                              long timestampDelta,
                              ByteBuffer key,
                              ByteBuffer value,
                              Header[] headers) throws IOException {
        //compute the record body size
        int sizeInBytes = sizeOfBodyInBytes(offsetDelta, timestampDelta, key, value, headers);
        //write the record length
        ByteUtils.writeVarint(sizeInBytes, out);
        //write the attributes byte
        byte attributes = 0; // there are no used record attributes at the moment
        out.write(attributes);

        //write the timestamp delta
        ByteUtils.writeVarlong(timestampDelta, out);
        //write the offset delta
        ByteUtils.writeVarint(offsetDelta, out);
        
        //write the key length and key bytes (-1 for a null key)
        if (key == null) {
            ByteUtils.writeVarint(-1, out);
        } else {
            int keySize = key.remaining();
            ByteUtils.writeVarint(keySize, out);
            Utils.writeTo(out, key, keySize);
        }

        //write the value length and value bytes (-1 for a null value)
        if (value == null) {
            ByteUtils.writeVarint(-1, out);
        } else {
            int valueSize = value.remaining();
            ByteUtils.writeVarint(valueSize, out);
            Utils.writeTo(out, value, valueSize);
        }
        
        if (headers == null)
            throw new IllegalArgumentException("Headers cannot be null");
        //write the header count
        ByteUtils.writeVarint(headers.length, out);

        //write each header in turn
        for (Header header : headers) {
            String headerKey = header.key();
            if (headerKey == null)
                throw new IllegalArgumentException("Invalid null header key found in headers");

            byte[] utf8Bytes = Utils.utf8(headerKey);
            ByteUtils.writeVarint(utf8Bytes.length, out);
            out.write(utf8Bytes);

            byte[] headerValue = header.value();
            if (headerValue == null) {
                ByteUtils.writeVarint(-1, out);
            } else {
                ByteUtils.writeVarint(headerValue.length, out);
                out.write(headerValue);
            }
        }

        return ByteUtils.sizeOfVarint(sizeInBytes) + sizeInBytes;
    }
    

At this point the user thread is done: the record sits in the record accumulator. The remaining work of sending it over the network to the brokers is handled by a separate Sender thread.

Entry point of the Sender's network I/O


void run(long now) {
        if (transactionManager != null) {
            try {
                if (transactionManager.shouldResetProducerStateAfterResolvingSequences())
                    // Check if the previous run expired batches which requires a reset of the producer state.
                    transactionManager.resetProducerId();

                if (!transactionManager.isTransactional()) {
                    // this is an idempotent producer, so make sure we have a producer id
                    maybeWaitForProducerId();
                } else if (transactionManager.hasUnresolvedSequences() && !transactionManager.hasFatalError()) {
                    transactionManager.transitionToFatalError(new KafkaException("The client hasn't received acknowledgment for " +
                            "some previously sent messages and can no longer retry them. It isn't safe to continue."));
                } else if (transactionManager.hasInFlightTransactionalRequest() || maybeSendTransactionalRequest(now)) {
                    // as long as there are outstanding transactional requests, we simply wait for them to return
                    client.poll(retryBackoffMs, now);
                    return;
                }

                // do not continue sending if the transaction manager is in a failed state or if there
                // is no producer id (for the idempotent case).
                if (transactionManager.hasFatalError() || !transactionManager.hasProducerId()) {
                    RuntimeException lastError = transactionManager.lastError();
                    if (lastError != null)
                        maybeAbortBatches(lastError);
                    client.poll(retryBackoffMs, now);
                    return;
                } else if (transactionManager.hasAbortableError()) {
                    accumulator.abortUndrainedBatches(transactionManager.lastError());
                }
            } catch (AuthenticationException e) {
                // This is already logged as error, but propagated here to perform any clean ups.
                log.trace("Authentication exception while processing transactional request: {}", e);
                transactionManager.authenticationFailed(e);
            }
        }
        
        //the key call: this builds the produce requests to send
        long pollTimeout = sendProducerData(now);
        //this performs the actual network I/O, pushing the requests built above to the remote brokers
        client.poll(pollTimeout, now);
    }
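
For context, the run(long now) method above is driven repeatedly from the Sender thread's main loop, roughly as in the sketch below. This is a simplified, hedged outline rather than the exact source; in particular the real Sender keeps polling on shutdown until in-flight and accumulated records are drained.

// Minimal standalone sketch of a Sender-style loop: one thread repeatedly builds
// requests and polls the network until asked to close (names are illustrative).
public class SenderLoopSketch implements Runnable {
    private volatile boolean running = true;

    @Override
    public void run() {
        while (running) {
            try {
                runOnce(System.currentTimeMillis()); // analogous to Sender.run(long now)
            } catch (Exception e) {
                e.printStackTrace();                 // the real Sender logs and keeps going
            }
        }
    }

    private void runOnce(long now) {
        // build produce requests from the accumulator, then poll the network client
    }

    public void close() {
        running = false;
    }
}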
    
    
    
    
    
    
    

sendProducerData method analysis


private long sendProducerData(long now) {
        Cluster cluster = metadata.fetch();

        
        //determine the set of broker nodes (partition leaders) that currently have batches ready to send
        RecordAccumulator.ReadyCheckResult result = this.accumulator.ready(cluster, now);

        //if some topics have no known leader, request a metadata update as compensation
        if (!result.unknownLeaderTopics.isEmpty()) {
            // The set of topics with unknown leader contains topics with leader election pending as well as
            // topics which may have expired. Add the topic again to metadata to ensure it is included
            // and request metadata update, since there are messages to send to the topic.
            for (String topic : result.unknownLeaderTopics)
                this.metadata.add(topic);
            this.metadata.requestUpdate();
        }

        // iterate over the list of ready broker nodes
        Iterator<Node> iter = result.readyNodes.iterator();
        long notReadyTimeout = Long.MAX_VALUE;
        while (iter.hasNext()) {
            Node node = iter.next();
            //double-check that the network connection to the node is actually ready; drop it if not
            if (!this.client.ready(node, now)) {
                iter.remove();
                notReadyTimeout = Math.min(notReadyTimeout, this.client.connectionDelay(node, now));
            }
        }
        
      
        /**
        ProducerBatches are grouped by partition, but the Sender performs network I/O per broker
        node, so the batches have to be regrouped by the leader node of each partition. Only the
        first ProducerBatch of each partition's deque is taken per drain, rather than sending all
        of a partition's batches at once. The result can be read as
        Map<brokerNodeId, List<first ProducerBatch of each of that node's partitions>>.
        **/
        Map<Integer, List<ProducerBatch>> batches = this.accumulator.drain(cluster, result.readyNodes,
                this.maxRequestSize, now);
        if (guaranteeMessageOrder) {
            // Mute all the partitions drained
            for (List<ProducerBatch> batchList : batches.values()) {
                for (ProducerBatch batch : batchList)
                    this.accumulator.mutePartition(batch.topicPartition);
            }
        }
        
        
        List<ProducerBatch> expiredBatches = this.accumulator.expiredBatches(this.requestTimeout, now);
        // Reset the producer id if an expired batch has previously been sent to the broker. Also update the metrics
        // for expired batches. see the documentation of @TransactionState.resetProducerId to understand why
        // we need to reset the producer id here.
        if (!expiredBatches.isEmpty())
            log.trace("Expired {} batches in accumulator", expiredBatches.size());
        for (ProducerBatch expiredBatch : expiredBatches) {
            failBatch(expiredBatch, -1, NO_TIMESTAMP, expiredBatch.timeoutException(), false);
            if (transactionManager != null && expiredBatch.inRetry()) {
                // This ensures that no new batches are drained until the current in flight batches are fully resolved.
                transactionManager.markSequenceUnresolved(expiredBatch.topicPartition);
            }
        }

        sensors.updateProduceRequestMetrics(batches);

        // If we have any nodes that are ready to send + have sendable data, poll with 0 timeout so this can immediately
        // loop and try sending more data. Otherwise, the timeout is determined by nodes that have partitions with data
        // that isn't yet sendable (e.g. lingering, backing off). Note that this specifically does not include nodes
        // with sendable data that aren't ready to send since they would cause busy looping.
        long pollTimeout = Math.min(result.nextReadyCheckDelayMs, notReadyTimeout);
        if (!result.readyNodes.isEmpty()) {
            log.trace("Nodes with data ready to send: {}", result.readyNodes);
            // if some partitions are already ready to be sent, the select time would be 0;
            // otherwise if some partition already has some data accumulated but not ready yet,
            // the select time will be the time difference between now and its linger expiry time;
            // otherwise the select time will be the time difference between now and the metadata expiry time;
            pollTimeout = 0;
        }
        //build and send the produce requests
        sendProduceRequests(batches, now);

        return pollTimeout;
    }

accumulator.drain method analysis


public Map<Integer, List<ProducerBatch>> drain(Cluster cluster,
                                                   Set<Node> nodes,
                                                   int maxSize,
                                                   long now) {
        if (nodes.isEmpty())
            return Collections.emptyMap();
        //the per-node lists of ProducerBatches to be returned
        Map<Integer, List<ProducerBatch>> batches = new HashMap<>();
        //iterate over the nodes we are about to send to
        for (Node node : nodes) {
            int size = 0;
            //look up the partitions whose leader is this node
            List<PartitionInfo> parts = cluster.partitionsForNode(node.id());
            List<ProducerBatch> ready = new ArrayList<>();
            /* to make starvation less likely this loop doesn't start at 0 */
            //drainIndex rotates the starting partition between calls so no partition is starved
            int start = drainIndex = drainIndex % parts.size();
            do {
                //the partition currently being examined
                PartitionInfo part = parts.get(drainIndex);
                TopicPartition tp = new TopicPartition(part.topic(), part.partition());
                // Only proceed if the partition has no in-flight batches.
                if (!muted.contains(tp)) {
                    //get the batch deque for this partition
                    Deque<ProducerBatch> deque = getDeque(tp);
                    if (deque != null) {
                        //lock the deque so no other thread appends while we drain
                        synchronized (deque) {
                            //peek at the first ProducerBatch; note it is not removed from the deque yet
                            ProducerBatch first = deque.peekFirst();
                            if (first != null) {
                                //check whether this batch is a retry and, if so, whether its retry backoff has elapsed
                                boolean backoff = first.attempts() > 0 && first.waitedTimeMs(now) < retryBackoffMs;
                                // Only drain the batch if it is not during backoff period.
                                //only drain the batch if it is new, or its retry backoff has expired
                                if (!backoff) {
                                    //if adding this batch would exceed max.request.size (default 1048576 bytes, i.e. 1 MB), stop here and leave it for the next request
                                    if (size + first.estimatedSizeInBytes() > maxSize && !ready.isEmpty()) {
                                        // there is a rare case that a single batch size is larger than the request size due
                                        // to compression; in this case we will still eventually send this batch in a single
                                        // request
                                        break;
                                    } else {
                                        ProducerIdAndEpoch producerIdAndEpoch = null;
                                        boolean isTransactional = false;
                                        if (transactionManager != null) {
                                            if (!transactionManager.isSendToPartitionAllowed(tp))
                                                break;

                                            producerIdAndEpoch = transactionManager.producerIdAndEpoch();
                                            if (!producerIdAndEpoch.isValid())
                                                // we cannot send the batch until we have refreshed the producer id
                                                break;

                                            isTransactional = transactionManager.isTransactional();

                                            if (!first.hasSequence() && transactionManager.hasUnresolvedSequence(first.topicPartition))
                                                // Don't drain any new batches while the state of previous sequence numbers
                                                // is unknown. The previous batches would be unknown if they were aborted
                                                // on the client after being sent to the broker at least once.
                                                break;

                                            int firstInFlightSequence = transactionManager.firstInFlightSequence(first.topicPartition);
                                            if (firstInFlightSequence != RecordBatch.NO_SEQUENCE && first.hasSequence()
                                                    && first.baseSequence() != firstInFlightSequence)
                                                // If the queued batch already has an assigned sequence, then it is being
                                                // retried. In this case, we wait until the next immediate batch is ready
                                                // and drain that. We only move on when the next in line batch is complete (either successfully
                                                // or due to a fatal broker error). This effectively reduces our
                                                // in flight request count to 1.
                                                break;
                                        }

                                        //now actually remove the first ProducerBatch from the deque
                                        ProducerBatch batch = deque.pollFirst();
                                        if (producerIdAndEpoch != null && !batch.hasSequence()) {
                                            // If the batch already has an assigned sequence, then we should not change the producer id and
                                            // sequence number, since this may introduce duplicates. In particular,
                                            // the previous attempt may actually have been accepted, and if we change
                                            // the producer id and sequence here, this attempt will also be accepted,
                                            // causing a duplicate.
                                            //
                                            // Additionally, we update the next sequence number bound for the partition,
                                            // and also have the transaction manager track the batch so as to ensure
                                            // that sequence ordering is maintained even if we receive out of order
                                            // responses.
                                            batch.setProducerState(producerIdAndEpoch, transactionManager.sequenceNumber(batch.topicPartition), isTransactional);
                                            transactionManager.incrementSequenceNumber(batch.topicPartition, batch.recordCount);
                                            log.debug("Assigned producerId {} and producerEpoch {} to batch with base sequence " +
                                                            "{} being sent to partition {}", producerIdAndEpoch.producerId,
                                                    producerIdAndEpoch.epoch, batch.baseSequence(), tp);

                                            transactionManager.addInFlightBatch(batch);
                                        }
                                       
                                       /**
                                       Key call: close() "seals" the batch by going back and filling the
                                       first 61 bytes of the RecordBatch's ByteBuffer with the header
                                       fields (total length, record count, producer id, and so on).
                                       **/
                                        batch.close();
                                        size += batch.records().sizeInBytes();
                                        ready.add(batch);
                                        batch.drained(now);
                                    }
                                }
                            }
                        }
                    }
                }
                this.drainIndex = (this.drainIndex + 1) % parts.size();
            } while (start != drainIndex);
            batches.put(node.id(), ready);
        }
        return batches;
    }
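
The net effect of drain() is a regrouping from per-partition deques to per-node lists, taking at most the head batch of each deque. Below is a tiny sketch of that reshaping; the partition names, batch strings, and leader assignments are hypothetical stand-ins for the real ProducerBatch and cluster metadata types.

import java.util.*;

// Hypothetical regrouping: per-partition deques -> per-node lists, one head batch per partition.
public class DrainSketch {
    public static void main(String[] args) {
        // partition name -> pending batches (the head of each deque is what drain would take)
        Map<String, Deque<String>> byPartition = new LinkedHashMap<>();
        byPartition.put("orders-0", new ArrayDeque<>(List.of("batchA", "batchB")));
        byPartition.put("orders-1", new ArrayDeque<>(List.of("batchC")));

        // partition name -> leader broker id (what the cluster metadata provides)
        Map<String, Integer> leaderOf = Map.of("orders-0", 1, "orders-1", 2);

        Map<Integer, List<String>> byNode = new HashMap<>();
        for (Map.Entry<String, Deque<String>> e : byPartition.entrySet()) {
            String head = e.getValue().pollFirst();          // only the first batch per partition
            if (head != null)
                byNode.computeIfAbsent(leaderOf.get(e.getKey()), k -> new ArrayList<>()).add(head);
        }
        System.out.println(byNode); // {1=[batchA], 2=[batchC]}
    }
}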

batch.close analysis

This method seals the ProducerBatch by writing the batch header fields. The header layout is shown below:

(RecordBatch header layout diagram omitted)
Key field meanings:

first offset: the starting (base) offset of the RecordBatch.

length: total size in bytes, measured from the partition leader epoch field to the end of the batch.

partition leader epoch: the leader epoch of the partition at the time the batch was written.

magic: version number of the message format.

attributes: message attributes (compression type, timestamp type, transactional and control flags).

last offset delta: difference between the offset of the last Record in the RecordBatch and first offset.

first timestamp: timestamp of the first Record in the RecordBatch.

max timestamp: the largest timestamp in the RecordBatch, normally that of the last Record.

producer id: the PID, used to support idempotence and transactions.

producer epoch: used to support idempotence and transactions.

first sequence: like producer id and producer epoch, used to support idempotence and transactions.

records count: the number of Records in the RecordBatch.


public void close() {
        //the interesting work happens in recordsBuilder.close()
        recordsBuilder.close();
        if (!recordsBuilder.isControlBatch()) {
            CompressionRatioEstimator.updateEstimation(topicPartition.topic(),
                                                       recordsBuilder.compressionType(),
                                                       (float) recordsBuilder.compressionRatio());
        }
        reopened = false;
}
    
    
public void close() {
        if (aborted)
            throw new IllegalStateException("Cannot close MemoryRecordsBuilder as it has already been aborted");
        
        if (builtRecords != null)
            return;
        
        //validate the producer state (producer id / epoch / sequence)
        validateProducerState();
        
        //close the append stream so no more records can be written
        closeForRecordAppends();
        
        //if no records were written, just produce an empty MemoryRecords
        if (numRecords == 0L) {
            buffer().position(initialPosition);
            builtRecords = MemoryRecords.EMPTY;
        } else {
            //for message format v2 (Kafka 0.11+), magic is 2, which is greater than 1, so this branch is taken
            if (magic > RecordBatch.MAGIC_VALUE_V1)
                //writeDefaultBatchHeader fills in the batch header fields
                this.actualCompressionRatio = (float) writeDefaultBatchHeader() / this.uncompressedRecordsSizeInBytes;
            else if (compressionType != CompressionType.NONE)
                this.actualCompressionRatio = (float) writeLegacyCompressedWrapperHeader() / this.uncompressedRecordsSizeInBytes;

            ByteBuffer buffer = buffer().duplicate();
            buffer.flip();
            buffer.position(initialPosition);
            builtRecords = MemoryRecords.readableRecords(buffer.slice());
        }
}   

private int writeDefaultBatchHeader() {
        //make sure the builder is open for writing the batch header
        ensureOpenForRecordBatchWrite();
        ByteBuffer buffer = bufferStream.buffer();
        //current write position, i.e. the end of everything written so far
        int pos = buffer.position();
        //move back to the initial position, since the header sits at the very start of the batch
        buffer.position(initialPosition);
        //total size of the batch in bytes
        int size = pos - initialPosition;
        //subtract the 61-byte header overhead to get the size of the (possibly compressed) record payload
        int writtenCompressed = size - DefaultRecordBatch.RECORD_BATCH_OVERHEAD;
        //delta between the last offset and the base offset
        int offsetDelta = (int) (lastOffset - baseOffset);
        //determine the max timestamp to record in the header
        final long maxTimestamp;
        if (timestampType == TimestampType.LOG_APPEND_TIME)
            maxTimestamp = logAppendTime;
        else
            maxTimestamp = this.maxTimestamp;
        
        //actually write the batch header
        DefaultRecordBatch.writeHeader(buffer, baseOffset, offsetDelta, size, magic, compressionType, timestampType,
                firstTimestamp, maxTimestamp, producerId, producerEpoch, baseSequence, isTransactional, isControlBatch,
                partitionLeaderEpoch, numRecords);
        //restore the buffer's original write position
        buffer.position(pos);
        return writtenCompressed;
}


static void writeHeader(ByteBuffer buffer,
                            long baseOffset,
                            int lastOffsetDelta,
                            int sizeInBytes,
                            byte magic,
                            CompressionType compressionType,
                            TimestampType timestampType,
                            long firstTimestamp,
                            long maxTimestamp,
                            long producerId,
                            short epoch,
                            int sequence,
                            boolean isTransactional,
                            boolean isControlBatch,
                            int partitionLeaderEpoch,
                            int numRecords) {
        if (magic < RecordBatch.CURRENT_MAGIC_VALUE)
            throw new IllegalArgumentException("Invalid magic value " + magic);
        if (firstTimestamp < 0 && firstTimestamp != NO_TIMESTAMP)
            throw new IllegalArgumentException("Invalid message timestamp " + firstTimestamp);

        short attributes = computeAttributes(compressionType, timestampType, isTransactional, isControlBatch);
        
        int position = buffer.position();
        //write first offset
        buffer.putLong(position + BASE_OFFSET_OFFSET, baseOffset);
        //write length
        buffer.putInt(position + LENGTH_OFFSET, sizeInBytes - LOG_OVERHEAD);
        //write partition leader epoch
        buffer.putInt(position + PARTITION_LEADER_EPOCH_OFFSET, partitionLeaderEpoch);
        //write magic
        buffer.put(position + MAGIC_OFFSET, magic);
        //write attributes
        buffer.putShort(position + ATTRIBUTES_OFFSET, attributes);
        //write first timestamp
        buffer.putLong(position + FIRST_TIMESTAMP_OFFSET, firstTimestamp);
        //write max timestamp
        buffer.putLong(position + MAX_TIMESTAMP_OFFSET, maxTimestamp);
        //write last offset delta
        buffer.putInt(position + LAST_OFFSET_DELTA_OFFSET, lastOffsetDelta);
        //write producer id
        buffer.putLong(position + PRODUCER_ID_OFFSET, producerId);
        //write producer epoch
        buffer.putShort(position + PRODUCER_EPOCH_OFFSET, epoch);
        //write first sequence
        buffer.putInt(position + BASE_SEQUENCE_OFFSET, sequence);
        //write records count
        buffer.putInt(position + RECORDS_COUNT_OFFSET, numRecords);
        long crc = Crc32C.compute(buffer, ATTRIBUTES_OFFSET, sizeInBytes - ATTRIBUTES_OFFSET);
        //write crc32
        buffer.putInt(position + CRC_OFFSET, (int) crc);
        buffer.position(position + RECORD_BATCH_OVERHEAD);
    }
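
The overall trick is to reserve the 61-byte header region up front, append records after it, and only on close() seek back and fill the header in. Below is a condensed sketch of that reserve-then-backfill pattern with a plain ByteBuffer; the 8-byte "header" (length plus record count) is an invented, simplified layout rather than the real 61-byte RecordBatch header.

import java.nio.ByteBuffer;
import java.nio.charset.StandardCharsets;

// Reserve space for a header, append payload, then seek back and back-fill the header.
public class HeaderBackfillSketch {
    static final int HEADER_OVERHEAD = 8; // pretend header: 4-byte length + 4-byte record count

    public static void main(String[] args) {
        ByteBuffer buffer = ByteBuffer.allocate(128);
        int initialPosition = buffer.position();
        buffer.position(initialPosition + HEADER_OVERHEAD);   // skip the header region

        byte[] record = "hello".getBytes(StandardCharsets.UTF_8);
        buffer.put(record);                                    // append record payload

        int end = buffer.position();
        buffer.putInt(initialPosition, end - initialPosition); // back-fill total length
        buffer.putInt(initialPosition + 4, 1);                 // back-fill record count
        buffer.position(end);                                  // restore the write position

        System.out.println("batch bytes = " + (end - initialPosition));
    }
}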

sendProduceRequests analysis

This method is called from sendProducerData. Its main job is to build the produce requests, hand the ByteBuffer to be sent to the corresponding KafkaChannel, and register a write interest on that channel.

private void sendProduceRequests(Map<Integer, List<ProducerBatch>> collated, long now) {
        //iterate over the map (key: brokerId, value: batches to send) and build one produce request per broker
        for (Map.Entry<Integer, List<ProducerBatch>> entry : collated.entrySet())
            sendProduceRequest(now, entry.getKey(), acks, requestTimeout, entry.getValue());
}
    
    
private void sendProduceRequest(long now, int destination, short acks, int timeout, List<ProducerBatch> batches) {
        if (batches.isEmpty())
            return;
        
        //group the MemoryRecords payloads by topic partition
        Map<TopicPartition, MemoryRecords> produceRecordsByPartition = new HashMap<>(batches.size());
        //group the ProducerBatches by topic partition
        final Map<TopicPartition, ProducerBatch> recordsByPartition = new HashMap<>(batches.size());

        // find the minimum magic version used when creating the record sets
        byte minUsedMagic = apiVersions.maxUsableProduceMagic();
        for (ProducerBatch batch : batches) {
            if (batch.magic() < minUsedMagic)
                minUsedMagic = batch.magic();
        }
        //iterate over the batches
        for (ProducerBatch batch : batches) {
            //the batch's topic partition
            TopicPartition tp = batch.topicPartition;
            //the batch's MemoryRecords payload
            MemoryRecords records = batch.records();

            
            if (!records.hasMatchingMagic(minUsedMagic))
                records = batch.records().downConvert(minUsedMagic, 0, time).records();
                
            //store into the two maps built above
            produceRecordsByPartition.put(tp, records);
            recordsByPartition.put(tp, batch);
        }

        String transactionalId = null;
        if (transactionManager != null && transactionManager.isTransactional()) {
            transactionalId = transactionManager.transactionalId();
        }
        
        //build the ProduceRequest.Builder for the chosen magic version
        ProduceRequest.Builder requestBuilder = ProduceRequest.Builder.forMagic(minUsedMagic, acks, timeout,
                produceRecordsByPartition, transactionalId);
        RequestCompletionHandler callback = new RequestCompletionHandler() {
            public void onComplete(ClientResponse response) {
                handleProduceResponse(response, recordsByPartition, time.milliseconds());
            }
        };
        
        //the destination brokerId
        String nodeId = Integer.toString(destination);
        //build the ClientRequest
        ClientRequest clientRequest = client.newClientRequest(nodeId, requestBuilder, now, acks != 0, callback);
        
        /**
        client.send does the following:
        1. requestBuilder.build() produces the ProduceRequest.
        2. ProduceRequest.toSend() produces a NetworkSend, constructed from (nodeId, ByteBuffer);
           the ByteBuffer is built by turning the request into a Struct via ProduceRequest.toStruct()
           and calling Struct.writeTo to write its contents into the buffer.
        3. The NetworkSend is wrapped in an InFlightRequest, which is pushed onto the head of the
           NetworkClient's inFlightRequests queue for this nodeId.
        4. The KafkaChannel for this nodeId is located, and a write interest is registered on it.
        **/
        client.send(clientRequest, now);
        log.trace("Sent produce request to {}: {}", nodeId, requestBuilder);
    }
    


private void doSend(ClientRequest clientRequest, boolean isInternalRequest, long now) {
        //the destination brokerId
        String nodeId = clientRequest.destination();
        if (!isInternalRequest) {
           
            if (!canSendRequest(nodeId))
                throw new IllegalStateException("Attempt to send a request to node " + nodeId + " which is not ready.");
        }
        //the request builder carried by this ClientRequest
        AbstractRequest.Builder<?> builder = clientRequest.requestBuilder();
        try {
            NodeApiVersions versionInfo = apiVersions.get(nodeId);
            short version;
           
            if (versionInfo == null) {
                version = builder.latestAllowedVersion();
                if (discoverBrokerVersions && log.isTraceEnabled())
                    log.trace("No version information found when sending {} with correlation id {} to node {}. " +
                            "Assuming version {}.", clientRequest.apiKey(), clientRequest.correlationId(), nodeId, version);
            } else {
                version = versionInfo.latestUsableVersion(clientRequest.apiKey(), builder.oldestAllowedVersion(),
                        builder.latestAllowedVersion());
            }
           
            //builder.build(version) creates the concrete ProduceRequest; the overload below does the final preparation before sending
            doSend(clientRequest, isInternalRequest, now, builder.build(version));
        } catch (UnsupportedVersionException e) {
            
            log.debug("Version mismatch when attempting to send {} with correlation id {} to {}", builder,
                    clientRequest.correlationId(), clientRequest.destination(), e);
            ClientResponse clientResponse = new ClientResponse(clientRequest.makeHeader(builder.latestAllowedVersion()),
                    clientRequest.callback(), clientRequest.destination(), now, now,
                    false, e, null);
            abortedSends.add(clientResponse);
        }
}



private void doSend(ClientRequest clientRequest, boolean isInternalRequest, long now, AbstractRequest request) {
        //the destination brokerId
        String nodeId = clientRequest.destination();
        /**
            Build the request header. The header carries the request's ApiKey, version, etc.;
            when the response is read back, the ApiKey (e.g. ApiKeys.PRODUCE) is used to construct
            the matching response object.
        **/
        RequestHeader header = clientRequest.makeHeader(request.version());
        if (log.isDebugEnabled()) {
            int latestClientVersion = clientRequest.apiKey().latestVersion();
            if (header.apiVersion() == latestClientVersion) {
                log.trace("Sending {} {} with correlation id {} to node {}", clientRequest.apiKey(), request,
                        clientRequest.correlationId(), nodeId);
            } else {
                log.debug("Using older server API v{} to send {} {} with correlation id {} to node {}",
                        header.apiVersion(), clientRequest.apiKey(), request, clientRequest.correlationId(), nodeId);
            }
        }
		
        /**
        Create the NetworkSend, constructed as NetworkSend(nodeId, ByteBuffer). The ByteBuffer is
        produced by converting the ProduceRequest into a Struct via ProduceRequest.toStruct() and
        delegating to Struct.writeTo to write its contents into the buffer.
        **/
        Send send = request.toSend(nodeId, header);
        /**
        Wrap the NetworkSend in an InFlightRequest.
        **/
        InFlightRequest inFlightRequest = new InFlightRequest(
                header,
                clientRequest.createdTimeMs(),
                clientRequest.destination(),
                clientRequest.callback(),
                clientRequest.expectResponse(),
                isInternalRequest,
                request,
                send,
                now);
        /**
        Add it to the NetworkClient's inFlightRequests, which keeps a per-broker deque,
        Map<String, Deque<NetworkClient.InFlightRequest>>; new requests are pushed onto the head
        of the deque.
        **/
        this.inFlightRequests.add(inFlightRequest);
        //locate the KafkaChannel for this nodeId, set its send field, and register a write interest on the channel. This matters: only after the write interest is registered will the Selector actually push the bytes out.
        selector.send(inFlightRequest.send);
}
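
The per-broker in-flight bookkeeping is essentially a map of deques: new requests go onto the head, and completed responses pop from the tail, which preserves request/response ordering per connection. A hedged sketch of that bookkeeping (hypothetical types, not the real InFlightRequests class) follows.

import java.util.ArrayDeque;
import java.util.Deque;
import java.util.HashMap;
import java.util.Map;

// Hypothetical miniature of per-node in-flight request tracking.
public class InFlightSketch<R> {
    private final Map<String, Deque<R>> requests = new HashMap<>();

    // Newest request goes to the head of the node's deque.
    public void add(String nodeId, R request) {
        requests.computeIfAbsent(nodeId, k -> new ArrayDeque<>()).addFirst(request);
    }

    // Responses come back in send order on a connection, so the oldest (tail) completes first.
    public R completeNext(String nodeId) {
        return requests.get(nodeId).pollLast();
    }

    public static void main(String[] args) {
        InFlightSketch<String> inflight = new InFlightSketch<>();
        inflight.add("1", "req-A");
        inflight.add("1", "req-B");
        System.out.println(inflight.completeNext("1")); // req-A completes before req-B
    }
}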

Network I/O send

Back in the Sender's network I/O entry point: after the requests are built, client.poll(pollTimeout, now) is called, and that method performs the actual network I/O.

public List<ClientResponse> poll(long timeout, long now) {
        if (!abortedSends.isEmpty()) {
            // If there are aborted sends because of unsupported version exceptions or disconnects,
            // handle them immediately without waiting for Selector#poll.
            List<ClientResponse> responses = new ArrayList<>();
            handleAbortedSends(responses);
            completeResponses(responses);
            return responses;
        }
        
        //build a metadata update request here if one is due
        long metadataTimeout = metadataUpdater.maybeUpdate(now);
        try {
            //the network I/O happens here; this is the method to focus on
            this.selector.poll(Utils.min(timeout, metadataTimeout, requestTimeoutMs));
        } catch (IOException e) {
            log.error("Unexpected error during I/O", e);
        }

        // process completed actions
        long updatedNow = this.time.milliseconds();
        List<ClientResponse> responses = new ArrayList<>();
        //handle requests whose sends have completed
        handleCompletedSends(responses, updatedNow);
        //handle responses whose reads have completed
        handleCompletedReceives(responses, updatedNow);
        handleDisconnections(responses, updatedNow);
        handleConnections();
        handleInitiateApiVersionRequests(updatedNow);
        handleTimedOutRequests(responses, updatedNow);
        completeResponses(responses);

        return responses;
    }
    

poll method analysis


public void poll(long timeout) throws IOException {
        if (timeout < 0)
            throw new IllegalArgumentException("timeout should be >= 0");

        boolean madeReadProgressLastCall = madeReadProgressLastPoll;
        clear();

        boolean dataInBuffers = !keysWithBufferedRead.isEmpty();

        if (hasStagedReceives() || !immediatelyConnectedKeys.isEmpty() || (madeReadProgressLastCall && dataInBuffers))
            timeout = 0;

        if (!memoryPool.isOutOfMemory() && outOfMemory) {
            //we have recovered from memory pressure. unmute any channel not explicitly muted for other reasons
            log.trace("Broker no longer low on memory - unmuting incoming sockets");
            for (KafkaChannel channel : channels.values()) {
                if (channel.isInMutableState() && !explicitlyMutedChannels.contains(channel)) {
                    channel.unmute();
                }
            }
            outOfMemory = false;
        }
        long startSelect = time.nanoseconds();
        //block waiting for channels to become ready, for at most `timeout` ms
        int numReadyKeys = select(timeout);
        long endSelect = time.nanoseconds();
        this.sensors.selectTime.record(endSelect - startSelect, time.milliseconds());
		
        //there are channels with I/O events ready
        if (numReadyKeys > 0 || !immediatelyConnectedKeys.isEmpty() || dataInBuffers) {
            //the set of selection keys that are ready
            Set<SelectionKey> readyKeys = this.nioSelector.selectedKeys();

            // re-process channels left over from earlier polls, e.g. channels whose reads were parked in keysWithBufferedRead because memory ran out
            if (dataInBuffers) {
                keysWithBufferedRead.removeAll(readyKeys); //so no channel gets polled twice
                Set<SelectionKey> toPoll = keysWithBufferedRead;
                keysWithBufferedRead = new HashSet<>(); //poll() calls will repopulate if needed
                pollSelectionKeys(toPoll, false, endSelect);
            }

            // read from / write to the ready channels; this is the key method
            pollSelectionKeys(readyKeys, false, endSelect);
            // clear the ready keys so they are not processed again on the next poll
            readyKeys.clear();
			
            pollSelectionKeys(immediatelyConnectedKeys, true, endSelect);
            immediatelyConnectedKeys.clear();
        } else {
            madeReadProgressLastPoll = true; //no work is also "progress"
        }

        long endIo = time.nanoseconds();
        this.sensors.ioTime.record(endIo - endSelect, time.milliseconds());

        // we use the time at the end of select to ensure that we don't close any connections that
        // have just been processed in pollSelectionKeys
        maybeCloseOldestConnection(endSelect);

        //walk the staged-receive deques and append the pending responses to the tail of completedReceives (the successfully-read response queue)
        addToCompletedReceives();
    }

pollSelectionKeys method analysis

void pollSelectionKeys(Set<SelectionKey> selectionKeys,
                           boolean isImmediatelyConnected,
                           long currentTimeNanos) {
        /**
        The iteration order over selectionKeys can be the same on every call, which under memory
        pressure can starve some channels of reads. To avoid that, the keys are shuffled when
        memory is low.
        **/
        for (SelectionKey key : determineHandlingOrder(selectionKeys)) {
            //get the KafkaChannel wrapper for this key
            KafkaChannel channel = channel(key);
            long channelStartTimeNanos = recordTimePerConnection ? time.nanoseconds() : 0;

            // register all per-connection metrics at once
            sensors.maybeRegisterConnectionMetrics(channel.id());
            if (idleExpiryManager != null)
                idleExpiryManager.update(channel.id(), currentTimeNanos);

            boolean sendFailed = false;
            try {

                /* handle connection-established events */
                if (isImmediatelyConnected || key.isConnectable()) {
                    /**
                    finishConnect() does the connection setup work:
                        1. register a read interest on the channel
                        2. update the KafkaChannel's connection state
                    **/
                    if (channel.finishConnect()) {
                        this.connected.add(channel.id());
                        this.sensors.connectionCreated.record();
                        SocketChannel socketChannel = (SocketChannel) key.channel();
                        log.debug("Created socket with SO_RCVBUF = {}, SO_SNDBUF = {}, SO_TIMEOUT = {} to node {}",
                                socketChannel.socket().getReceiveBufferSize(),
                                socketChannel.socket().getSendBufferSize(),
                                socketChannel.socket().getSoTimeout(),
                                channel.id());
                    } else
                        continue;
                }

                /* if channel is not ready finish prepare */
                if (channel.isConnected() && !channel.ready()) {
                    try {
                        channel.prepare();
                    } catch (AuthenticationException e) {
                        sensors.failedAuthentication.record();
                        throw e;
                    }
                    if (channel.ready())
                        sensors.successfulAuthentication.record();
                }
				
                //handle the channel's read events
                attemptRead(key, channel);

                //if the channel still has unread buffered data, remember it in keysWithBufferedRead
                if (channel.hasBytesBuffered()) {
                    //this channel has bytes enqueued in intermediary buffers that we could not read
                    //(possibly because no memory). it may be the case that the underlying socket will
                    //not come up in the next poll() and so we need to remember this channel for the
                    //next poll call otherwise data may be stuck in said buffers forever.
                    keysWithBufferedRead.add(key);
                }
				
                //handle the channel's write events; this is the part that matters for sending, since sending means writing to the channel
                /* if channel is ready write to any sockets that have space in their buffer and for which we have data */
                if (channel.ready() && key.isWritable()) {
                    Send send = null;
                    try {
                        /**
                            How channel.write() works:
                            1. The doSend code above created a NetworkSend and set it as the send field of the
                               KafkaChannel for this nodeId; here send.writeTo(channel) is invoked, which delegates
                               to ByteBufferSend and ultimately calls channel.write(buffers) to push the bytes out.
                            2. Once all data has been written, the write interest is removed from the channel.
                        **/
                        send = channel.write();
                    } catch (Exception e) {
                        sendFailed = true;
                        throw e;
                    }
                    if (send != null) {
                        this.completedSends.add(send);
                        this.sensors.recordBytesSent(channel.id(), send.size());
                    }
                }

                /* cancel any defunct sockets */
                if (!key.isValid())
                    close(channel, CloseMode.GRACEFUL);

            } catch (Exception e) {
                String desc = channel.socketDescription();
                if (e instanceof IOException)
                    log.debug("Connection with {} disconnected", desc, e);
                else if (e instanceof AuthenticationException) // will be logged later as error by clients
                    log.debug("Connection with {} disconnected due to authentication exception", desc, e);
                else
                    log.warn("Unexpected error from {}; closing connection", desc, e);
                close(channel, sendFailed ? CloseMode.NOTIFY_ONLY : CloseMode.GRACEFUL);
            } finally {
                maybeRecordTimePerConnection(channel, channelStartTimeNanos);
            }
        }
    }
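
The write path above is plain Java NIO: buffer the outgoing bytes, register OP_WRITE, write when the key reports writable, and drop the write interest once the buffer is drained. Below is a self-contained sketch of just that generic pattern, not Kafka code.

import java.nio.ByteBuffer;
import java.nio.channels.SelectionKey;
import java.nio.channels.SocketChannel;

// Generic NIO write pattern: write the pending buffer when the key is writable,
// and clear OP_WRITE once everything has been flushed.
public class NioWriteSketch {
    static void handleWritable(SelectionKey key, ByteBuffer pending) throws Exception {
        SocketChannel channel = (SocketChannel) key.channel();
        channel.write(pending);                       // may write only part of the buffer
        if (!pending.hasRemaining()) {
            // done: stop asking the selector to wake us up for writes
            key.interestOps(key.interestOps() & ~SelectionKey.OP_WRITE);
        }
    }

    static void queueSend(SelectionKey key) {
        // when new data is queued, ask to be notified when the socket can accept writes
        key.interestOps(key.interestOps() | SelectionKey.OP_WRITE);
    }
}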
	
	

At this point the send side is complete. If you care about the result of the send, the next thing to look at is attemptRead, which reads data back from the remote broker; the sections below continue through the source to show how the send result is obtained.

attemptRead method analysis


 private void attemptRead(SelectionKey key, KafkaChannel channel) throws IOException {
        //read only if the channel is readable and it has no staged (not-yet-processed) receive pending
        if (channel.ready() && (key.isReadable() || channel.hasBytesBuffered()) && !hasStagedReceive(channel)
            && !explicitlyMutedChannels.contains(channel)) {
            NetworkReceive networkReceive;
            /**
                Read from the KafkaChannel and convert the bytes into NetworkReceive objects. A while
                loop is used rather than an if so that all complete responses currently buffered on
                the channel are drained in one pass; channel.read() returns null when no further
                complete receive can be assembled (for example when the memory pool cannot allocate
                a buffer), which ends the loop until a later poll retries.
            **/
            while ((networkReceive = channel.read()) != null) {
                madeReadProgressLastPoll = true;
                //append the NetworkReceive to the tail of this channel's staged-receives deque
                addToStagedReceives(channel, networkReceive);
            }
            if (channel.isMute()) {
                outOfMemory = true; //channel has muted itself due to memory pressure.
            } else {
                madeReadProgressLastPoll = true;
            }
        }
}

channel.read() method analysis


public NetworkReceive read() throws IOException {
        //NetworkReceive wraps the response ByteBuffer
        NetworkReceive result = null;
        //lazily create the NetworkReceive
        if (receive == null) {
            receive = new NetworkReceive(maxReceiveSize, id, memoryPool);
        }
        //read data from the channel into the receive
        receive(receive);
		
		
        if (receive.complete()) {
            //once the receive is complete, rewind() the payload ByteBuffer: filling it left
            //position == limit, and it has to be re-read from the start for further processing
            receive.payload().rewind();
            result = receive;
            receive = null;
        } else if (receive.requiredMemoryAmountKnown() && !receive.memoryAllocated() && isInMutableState()) {
            //pool must be out of memory, mute ourselves.
            mute();
        }
        return result;
}
	
	
/**
receive(receive) ultimately delegates to readFromReadableChannel.
**/
public long readFromReadableChannel(ReadableByteChannel channel) throws IOException {
        //number of bytes read so far in this call
        int read = 0;
		
        //the 4-byte size prefix has not been fully read yet
        if (size.hasRemaining()) {
            //read into the 4-byte size buffer; the first 4 bytes of a response carry the length of the payload
            int bytesRead = channel.read(size);
            //a negative return means the channel hit end-of-stream, so throw
            if (bytesRead < 0)
                throw new EOFException();
            //accumulate the number of bytes read
            read += bytesRead;
            //no remaining space means the 4-byte size prefix is now complete
            if (!size.hasRemaining()) {
                //rewind the size buffer so the int can be read out
                size.rewind();
                //the payload length announced by the peer
                int receiveSize = size.getInt();
                //a negative payload length is invalid, so throw
                if (receiveSize < 0)
                    throw new InvalidReceiveException("Invalid receive (size = " + receiveSize + ")");
				
                if (maxSize != UNLIMITED && receiveSize > maxSize)
                    throw new InvalidReceiveException("Invalid receive (size = " + receiveSize + " larger than " + maxSize + ")");
				
                //remember the payload length in requestedBufferSize
                requestedBufferSize = receiveSize; //may be 0 for some payloads (SASL)
                //a zero-length payload maps straight to EMPTY_BUFFER
                if (receiveSize == 0) {
                    buffer = EMPTY_BUFFER;
                }
            }
        }
		
        //the payload buffer has not been allocated yet and we know how big it needs to be (requestedBufferSize != -1)
        if (buffer == null && requestedBufferSize != -1) { //we know the size we want but havent been able to allocate it yet
            //try to allocate a payload-sized buffer from the memory pool
            buffer = memoryPool.tryAllocate(requestedBufferSize);
            //null means the pool is out of memory; the read will be retried on a later poll
            if (buffer == null)
                log.trace("Broker low on memory - could not allocate buffer of size {} for source {}", requestedBufferSize, source);
        }
        //if the buffer is available, read the payload bytes from the channel
        if (buffer != null) {
            int bytesRead = channel.read(buffer);
            //a negative read means end-of-stream, so throw
            if (bytesRead < 0)
                throw new EOFException();
			
            //accumulate the bytes read
            read += bytesRead;
        }
		
        //return the total number of bytes read in this call
        return read;
}
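
The wire framing being parsed here is simply a 4-byte length prefix followed by that many payload bytes. Below is a minimal blocking-style sketch of reading one such frame; it is an illustration of the framing, not the NetworkReceive implementation, which works against non-blocking channels and a memory pool.

import java.io.DataInputStream;
import java.io.IOException;
import java.io.InputStream;

// Read one length-prefixed frame: a 4-byte size, then `size` payload bytes.
public class FrameReadSketch {
    static byte[] readFrame(InputStream in) throws IOException {
        DataInputStream data = new DataInputStream(in);
        int size = data.readInt();                 // the 4-byte length prefix
        if (size < 0)
            throw new IOException("Invalid receive (size = " + size + ")");
        byte[] payload = new byte[size];
        data.readFully(payload);                   // block until the whole payload has arrived
        return payload;
    }
}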



addToStagedReceives method analysis


private void addToStagedReceives(KafkaChannel channel, NetworkReceive receive) {
        if (!stagedReceives.containsKey(channel))
            stagedReceives.put(channel, new ArrayDeque<NetworkReceive>());
		
        //append the receive to the tail of this channel's deque
        Deque<NetworkReceive> deque = stagedReceives.get(channel);
        deque.add(receive);
}

At this point the bytes read off the wire have been turned into Kafka's internal NetworkReceive objects and cached in stagedReceives. Next, let's see how Kafka turns a NetworkReceive into the send result, back in NetworkClient, the Sender thread's network I/O entry point.

Network I/O entry point


public List<ClientResponse> poll(long timeout, long now) {
        if (!abortedSends.isEmpty()) {
            // If there are aborted sends because of unsupported version exceptions or disconnects,
            // handle them immediately without waiting for Selector#poll.
            List<ClientResponse> responses = new ArrayList<>();
            handleAbortedSends(responses);
            completeResponses(responses);
            return responses;
        }
        
       
        long metadataTimeout = metadataUpdater.maybeUpdate(now);
        try {
            //by the time this returns, the writes and reads are done and the received data has been appended to the tail of Selector.completedReceives
            this.selector.poll(Utils.min(timeout, metadataTimeout, requestTimeoutMs));
        } catch (IOException e) {
            log.error("Unexpected error during I/O", e);
        }

       
        long updatedNow = this.time.milliseconds();
        List<ClientResponse> responses = new ArrayList<>();
        //handle requests whose sends have completed
        handleCompletedSends(responses, updatedNow);
         /**
        Now the received data has to be processed; handleCompletedReceives does exactly that.
        **/
        //handle responses whose reads have completed
        handleCompletedReceives(responses, updatedNow);
        
        //invoke the callbacks attached to each response
        completeResponses(responses);
       
        return responses;
    }
    
    

handleCompletedReceives analysis


private void handleCompletedReceives(List<ClientResponse> responses, long now) {
    //iterate over the completed receives
    for (NetworkReceive receive : this.selector.completedReceives()) {
        //brokerId identifying which broker the data came from
        String source = receive.source();
        //remove and return the oldest InFlightRequest from the tail of this broker's in-flight queue
        InFlightRequest req = inFlightRequests.completeNext(source);
        //parse the response ByteBuffer into a Struct using the request header
        Struct responseStruct = parseStructMaybeUpdateThrottleTimeMetrics(receive.payload(), req.header,
            throttleTimeSensor, now);
        if (log.isTraceEnabled()) {
            log.trace("Completed receive from node {} for {} with correlation id {}, received {}", req.destination,
                req.header.apiKey(), req.header.correlationId(), responseStruct);
        }
        //based on the request header's ApiKeys value, build the matching AbstractResponse; for a produce request this is a ProduceResponse
        AbstractResponse body = AbstractResponse.parseResponse(req.header.apiKey(), responseStruct);
		
        //metadata responses are routed to metadataUpdater.handleCompletedMetadataResponse
        if (req.isInternalRequest && body instanceof MetadataResponse)
            metadataUpdater.handleCompletedMetadataResponse(req.header, now, (MetadataResponse) body);
        else if (req.isInternalRequest && body instanceof ApiVersionsResponse)
            handleApiVersionsResponse(responses, req, now, (ApiVersionsResponse) body);
        else
            //produce responses go here: a ClientResponse (carrying the request header, callback, brokerId and creation time) is added to the responses list
            responses.add(req.completed(body, now));
    }
}


completeResponses analysis


private void completeResponses(List<ClientResponse> responses) {
    /**
        Iterate over every response.
    **/
    for (ClientResponse response : responses) {
        try {
            /**
                Invoke the request's callback. Recall the callback created when the produce
                request was built:
                RequestCompletionHandler callback = new RequestCompletionHandler() {
                    public void onComplete(ClientResponse response) {
                        handleProduceResponse(response, recordsByPartition, time.milliseconds());
                    }
                };
            **/
            response.onComplete();
        } catch (Exception e) {
            log.error("Uncaught error in request completion:", e);
        }
    }
}


/**
handleProduceResponse analysis; for brevity, only the success path is covered here.
**/
private void handleProduceResponse(ClientResponse response, Map<TopicPartition, ProducerBatch> batches, long now) {
	
    //the response carries a body
	if (response.hasResponse()) {
                //cast it to a ProduceResponse
                ProduceResponse produceResponse = (ProduceResponse) response.responseBody();
                for (Map.Entry<TopicPartition, ProduceResponse.PartitionResponse> entry : produceResponse.responses().entrySet()) {
                    //the result is a Map<TopicPartition, PartitionResponse>
                    TopicPartition tp = entry.getKey();
                    ProduceResponse.PartitionResponse partResp = entry.getValue();
                    //look up the ProducerBatch for this topic partition
                    ProducerBatch batch = batches.get(tp);
                    //complete and release the ProducerBatch
                    completeBatch(batch, partResp, correlationId, now);
                }
                this.sensors.recordLatency(response.destination(), response.requestLatencyMs());
    } 
	
}

/**
completeBatch analysis; again, only the success path is covered here.
**/

private void completeBatch(ProducerBatch batch, ProduceResponse.PartitionResponse response) {
	 
	 if (batch.done(response.baseOffset, response.logAppendTime, null))
            this.accumulator.deallocate(batch);	
}

//done analysis
public boolean done(long baseOffset, long logAppendTime, RuntimeException exception) {
        //invoke the org.apache.kafka.clients.producer.Callback the user passed to send(), and complete produceFuture so the user thread can obtain the record's offset
		
        completeFutureAndFireCallbacks(baseOffset, logAppendTime, exception);
        return true;
}


//completeFutureAndFireCallbacks analysis
private void completeFutureAndFireCallbacks(long baseOffset, long logAppendTime, RuntimeException exception) {
        // set the result on the shared produce future
        produceFuture.set(baseOffset, logAppendTime, exception);

        // run every user callback registered against this batch
        for (Thunk thunk : thunks) {
            try {
                if (exception == null) {
                    RecordMetadata metadata = thunk.future.value();
                    if (thunk.callback != null)
                        thunk.callback.onCompletion(metadata, null);
                } else {
                    if (thunk.callback != null)
                        thunk.callback.onCompletion(null, exception);
                }
            } catch (Exception e) {
                log.error("Error executing user-provided callback on message for topic-partition '{}'", topicPartition, e);
            }
        }
		
        //mark the future as done so threads blocked waiting on it are released
        produceFuture.done();
    }
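
From the user's perspective, this is the moment the Callback passed to producer.send() finally fires and future.get() unblocks. Below is a hedged usage sketch tying the two together; the broker address and topic name are placeholders.

import java.util.Properties;
import org.apache.kafka.clients.producer.KafkaProducer;
import org.apache.kafka.clients.producer.ProducerRecord;
import org.apache.kafka.clients.producer.RecordMetadata;

public class SendResultExample {
    public static void main(String[] args) throws Exception {
        Properties props = new Properties();
        props.put("bootstrap.servers", "localhost:9092");          // placeholder address
        props.put("key.serializer", "org.apache.kafka.common.serialization.StringSerializer");
        props.put("value.serializer", "org.apache.kafka.common.serialization.StringSerializer");

        try (KafkaProducer<String, String> producer = new KafkaProducer<>(props)) {
            ProducerRecord<String, String> record = new ProducerRecord<>("test-topic", "key1", "value1");

            // The callback below is what completeFutureAndFireCallbacks eventually invokes.
            java.util.concurrent.Future<RecordMetadata> future = producer.send(record, (metadata, exception) -> {
                if (exception != null)
                    exception.printStackTrace();
                else
                    System.out.printf("stored at %s-%d@%d%n", metadata.topic(), metadata.partition(), metadata.offset());
            });

            // Alternatively, block on the future; it is released when produceFuture.done() runs.
            RecordMetadata metadata = future.get();
            System.out.println("sync result offset = " + metadata.offset());
        }
    }
}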
	
	
	
	
