＜KAFKA技术内幕＞解读

不留痕迹的随风飘荡

已于 2024-05-08 19:03:18 修改

阅读量161

点赞数

文章标签： kafka java jvm

于 2023-04-16 09:25:54 首次发布

本文链接：https://blog.csdn.net/weixin_60786515/article/details/130178803

版权

一.框架搭建参考其他文章【Kafka从成神到升仙系列一】Kafka源码环境搭建 - 掘金 (juejin.cn)

二、生产者发送消息

入口Producer send 方法

  /**
     * Sends an endless stream of numbered messages ("Message_1", "Message_2", ...) to {@code topic}.
     *
     * <p>When {@code isAsync} is true each record is passed to the producer together with a
     * {@code DemoCallBack}; otherwise the call blocks on the returned future and prints a
     * confirmation line once the send has completed.
     *
     * <p>The loop ends only when the thread is interrupted while waiting on a synchronous send.
     * In that case the interrupt flag is restored and the method returns, so the owner of the
     * thread can actually stop it (previously the interrupt was swallowed and the loop went on).
     */
    public void run() {
        int messageNo = 1;
        while (true) {
            String messageStr = "Message_" + messageNo;
            long startTime = System.currentTimeMillis();
            if (isAsync) {
                // Asynchronous send: the callback is handed over together with the record.
                producer.send(new ProducerRecord<>(topic,
                    messageNo,
                    messageStr), new DemoCallBack(startTime, messageNo, messageStr));
            } else {
                // Synchronous send: block on the future until the send has finished.
                try {
                    producer.send(new ProducerRecord<>(topic,
                        messageNo,
                        messageStr)).get();
                    System.out.println("Sent message: (" + messageNo + ", " + messageStr + ")");
                } catch (InterruptedException e) {
                    // Restore the interrupt status and stop; continuing the loop would just
                    // fail again on the next blocking get().
                    Thread.currentThread().interrupt();
                    return;
                } catch (ExecutionException e) {
                    // The send itself failed; report it and carry on with the next message.
                    e.printStackTrace();
                }
            }
            ++messageNo;
        }
    }

send 方法内部调用 doSend，大致分为八个步骤：1.同步等待获取 topic 的元数据；2.对消息的 key、value 进行序列化；

3.计算消息应该发送到哪个分区（下面细说）；4.确认消息大小是否超过最大值；5.根据 topic 和分区号封装分区对象（TopicPartition）；6.给每一条消息绑定回调函数（异步发送时才传入回调，同步发送时没有）；7.把消息追加到记录收集器 RecordAccumulator（下面细说）；8.唤醒 sender 线程（负责真正的消息发送和回调处理）。整体处理模式有点像 Reactor 模式。

 /**
     * Implementation of asynchronously send a record to a topic.
     *
     * <p>The method runs eight steps in order: wait for topic metadata, serialize key and value,
     * choose the partition, validate the record size, build the {@code TopicPartition}, wrap the
     * callback with the interceptors, append the record to the accumulator, and wake the sender
     * when a batch is full or was newly created.
     *
     * @param record   the record to send
     * @param callback user callback, may be {@code null}; on an {@code ApiException} it is invoked
     *                 with {@code (null, e)}
     * @return the future from the accumulator append, or a {@code FutureFailure} wrapping the
     *         {@code ApiException} when one is raised
     * @throws InterruptException     if an {@code InterruptedException} is raised while sending
     * @throws SerializationException if the key or value cannot be cast to the type expected by
     *                                the configured serializer
     */
    private Future<RecordMetadata> doSend(ProducerRecord<K, V> record, Callback callback) {
        TopicPartition tp = null;
        try {
            // first make sure the metadata for the topic is available
            /*
             * Step 1:
             *      Wait synchronously for the topic metadata.
             *  maxBlockTimeMs is the upper bound passed to that wait.
             */
            ClusterAndWaitTime clusterAndWaitTime = waitOnMetadata(record.topic(), record.partition(), maxBlockTimeMs);
            // Time budget left after the metadata wait, clamped at zero; passed on to the accumulator below.
            long remainingWaitMs = Math.max(0, maxBlockTimeMs - clusterAndWaitTime.waitedOnMetadataMs);
            Cluster cluster = clusterAndWaitTime.cluster;
            /*
             * Step 2:
             *  Serialize the key and the value of the record.
             *  A ClassCastException from a serializer is turned into a SerializationException
             *  that names the offending class and the configured serializer.
             */
            byte[] serializedKey;
            try {
                serializedKey = keySerializer.serialize(record.topic(), record.key());
            } catch (ClassCastException cce) {
                throw new SerializationException("Can't convert key of class " + record.key().getClass().getName() +
                        " to class " + producerConfig.getClass(ProducerConfig.KEY_SERIALIZER_CLASS_CONFIG).getName() +
                        " specified in key.serializer");
            }
            byte[] serializedValue;
            try {
                serializedValue = valueSerializer.serialize(record.topic(), record.value());
            } catch (ClassCastException cce) {
                throw new SerializationException("Can't convert value of class " + record.value().getClass().getName() +
                        " to class " + producerConfig.getClass(ProducerConfig.VALUE_SERIALIZER_CLASS_CONFIG).getName() +
                        " specified in value.serializer");
            }
            /*
             * Step 3:
             *  Pick the partition the record should go to.
             *
             *  The cluster metadata obtained in step 1 is passed along so that the
             *  target partition can be computed from it.
             */
            int partition = partition(record, serializedKey, serializedValue, cluster);
            /*
             * Compute the total size of the record.
             * Original author's note: Records.LOG_OVERHEAD = SIZE_LENGTH (4) + OFFSET_LENGTH (8),
             * i.e. the bytes needed to store the message length and the message offset.
             */
            int serializedSize = Records.LOG_OVERHEAD + Record.recordSize(serializedKey, serializedValue);

            /*
             * Step 4:
             *  Check that the record does not exceed the maximum allowed size.
             *  Original author's note: the limit is a producer configuration value set when
             *  KafkaProducer is created; the default is 1 MB and is commonly raised.
             */
            ensureValidRecordSize(serializedSize);
            /*
             * Step 5:
             *  Build the TopicPartition object from the topic and the chosen partition.
             */
            tp = new TopicPartition(record.topic(), partition);
            // Use the record's own timestamp when present, otherwise the current time.
            long timestamp = record.timestamp() == null ? time.milliseconds() : record.timestamp();
            log.trace("Sending record {} with callback {} to topic {} partition {}", record, callback, record.topic(), partition);
            // producer callback will make sure to call both 'callback' and interceptor callback
            /*
             * Step 6:
             *  Attach the callback to the record. When interceptors are configured the user
             *  callback is wrapped in an InterceptorCallback; otherwise it is used as is.
             */
            Callback interceptCallback = this.interceptors == null ? callback : new InterceptorCallback<>(callback, this.interceptors, tp);
            /*
             * Step 7:
             *  Append the record to the accumulator (original author's note: a 32 MB memory
             *  buffer by default), which groups records into batches for sending.
             */
            RecordAccumulator.RecordAppendResult result = accumulator.append(tp, timestamp, serializedKey, serializedValue, interceptCallback, remainingWaitMs);
            // A batch became full or a new batch was just created.
            if (result.batchIsFull || result.newBatchCreated) {
                log.trace("Waking up the sender since topic {} partition {} is either full or getting a new batch", record.topic(), partition);
                /*
                 * Step 8:
                 *  Wake up the sender thread; that thread is the one that actually sends the data.
                 */
                this.sender.wakeup();
            }
            return result.future;
            // handling exceptions and record the errors;
            // for API exceptions return them in the future,
            // for other exceptions throw directly
        } catch (ApiException e) {
            log.debug("Exception occurred during message send:", e);
            if (callback != null)
                callback.onCompletion(null, e);
            this.errors.record();
            if (this.interceptors != null)
                this.interceptors.onSendError(record, tp, e);
            return new FutureFailure(e);
        } catch (InterruptedException e) {
            this.errors.record();
            if (this.interceptors != null)
                this.interceptors.onSendError(record, tp, e);
            throw new InterruptException(e);
        } catch (BufferExhaustedException e) {
            this.errors.record();
            this.metrics.sensor("buffer-exhausted-records").record();
            if (this.interceptors != null)
                this.interceptors.onSendError(record, tp, e);
            throw e;
        } catch (KafkaException e) {
            this.errors.record();
            if (this.interceptors != null)
                this.interceptors.onSendError(record, tp, e);
            throw e;
        } catch (Exception e) {
            // we notify interceptor about all exceptions, since onSend is called before anything else in this method
            if (this.interceptors != null)
                this.interceptors.onSendError(record, tp, e);
            throw e;
        }
    }

三、细说部分

一、计算消息应该发送到哪个分区（共三种规则，下面细说）

1.指明 partition 的情况下，直接将指明的值作为 partition 值（即在构造 ProducerRecord 时显式指定了分区号）；

 /**
     * Resolves the partition a record is sent to.
     *
     * <p>A partition number already set on the record wins; only when the record carries none
     * is the configured partitioner asked to choose one.
     *
     * @param record          the record being sent
     * @param serializedKey   serialized form of the record key
     * @param serializedValue serialized form of the record value
     * @param cluster         cluster metadata handed to the partitioner
     * @return the partition number to use
     */
    private int partition(ProducerRecord<K, V> record, byte[] serializedKey, byte[] serializedValue, Cluster cluster) {
        Integer explicitPartition = record.partition();
        if (explicitPartition != null) {
            // The caller pinned the record to a partition: use it unchanged.
            return explicitPartition;
        }
        // No partition on the record: delegate the choice to the partitioner.
        return partitioner.partition(
                record.topic(), record.key(), serializedKey, record.value(), serializedValue, cluster);
    }

2.没有指明 partition 值但有 key 的情况下，将 key 的 hash 值与 topic 的 partition 数进行取余得到 partition 值；

3.既没有 partition 值又没有 key 值的情况下，第一次调用时随机生成一个整数（后面每次调用在这个整数上自增），将这个值与 topic 可用的 partition 总数取余得到 partition 值，也就是常说的 round-robin 算法。

 /**
     * Chooses a partition for a record of the given topic.
     *
     * <p>Keyed records: the murmur2 hash of the key bytes, made non-negative, modulo the total
     * number of partitions of the topic, so equal keys always map to the same partition.
     *
     * <p>Unkeyed records: the value returned by {@code nextValue(topic)}, made non-negative, is
     * taken modulo the number of available partitions to index into that list; when no partition
     * is available it is taken modulo the total number of partitions instead.
     *
     * @param topic      topic name
     * @param key        record key (unused here)
     * @param keyBytes   serialized key, or {@code null} when the record has no key
     * @param value      record value (unused here)
     * @param valueBytes serialized value (unused here)
     * @param cluster    cluster metadata used to look up the topic's partitions
     * @return the chosen partition number
     */
    public int partition(String topic, Object key, byte[] keyBytes, Object value, byte[] valueBytes, Cluster cluster) {
        // All partitions known for the topic, available or not.
        List<PartitionInfo> allPartitions = cluster.partitionsForTopic(topic);
        int totalCount = allPartitions.size();

        if (keyBytes != null) {
            // Strategy two: a key was given, so hash it to pick the partition.
            return Utils.toPositive(Utils.murmur2(keyBytes)) % totalCount;
        }

        // Strategy one: no key, so rotate over partitions using the per-topic value.
        int counter = nextValue(topic);
        List<PartitionInfo> usable = cluster.availablePartitionsForTopic(topic);
        if (usable.isEmpty()) {
            // no partitions are available, give a non-available partition
            return Utils.toPositive(counter) % totalCount;
        }
        int index = Utils.toPositive(counter) % usable.size();
        return usable.get(index).partition();
    }

二、把消息追加到记录收集器（RecordAccumulator）

1.消息并不是立即发送的：当分区对应队列中的批次已满，或者新创建了一个批次时，才会唤醒 sender 线程，由它连接服务器并准备发送消息。

2.追加消息到记录收集器时使用的数据结构是 batches：TopicPartition 到 Deque<RecordBatch> 的映射；读取记录收集器时使用的数据结构是 batches：NodeId 到 List<RecordBatch> 的映射。

3. 追加流程：先获取该 topic-partition 对应的 queue，没有的话会创建一个空的 queue；向 queue 中追加数据时，先获取 queue 中最新加入的那个 RecordBatch，如果它不存在，或者存在但剩余空间不足以添加本条 record，则返回 null；如果成功写入，则直接返回结果。返回 null 时，需要创建一个新的 RecordBatch，其初始内存大小根据 max(batch.size, Records.LOG_OVERHEAD + Record.recordSize(key, value)) 来确定（防止单条 record 过大的情况）；然后向新建的 RecordBatch 写入 record，并将该 RecordBatch 添加到 queue 中，返回结果，写入成功。

 /**
     * Adds a record to the accumulator and returns the append result.
     *
     * <p>Flow:
     * <ol>
     *   <li>Look up (or create) the deque of batches for {@code tp}.</li>
     *   <li>Holding the deque lock, try to append to a batch that is already queued.</li>
     *   <li>If that is not possible, release the lock and allocate a buffer of
     *       {@code max(batchSize, record size)} bytes.</li>
     *   <li>Re-acquire the lock and retry the append, since another thread may have created a
     *       batch in the meantime.</li>
     *   <li>Otherwise wrap the buffer in a new batch, append the record and put the batch at the
     *       tail of the deque.</li>
     * </ol>
     *
     * <p>The allocated buffer is handed back to the pool on every path on which it does not end up
     * owned by an enqueued batch: another thread supplied a batch, the producer was closed after
     * the allocation, or creating/appending to the new batch failed. Previously only the first of
     * these cases released the buffer, so the other two leaked pool memory.
     *
     * @param tp             the topic/partition the record is destined for
     * @param timestamp      timestamp of the record
     * @param key            serialized key
     * @param value          serialized value
     * @param callback       callback to attach to the record
     * @param maxTimeToBlock upper bound passed to the buffer allocation
     * @return the future for the record, a flag that is true when the deque holds more than one
     *         batch or the batch is full, and a flag telling whether a new batch was created
     * @throws InterruptedException  declared by the signature; presumably raised while blocked
     *                               waiting for buffer memory (confirm against the buffer pool)
     * @throws IllegalStateException if the producer has been closed
     */
    public RecordAppendResult append(TopicPartition tp,
                                     long timestamp,
                                     byte[] key,
                                     byte[] value,
                                     Callback callback,
                                     long maxTimeToBlock) throws InterruptedException {
        // We keep track of the number of appending thread to make sure we do not miss batches in
        // abortIncompleteBatches().
        appendsInProgress.incrementAndGet();
        // Non-null only while this call owns a pooled buffer that must still be given back.
        ByteBuffer buffer = null;
        try {
            // check if we have an in-progress batch
            // Step 1: find the deque for this partition, creating it when absent. On the very
            // first call for a partition the deque is therefore empty.
            Deque<RecordBatch> dq = getOrCreateDeque(tp);
            // The deque itself is the lock. Picture three threads (T1, T2, T3) arriving here.
            synchronized (dq) {
                if (closed)
                    throw new IllegalStateException("Cannot send after the producer is closed.");
                // Step 2: try to append to a batch already in the deque. On the first pass
                // there is no batch yet (no memory has been allocated), so this yields null.
                RecordAppendResult appendResult = tryAppend(timestamp, key, value, callback, dq);
                if (appendResult != null)
                    return appendResult;
            } // lock released

            // we don't have an in-progress record batch try to allocate a new batch
            // Step 3: size of the new batch is the larger of the configured batch size and this
            // record's size, because a single record may exceed the batch size. If records are
            // routinely larger than the batch size, every batch holds one record and batching
            // gains nothing, so the batch size should be tuned to the real record sizes.
            int size = Math.max(this.batchSize, Records.LOG_OVERHEAD + Record.recordSize(key, value));
            log.trace("Allocating a new {} byte message buffer for topic {} partition {}", size, tp.topic(), tp.partition());
            // Step 4: allocate outside the lock. T1, T2 and T3 may each obtain a buffer here.
            buffer = free.allocate(size, maxTimeToBlock);
            synchronized (dq) {
                // Need to check if producer is closed again after grabbing the dequeue lock.
                if (closed)
                    throw new IllegalStateException("Cannot send after the producer is closed.");
                // Step 5: retry the append. For T1 (first to get the lock) this still yields
                // null because no batch exists; for T2 it succeeds using the batch T1 created.
                RecordAppendResult appendResult = tryAppend(timestamp, key, value, callback, dq);
                if (appendResult != null) {
                    // Somebody else found us a batch, return the one we waited for! Hopefully this doesn't happen often...
                    // Our own buffer is not needed; the finally block returns it to the pool.
                    return appendResult;
                }

                // Step 6: T1 wraps its buffer in a new batch.
                MemoryRecordsBuilder recordsBuilder = MemoryRecords.builder(buffer, compression, TimestampType.CREATE_TIME, this.batchSize);
                RecordBatch batch = new RecordBatch(tp, recordsBuilder, time.milliseconds());
                // Appending to the brand-new batch is expected to succeed.
                FutureRecordMetadata future = Utils.notNull(batch.tryAppend(timestamp, key, value, callback, time.milliseconds()));
                // Step 7: put the batch at the tail of the deque.
                dq.addLast(batch);
                incomplete.add(batch);
                // The enqueued batch owns the buffer now; it must not be released in finally.
                buffer = null;
                return new RecordAppendResult(future, dq.size() > 1 || batch.isFull(), true);
            }
        } finally {
            // Return a buffer that never became part of an enqueued batch.
            if (buffer != null)
                free.deallocate(buffer);
            // This thread is no longer appending.
            appendsInProgress.decrementAndGet();
        }
    }

下篇细说真正发送逻辑