RecordAccumulator分析

最新推荐文章于 2024-08-22 20:59:44 发布

莫言静好、

最新推荐文章于 2024-08-22 20:59:44 发布

阅读量2.2k

点赞数

分类专栏：大数据/kafka/源码文章标签： kafka RecordAccumulator 源码

本文链接：https://blog.csdn.net/zhanglh046/article/details/72845531

版权

大数据/kafka/源码专栏收录该内容

50 篇文章 3 订阅

订阅专栏

RecordAccumulator的主要作用相当于一个队列（缓冲区）：先把record缓存到MemoryRecords中，之后再批量发送到服务器

一核心字段

int batchSize: 批量大小

CompressionType compression: 压缩类型

long lingerMs: 延迟时间

long retryBackoffMs: 重试时间

BufferPool free: ByteBuffer缓冲池，只要大小满足条件，就可以复用缓冲池里的ByteBuffer

ConcurrentMap<TopicPartition,Deque<RecordBatch>> batches: TopicPartition和RecordBatch队列的映射关系

IncompleteRecordBatches incomplete: 处于未完成状态的批量记录（RecordBatch）

int drainIndex: 使用drain方法导出RecordBatch的时候，为了防止饥饿，drainIndex就记录上次发送停止的位置，下次继续从此位置开始

二重要方法

2.1 append 添加record到accumulator，并返回这个添加结果，添加结果主要包含元数据

/**
 * Adds a record to the accumulator and returns the append result.
 *
 * <p>The record is first offered to the last batch of the partition's deque. If that
 * fails, a new buffer is requested from the buffer pool, the append is retried (another
 * thread may have created a batch in the meantime), and only then is a new batch built
 * on the freshly allocated buffer and placed at the tail of the deque.
 *
 * @param tp             the topic/partition the record is destined for
 * @param timestamp      the record timestamp
 * @param key            the record key
 * @param value          the record value
 * @param callback       callback handed to the batch together with the record
 * @param maxTimeToBlock passed to the buffer pool's {@code allocate} call as its time limit
 * @return the append result; its last constructor argument is {@code true} only when a new
 *         batch had to be created
 * @throws IllegalStateException if the accumulator is already closed
 * @throws InterruptedException  declared by the signature; presumably raised while waiting
 *                               for buffer memory (the pool is not visible here)
 */
public RecordAppendResult append(TopicPartition tp, long timestamp, byte[] key, byte[] value,
    Callback callback, long maxTimeToBlock) throws InterruptedException {
    appendsInProgress.incrementAndGet();
    // Tracks a buffer that has been allocated but not yet handed over to a batch, so the
    // finally block can give it back to the pool on every exit path.
    ByteBuffer buffer = null;
    try {
        // Fetch the deque for this partition, creating it if necessary.
        Deque<RecordBatch> dq = getOrCreateDeque(tp);
        // The deque itself is not thread-safe, so every access is guarded by its monitor.
        synchronized (dq) {
            if (closed)
                throw new IllegalStateException("Cannot send after the producer is closed.");
            // First attempt: append to the batch currently at the tail of the deque.
            RecordAppendResult appendResult = tryAppend(timestamp, key, value, callback, dq);
            if (appendResult != null)
                return appendResult;
        }

        // The append failed: request a buffer large enough for a batch or for this single record.
        int size = Math.max(this.batchSize, Records.LOG_OVERHEAD + Record.recordSize(key, value));
        log.trace("Allocating a new {} byte message buffer for topic {} partition {}", size, tp.topic(), tp.partition());
        // Allocation happens outside the deque lock.
        buffer = free.allocate(size, maxTimeToBlock);
        synchronized (dq) {
            // The producer may have been closed while we were not holding the lock.
            if (closed)
                throw new IllegalStateException("Cannot send after the producer is closed.");
            // Second attempt: another thread may have added a batch with room in the meantime.
            RecordAppendResult appendResult = tryAppend(timestamp, key, value, callback, dq);
            if (appendResult != null) {
                // The buffer we allocated is not needed; the finally block returns it to the pool.
                return appendResult;
            }
            // Still no room: build a new batch on top of the freshly allocated buffer.
            MemoryRecords records = MemoryRecords.emptyRecords(buffer, compression, this.batchSize);
            RecordBatch batch = new RecordBatch(tp, records, time.milliseconds());
            FutureRecordMetadata future = Utils.notNull(batch.tryAppend(timestamp, key, value, callback, time.milliseconds()));
            // Queue the new batch at the tail of the deque.
            dq.addLast(batch);
            // The queued batch now owns the buffer, so it must not be released in finally.
            buffer = null;
            // Track the batch in the incomplete set.
            incomplete.add(batch);
            return new RecordAppendResult(future, dq.size() > 1 || batch.records.isFull(), true);
        }
    } finally {
        // Release a buffer that was allocated but never attached to a queued batch
        // (closed producer, append served by another batch, or a failure while building the batch).
        if (buffer != null)
            free.deallocate(buffer);
        appendsInProgress.decrementAndGet();
    }
}

2.2 tryAppend 尝试向MemoryRecords中添加

private RecordAppendResult tryAppend(long timestamp, byte[] key, byte[] value, Callback callback, Deque<RecordBatch> deque) {
    // 取出双向队列最后一个元素，但是不会删除它
    RecordBatch last = deque.peekLast();
    // 试图将取出来的最后一个元素添加
    if (last != null) {
        // 调用RecordBatch.truAppend方法，添加到MemoryRecords的buffer里
        FutureRecordMetadata future = last.tryAppend(timestamp, key, value, callback, time.milliseconds());
        // 表示MemoryRecord已经放不下了,然后flip将position置为0，当前buffer不可写，返回
        if (future == null)
            last.records.close();
        else // 直接返回添加结果
            return new RecordAppendResult(future, deque.size() > 1 || last.records.isFull(), false);
    }
    return null;
}

2.3 abortBatches

private void abortBatches() {
    // 遍历已经处于完成状态的RecordBatch
    for (RecordBatch batch : incomplete.all()) {
        // 根据TopicPartition获取对应的Deque<RecordBatch>队列
        Deque<RecordBatch> dq = getDeque(batch.topicPartition);
        // 在中断之前关闭队列
        synchronized (dq) {
            // 关闭MemoryRecord不再添加
            batch.records.close();
            // 从队列移除这个RecordBatch
            dq.remove(batch);
        }
        // 执行RecordBatch中所有消息的回调，并且标记这个生产者请求已经完成
        batch.done(-1L, Record.NO_TIMESTAMP, new IllegalStateException("Producer is closed forcefully."));
        // 从incomplete集合中移除当前batch，并且释放分配的ByteBuffer
        deallocate(batch);
    }
}

2.4 ready 判断各分区队列中的RecordBatch是否满足发送条件；一旦满足，就把该分区leader副本所在的节点加入可发送节点的集合（readyNodes）

public ReadyCheckResult ready(Cluster cluster, long nowMs) {
    // 用来保存向哪些Node节点发送信息
    Set<Node> readyNodes = new HashSet<>();
    // 下一次需要调用ready方法的时间间隔
    long nextReadyCheckDelayMs = Long.MAX_VALUE;
    // 根据Metadata元数据中找不到leader副本的topic的集合
    Set<String> unknownLeaderTopics = new HashSet<>();
    // 是否有线程在阻塞等待BufferPool释放空间
    boolean exhausted = this.free.queued() > 0;
    // 遍历每一个分区和RecordBatch队列映射集合
    for (Map.Entry<TopicPartition, Deque<RecordBatch>> entry : this.batches.entrySet()) {
        // 获取TopicPartition
        TopicPartition part = entry.getKey();
        // 获取RecordBatch队列
        Deque<RecordBatch> deque = entry.getValue();
        // 根据分区找到leader副本位于哪一个节点
        Node leader = cluster.leaderFor(part);
        synchronized (deque) {
            // leader副本为空但是队列不为空
            if (leader == null && !deque.isEmpty()) {
                // 添加该分区的topic到未知leader的topic集合
                unknownLeaderTopics.add(part.topic());
            }
            // 如果leader节点没有准备好且muted不包含这个分区
            else if (!readyNodes.contains(leader) && !muted.contains(part)) {
                // 获取队首的RecordBatch
                RecordBatch batch = deque.peekFirst();
                if (batch != null) {
                    boolean backingOff = batch.attempts > 0 && batch.lastAttemptMs + retryBackoffMs > nowMs;
                    long waitedTimeMs = nowMs - batch.lastAttemptMs;
                    long timeToWaitMs = backingOff ? retryBackoffMs : lingerMs;
                    long timeLeftMs = Math.max(timeToWaitMs - waitedTimeMs, 0);
                    boolean full = deque.size() > 1 || batch.records.isFull();
                    boolean expired = waitedTimeMs >= timeToWaitMs;
                    // 查看是否满足发送条件，满足其一即可
                    boolean sendable = full || expired || exhausted || closed || flushInProgress();
                    if (sendable && !backingOff) {
                        // 如果可以发送且没有重新尝试发送，添加到可以发送节点
                        readyNodes.add(leader);
                    } else {
                        // 更新下一次需要调用ready方法的时间间隔
                        nextReadyCheckDelayMs = Math.min(timeLeftMs, nextReadyCheckDelayMs);
                    }
                }
            }
        }
    }
    // 返回ReadyCheckResult
    return new ReadyCheckResult(readyNodes, nextReadyCheckDelayMs, unknownLeaderTopics);
}

2.5 drain 该方法由sender线程调用：根据ready方法得到的readyNodes集合，将TopicPartition -> RecordBatch的映射转换成NodeId -> RecordBatch集合的映射

public Map<Integer, List<RecordBatch>> drain(Cluster cluster, Set<Node> nodes, int maxSize, long now) {
    // 判断节点集合是否为空
    if (nodes.isEmpty())
        return Collections.emptyMap();
    // 创建一个存储节点id和RecordBatch列表的映射
    Map<Integer, List<RecordBatch>> batches = new HashMap<>();
    // 遍历集合
    for (Node node : nodes) {
        int size = 0;
        // 获取该node的所有分区信息
        List<PartitionInfo> parts = cluster.partitionsForNode(node.id());
        // 用于保存要发送的RecordBatch的列表
        List<RecordBatch> ready = new ArrayList<>();
        // drainIndex记录上次发送停止的位置，下一次继续从此位置开始发送，若一直从索引0的队列开始发送，可能会出现一直只发送前几个
        // 分区的消息的情况，造成其他分区饥饿
        // 计算开始位置
        int start = drainIndex = drainIndex % parts.size();
        do {
            PartitionInfo part = parts.get(drainIndex);
            TopicPartition tp = new TopicPartition(part.topic(), part.partition());
            // Only proceed if the partition has no in-flight batches.
            if (!muted.contains(tp)) {
                // 更具TopicPartition获取队列
                Deque<RecordBatch> deque = getDeque(new TopicPartition(part.topic(), part.partition()));
                if (deque != null) {
                    synchronized (deque) {
                        // 取出队列第一个元素
                        RecordBatch first = deque.peekFirst();
                        // 第一个元素不为空
                        if (first != null) {
                            // 判断是否是重新发送
                            boolean backoff = first.attempts > 0 && first.lastAttemptMs + retryBackoffMs > now;
                            if (!backoff) {
                                if (size + first.records.sizeInBytes() > maxSize && !ready.isEmpty()) {
                                    // 数据量已满结束循环
                                    break;
                                } else {
                                    // 从队列中获取一个RecordBatch,并将这个RecordBatch放到ready集合
                                    // 每一个TopicPartition只取一个RecordBatch
                                    RecordBatch batch = deque.pollFirst();
                                    // 关闭Compressor，并将MemoryRecord放掉ready集合中
                                    batch.records.close();
                                    size += batch.records.sizeInBytes();
                                    ready.add(batch);
                                    batch.drainedMs = now;
                                }
                            }
                        }
                    }
                }
            }
            // 更新drainIndex
            this.drainIndex = (this.drainIndex + 1) % parts.size();
        } while (start != drainIndex);
        // 记录node id 和RecordBatch的对应关系
        batches.put(node.id(), ready);
    }
    return batches;
}