

/**
 * Fetches records on behalf of a consumer: builds per-node fetch requests, buffers the
 * completed responses and hands parsed records back to the caller.
 *
 * @param <K> type of the record key
 * @param <V> type of the record value
 */
public class Fetcher<K, V> {
    // Network client used to send fetch requests and to check for pending requests per node.
    private final ConsumerNetworkClient client;
    private final Time time;
    // Passed to every FetchRequest together with maxWaitMs.
    private final int minBytes;
    private final int maxWaitMs;
    // Per-partition fetch size passed to FetchRequest.PartitionData.
    private final int fetchSize;
    private final long retryBackoffMs;
    // Upper bound on the number of records returned by a single fetchedRecords() call.
    private final int maxPollRecords;
    private final boolean checkCrcs;
    // Source of the cluster snapshot used to look up partition leaders.
    private final Metadata metadata;
    // Fetch-related metrics.
    private final FetchManagerMetrics sensors;
    // Assignment, pause state and consumed position of each partition.
    private final SubscriptionState subscriptions;
    // Raw fetch responses that have arrived but have not been parsed yet.
    private final List<CompletedFetch> completedFetches;
    private final Deserializer<K> keyDeserializer;
    private final Deserializer<V> valueDeserializer;
        // NOTE(review): the next three fields look like members of the nested PartitionRecords
        // class whose declaration is not visible here - confirm against the full file.
        private long fetchOffset;                       // offset of the first message in records
        private TopicPartition partition;               // the TopicPartition the records belong to
        private List<ConsumerRecord<K, V>> records;     // the collection of messages
    // Parsed records of the fetch currently being drained by fetchedRecords(); null when there is none.
    private PartitionRecords<K, V> nextInLineRecords = null;


private Map<Node, FetchRequest> createFetchRequests() {
    // 获metadata元数据
    Cluster cluster = metadata.fetch();
    Map<Node, Map<TopicPartition, FetchRequest.PartitionData>> fetchable = new HashMap<>();
    //1. 分配给当前消费者的分区,即SubscriptionState.assign集合中有对应记录的分区
    //2. 分区未被标记为暂停,且对应的TopicPartitionState.position不为空
    //3. nextInLineRecords中没有来自此分区的消息
    //4. completedFetches中没有来自此分区的消息
    for (TopicPartition partition : fetchablePartitions()) {
        Node node = cluster.leaderFor(partition);
        if (node == null) {
        } // 如果这个节点还有unsent集合或者InFlightRquest中的对应请求队列不为空,就不对此Node发送FetchRequest请求
        else if (this.client.pendingRequestCount(node) == 0) {
            Map<TopicPartition, FetchRequest.PartitionData> fetch = fetchable.get(node);
            if (fetch == null) {
                fetch = new HashMap<>();
                fetchable.put(node, fetch);
            long position = this.subscriptions.position(partition);
            fetch.put(partition, new FetchRequest.PartitionData(position, this.fetchSize));
            log.trace("Added fetch request for partition {} at offset {}", partition, position);
    // 对上面的fetchable集合进行转换,将发送同一个Node节点的所有TopicPartition的position信息封装成一个FetchRequest对象。
    Map<Node, FetchRequest> requests = new HashMap<>();
    for (Map.Entry<Node, Map<TopicPartition, FetchRequest.PartitionData>> entry : fetchable.entrySet()) {
        Node node = entry.getKey();
        FetchRequest fetch = new FetchRequest(this.maxWaitMs, this.minBytes, entry.getValue());
        requests.put(node, fetch);
    return requests;


public void sendFetches() {
    for (Map.Entry<Node, FetchRequest> fetchEntry: createFetchRequests().entrySet()) {
        final FetchRequest request = fetchEntry.getValue();
        client.send(fetchEntry.getKey(), ApiKeys.FETCH, request)
                .addListener(new RequestFutureListener<ClientResponse>() {
                    public void onSuccess(ClientResponse resp) {
                        FetchResponse response = new FetchResponse(resp.responseBody());
                        Set<TopicPartition> partitions = new HashSet<>(response.responseData().keySet());
                        FetchResponseMetricAggregator metricAggregator = new FetchResponseMetricAggregator(sensors, partitions);

                        for (Map.Entry<TopicPartition, FetchResponse.PartitionData> entry : response.responseData().entrySet()) {
                            TopicPartition partition = entry.getKey();
                            long fetchOffset = request.fetchData().get(partition).offset;
                            FetchResponse.PartitionData fetchData = entry.getValue();
                            completedFetches.add(new CompletedFetch(partition, fetchOffset, fetchData, metricAggregator));


                    public void onFailure(RuntimeException e) {
                        log.debug("Fetch failed", e);


/**
 * Return the fetched records, empty the record buffer and update the consumed position.
 * NOTE: returning empty records guarantees the consumed position is NOT updated.
 */
public Map<TopicPartition, List<ConsumerRecord<K, V>>> fetchedRecords() {
    if (this.subscriptions.partitionAssignmentNeeded()) {
        return Collections.emptyMap();
    } else {
        Map<TopicPartition, List<ConsumerRecord<K, V>>> drained = new HashMap<>();
        int recordsRemaining = maxPollRecords;
        Iterator<CompletedFetch> completedFetchesIterator = completedFetches.iterator();

        while (recordsRemaining > 0) {//遍历completedFetches集合
            if (nextInLineRecords == null || nextInLineRecords.isEmpty()) {
                if (!completedFetchesIterator.hasNext())

                CompletedFetch completion = completedFetchesIterator.next();
                nextInLineRecords = parseFetchedData(completion);
            } else {
                recordsRemaining -= append(drained, nextInLineRecords, recordsRemaining);
        return drained;
/**
 * The callback for fetch completion
 */
private PartitionRecords<K, V> parseFetchedData(CompletedFetch completedFetch) {
    TopicPartition tp = completedFetch.partition;
    FetchResponse.PartitionData partition = completedFetch.partitionData;
    long fetchOffset = completedFetch.fetchedOffset;
    int bytes = 0;
    int recordsCount = 0;
    PartitionRecords<K, V> parsedRecords = null;

    try {
        if (!subscriptions.isFetchable(tp)) {
            // this can happen when a rebalance happened or a partition consumption paused
            // while fetch is still in-flight
            log.debug("Ignoring fetched records for partition {} since it is no longer fetchable", tp);
        } else if (partition.errorCode == Errors.NONE.code()) {
            // we are interested in this fetch only if the beginning offset matches the
            // current consumed position
            Long position = subscriptions.position(tp);
            if (position == null || position != fetchOffset) {
                log.debug("Discarding stale fetch response for partition {} since its offset {} does not match " +
                        "the expected offset {}", tp, fetchOffset, position);
                return null;

            ByteBuffer buffer = partition.recordSet;
            MemoryRecords records = MemoryRecords.readableRecords(buffer);
            List<ConsumerRecord<K, V>> parsed = new ArrayList<>();
            boolean skippedRecords = false;
            for (LogEntry logEntry : records) {
                // Skip the messages earlier than current position.
                // 跳过早于position的消息
                if (logEntry.offset() >= position) {
                    parsed.add(parseRecord(tp, logEntry));
                    bytes += logEntry.size();
                } else {
                    skippedRecords = true;

            recordsCount = parsed.size();
            this.sensors.recordTopicFetchMetrics(tp.topic(), bytes, recordsCount);

            if (!parsed.isEmpty()) {
                log.trace("Adding fetched record for partition {} with offset {} to buffered record list", tp, position);
                parsedRecords = new PartitionRecords<>(fetchOffset, tp, parsed);
                ConsumerRecord<K, V> record = parsed.get(parsed.size() - 1);
                this.sensors.recordsFetchLag.record(partition.highWatermark - record.offset());
            } else if (buffer.limit() > 0 && !skippedRecords) {
                // we did not read a single message from a non-empty buffer
                // because that message's size is larger than fetch size, in this case
                // record this exception
                Map<TopicPartition, Long> recordTooLargePartitions = Collections.singletonMap(tp, fetchOffset);
                throw new RecordTooLargeException("There are some messages at [Partition=Offset]: "
                        + recordTooLargePartitions
                        + " whose size is larger than the fetch size "
                        + this.fetchSize
                        + " and hence cannot be ever returned."
                        + " Increase the fetch size on the client (using max.partition.fetch.bytes),"
                        + " or decrease the maximum message size the broker will allow (using message.max.bytes).",
        } else if (partition.errorCode == Errors.NOT_LEADER_FOR_PARTITION.code()
                || partition.errorCode == Errors.UNKNOWN_TOPIC_OR_PARTITION.code()) {
        } else if (partition.errorCode == Errors.OFFSET_OUT_OF_RANGE.code()) {
            if (fetchOffset != subscriptions.position(tp)) {
                log.debug("Discarding stale fetch response for partition {} since the fetched offset {}" +
                        "does not match the current offset {}", tp, fetchOffset, subscriptions.position(tp));
            } else if (subscriptions.hasDefaultOffsetResetPolicy()) {
                log.info("Fetch offset {} is out of range for partition {}, resetting offset", fetchOffset, tp);
            } else {
                throw new OffsetOutOfRangeException(Collections.singletonMap(tp, fetchOffset));
        } else if (partition.errorCode == Errors.TOPIC_AUTHORIZATION_FAILED.code()) {
            log.warn("Not authorized to read from topic {}.", tp.topic());
            throw new TopicAuthorizationException(Collections.singleton(tp.topic()));
        } else if (partition.errorCode == Errors.UNKNOWN.code()) {
            log.warn("Unknown error fetching data for topic-partition {}", tp);
        } else {
            throw new IllegalStateException("Unexpected error code " + partition.errorCode + " while fetching data");
    } finally {
        completedFetch.metricAggregator.record(tp, bytes, recordsCount);
    return parsedRecords;


private int append(Map<TopicPartition, List<ConsumerRecord<K, V>>> drained,
                   PartitionRecords<K, V> partitionRecords,
                   int maxRecords) {
    if (partitionRecords.isEmpty())
        return 0;

    if (!subscriptions.isAssigned(partitionRecords.partition)) {
        // this can happen when a rebalance happened before fetched records are returned to the consumer's poll call
        // 可能有rebalance
        log.debug("Not returning fetched records for partition {} since it is no longer assigned", partitionRecords.partition);
    } else {
        // note that the consumed position should always be available as long as the partition is still assigned
        long position = subscriptions.position(partitionRecords.partition);
        if (!subscriptions.isFetchable(partitionRecords.partition)) {
            // this can happen when a partition is paused before fetched records are returned to the consumer's poll call
            log.debug("Not returning fetched records for assigned partition {} since it is no longer fetchable", partitionRecords.partition);
        } else if (partitionRecords.fetchOffset == position) {
            // we are ensured to have at least one record since we already checked for emptiness
            List<ConsumerRecord<K, V>> partRecords = partitionRecords.take(maxRecords);
            long nextOffset = partRecords.get(partRecords.size() - 1).offset() + 1;

            log.trace("Returning fetched records at offset {} for assigned partition {} and update " +
                    "position to {}", position, partitionRecords.partition, nextOffset);

            List<ConsumerRecord<K, V>> records = drained.get(partitionRecords.partition);
            if (records == null) {
                records = partRecords;
                drained.put(partitionRecords.partition, records);
            } else {
            subscriptions.position(partitionRecords.partition, nextOffset);
            return partRecords.size();
        } else {
            // these records aren't next in line based on the last consumed position, ignore them
            // they must be from an obsolete request
            log.debug("Ignoring fetched records for {} at offset {} since the current position is {}",
                    partitionRecords.partition, partitionRecords.fetchOffset, position);
    return 0;


/**
 * Update the fetch positions for the provided partitions.
 * @param partitions the partitions to update positions for
 * @throws NoOffsetForPartitionException If no offset is stored for a given partition and no reset policy is available
 */
public void updateFetchPositions(Set<TopicPartition> partitions) {
    // reset the fetch position to the committed position
    for (TopicPartition tp : partitions) {
        if (!subscriptions.isAssigned(tp) || subscriptions.isFetchable(tp))

        // TODO: If there are several offsets to reset, we could submit offset requests in parallel
        if (subscriptions.isOffsetResetNeeded(tp)) {
        } else if (subscriptions.committed(tp) == null) {
            // there's no committed position, so we need to reset with the default strategy
        } else {
            long committed = subscriptions.committed(tp).offset();
            log.debug("Resetting offset for partition {} to the committed offset {}", tp, committed);
            subscriptions.seek(tp, committed);

private void resetOffset(TopicPartition partition) {
    OffsetResetStrategy strategy = subscriptions.resetStrategy(partition);
    final long timestamp;
    if (strategy == OffsetResetStrategy.EARLIEST)
        timestamp = ListOffsetRequest.EARLIEST_TIMESTAMP; //-2
    else if (strategy == OffsetResetStrategy.LATEST)
        timestamp = ListOffsetRequest.LATEST_TIMESTAMP;//-1
        throw new NoOffsetForPartitionException(partition);

    log.debug("Resetting offset for partition {} to {} offset.", partition, strategy.name().toLowerCase(Locale.ROOT));
    long offset = listOffset(partition, timestamp);

    // we might lose the assignment while fetching the offset, so check it is still active
    if (subscriptions.isAssigned(partition))
        this.subscriptions.seek(partition, offset);






