The main difficulty lies in implementing a KafkaSpout, which lets Storm receive the messages coming from Kafka.
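For context, here is a minimal sketch of how such a spout plugs into a topology (a hypothetical example; the bolt class, component names, and parallelism hints are illustrative, not part of the original code):

import org.apache.storm.Config;
import org.apache.storm.StormSubmitter;
import org.apache.storm.topology.TopologyBuilder;

//build the topology: the KafkaSpout feeds the downstream bolts
TopologyBuilder builder = new TopologyBuilder();
builder.setSpout("kafka-spout", new KafkaSpout<>(kafkaSpoutConfig), 1);
builder.setBolt("downstream-bolt", new DownstreamBolt(), 2).shuffleGrouping("kafka-spout");
StormSubmitter.submitTopology("kafka-topology", new Config(), builder.createTopology());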
Fetching data
The first step is fetching data from Kafka, using the KafkaConsumer class provided by the Kafka client API.
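Before looking at the spout internals, this is roughly the raw KafkaConsumer usage that the spout wraps (a standalone sketch; the broker address, group id, and topic name are placeholders):

import java.util.Collections;
import java.util.Properties;
import org.apache.kafka.clients.consumer.ConsumerConfig;
import org.apache.kafka.clients.consumer.ConsumerRecords;
import org.apache.kafka.clients.consumer.KafkaConsumer;
import org.apache.kafka.common.serialization.StringDeserializer;

Properties props = new Properties();
props.put(ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG, "localhost:9092"); //placeholder address
props.put(ConsumerConfig.GROUP_ID_CONFIG, "storm-kafka-spout");       //placeholder group id
props.put(ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG, StringDeserializer.class.getName());
props.put(ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG, StringDeserializer.class.getName());

KafkaConsumer<String, String> consumer = new KafkaConsumer<>(props);
consumer.subscribe(Collections.singletonList("events"));
ConsumerRecords<String, String> records = consumer.poll(200); //the spout drives this same call from nextTuple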
//declared transient so that nimbus does not serialize this field when shipping the spout to supervisors
protected transient KafkaConsumer<K, V> kafkaConsumer;

//Storm Spout API method, invoked when the spout is activated
@Override
public void activate() {
    try {
        subscribeKafkaConsumer();
    } catch (InterruptException e) {
        throwKafkaConsumerInterruptedException();
    }
}

private void subscribeKafkaConsumer() {
    //initialize the consumer
    kafkaConsumer = kafkaConsumerFactory.createConsumer(kafkaSpoutConfig);
    //internally calls the Kafka API consumer.subscribe(pattern, listener);
    //pattern selects which topics to subscribe to, and listener reacts to
    //partition ownership being assigned to or revoked from this consumer
    kafkaSpoutConfig.getSubscription().subscribe(kafkaConsumer, new KafkaSpoutConsumerRebalanceListener(), context);
}
//Storm Spout API method, called in a loop to produce the spout's output tuples
@Override
public void nextTuple() {
    try {
        if (refreshSubscriptionTimer.isExpiredResetOnTrue()) {
            kafkaSpoutConfig.getSubscription().refreshAssignment();
        }
        //use a timer to commit offsets periodically
        if (commitTimer != null && commitTimer.isExpiredResetOnTrue()) {
            if (isAtLeastOnceProcessing()) {
                //with at-least-once processing, only commit the offsets of acked messages
                commitOffsetsForAckedTuples(kafkaConsumer.assignment());
            } else if (kafkaSpoutConfig.getProcessingGuarantee() == ProcessingGuarantee.NO_GUARANTEE) {
                //otherwise commit all fetched offsets
                Map<TopicPartition, OffsetAndMetadata> offsetsToCommit =
                    createFetchedOffsetsMetadata(kafkaConsumer.assignment());
                kafkaConsumer.commitAsync(offsetsToCommit, null);
                LOG.debug("Committed offsets {} to Kafka", offsetsToCommit);
            }
        }
        PollablePartitionsInfo pollablePartitionsInfo = getPollablePartitionsInfo();
        if (pollablePartitionsInfo.shouldPoll()) {
            try {
                //poll messages from Kafka and queue them for emission to the bolts
                setWaitingToEmit(pollKafkaBroker(pollablePartitionsInfo));
            } catch (RetriableException e) {
                LOG.error("Failed to poll from kafka.", e);
            }
        }
        //emit the queued messages to the downstream bolts
        emitIfWaitingNotEmitted();
    } catch (InterruptException e) {
        throwKafkaConsumerInterruptedException();
    }
}
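The commit period behind commitTimer and the processing guarantee branched on above both come from the spout configuration. A hedged sketch of how they would typically be set (the broker address, topic, and period are illustrative values):

import org.apache.storm.kafka.spout.KafkaSpoutConfig;

KafkaSpoutConfig<String, String> spoutConfig = KafkaSpoutConfig
    .builder("localhost:9092", "events") //placeholder bootstrap servers and topic
    .setProcessingGuarantee(KafkaSpoutConfig.ProcessingGuarantee.AT_LEAST_ONCE)
    .setOffsetCommitPeriodMs(10_000)     //drives the commitTimer used in nextTuple
    .build();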
//the actual poll
protected ConsumerRecords<K, V> pollKafkaBroker(PollablePartitionsInfo pollablePartitionsInfo) {
    doSeekRetriableTopicPartitions(pollablePartitionsInfo.pollableEarliestRetriableOffsets);
    //start from all partitions currently assigned to this consumer
    Set<TopicPartition> pausedPartitions = new HashSet<>(kafkaConsumer.assignment());
    Iterator<TopicPartition> pausedIter = pausedPartitions.iterator();
    //remove the partitions we intend to poll, leaving only the ones to pause
    while (pausedIter.hasNext()) {
        if (pollablePartitionsInfo.pollablePartitions.contains(pausedIter.next())) {
            pausedIter.remove();
        }
    }
    try {
        //pause fetching from the partitions we do not want to poll
        kafkaConsumer.pause(pausedPartitions);
        //the actual fetch
        final ConsumerRecords<K, V> consumerRecords = kafkaConsumer.poll(kafkaSpoutConfig.getPollTimeoutMs());
        ackRetriableOffsetsIfCompactedAway(pollablePartitionsInfo.pollableEarliestRetriableOffsets, consumerRecords);
        final int numPolledRecords = consumerRecords.count();
        LOG.debug("Polled [{}] records from Kafka", numPolledRecords);
        if (kafkaSpoutConfig.getProcessingGuarantee() == KafkaSpoutConfig.ProcessingGuarantee.AT_MOST_ONCE) {
            //Commit polled records immediately to ensure delivery is at-most-once.
            Map<TopicPartition, OffsetAndMetadata> offsetsToCommit =
                createFetchedOffsetsMetadata(kafkaConsumer.assignment());
            kafkaConsumer.commitSync(offsetsToCommit);
            LOG.debug("Committed offsets {} to Kafka", offsetsToCommit);
        }
        return consumerRecords;
    } finally {
        //always resume the paused partitions so that future polls can fetch from them
        kafkaConsumer.resume(pausedPartitions);
    }
}
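The pause/resume pair is the standard KafkaConsumer mechanism for selective polling: paused partitions stay assigned to the consumer, but poll() returns no records from them. The same pattern in isolation (topic and partition are placeholders):

import java.util.Arrays;
import org.apache.kafka.common.TopicPartition;

TopicPartition skip = new TopicPartition("events", 1);
consumer.pause(Arrays.asList(skip));   //poll() now returns nothing from events-1
ConsumerRecords<String, String> recs = consumer.poll(200);
consumer.resume(Arrays.asList(skip));  //events-1 becomes pollable again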
Emitting data
Now that the data has been fetched from Kafka, the next step is to emit it.
//emit step one: append the records to the pending-emit queue
protected void setWaitingToEmit(ConsumerRecords<K, V> consumerRecords) {
    for (TopicPartition tp : consumerRecords.partitions()) {
        waitingToEmit.put(tp, new ArrayList<>(consumerRecords.records(tp)));
    }
}
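For reference, waitingToEmit is a per-partition map of fetched-but-not-yet-emitted records; judging from the usage above, its declaration presumably looks like this (inferred, and marked transient like kafkaConsumer):

//inferred declaration: per-partition queue of records awaiting emission
private transient Map<TopicPartition, List<ConsumerRecord<K, V>>> waitingToEmit;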
protected void emitIfWaitingNotEmitted() {
    //take the pending records from the waitingToEmit queue and emit them
    Iterator<List<ConsumerRecord<K, V>>> waitingToEmitIter = waitingToEmit.values().iterator();
    LOG.info("real event emit begin");
    outerLoop:
    while (waitingToEmitIter.hasNext()) {
        List<ConsumerRecord<K, V>> waitingToEmitForTp = waitingToEmitIter.next();
        while (!waitingToEmitForTp.isEmpty()) {
            final boolean emittedTuple = emitOrRetryTuple(waitingToEmitForTp.remove(0));
            if (emittedTuple) {
                //a tuple was emitted; stop here so each nextTuple call emits at most one tuple
                break outerLoop;
            }
        }
        //every record for this partition was skipped or emitted, so drop its entry
        waitingToEmitIter.remove();
    }
}
//the actual emit; as in any spout, it goes through collector.emit
private boolean emitOrRetryTuple(ConsumerRecord<K, V> record) {
    final TopicPartition tp = new TopicPartition(record.topic(), record.partition());
    final KafkaSpoutMessageId msgId = retryService.getMessageId(record);
    if (offsetManagers.containsKey(tp) && offsetManagers.get(tp).contains(msgId)) { // has been acked
        LOG.trace("Tuple for record [{}] has already been acked. Skipping", record);
    } else if (emitted.contains(msgId)) { // has been emitted and it is pending ack or fail
        LOG.trace("Tuple for record [{}] has already been emitted. Skipping", record);
    } else {
        final OffsetAndMetadata committedOffset = kafkaConsumer.committed(tp);
        if (isAtLeastOnceProcessing()
            && committedOffset != null
            && committedOffset.offset() > record.offset()
            && commitMetadataManager.isOffsetCommittedByThisTopology(tp, committedOffset, Collections.unmodifiableMap(offsetManagers))) {
            // Ensures that after a topology with this id is started, the consumer fetch
            // position never falls behind the committed offset (STORM-2844)
            throw new IllegalStateException("Attempting to emit a message that has already been committed."
                + " This should never occur when using the at-least-once processing guarantee.");
        }
        final List<Object> tuple = kafkaSpoutConfig.getTranslator().apply(record);
        if (isEmitTuple(tuple)) {
            final boolean isScheduled = retryService.isScheduled(msgId);
            // not scheduled <=> never failed (i.e. never emitted), or scheduled and ready to be retried
            if (!isScheduled || retryService.isReady(msgId)) {
                final String stream = tuple instanceof KafkaTuple ? ((KafkaTuple) tuple).getStream() : Utils.DEFAULT_STREAM_ID;
                if (!isAtLeastOnceProcessing()) {
                    if (kafkaSpoutConfig.isTupleTrackingEnforced()) {
                        collector.emit(stream, tuple, msgId);
                        LOG.trace("Emitted tuple [{}] for record [{}] with msgId [{}]", tuple, record, msgId);
                    } else {
                        collector.emit(stream, tuple);
                        LOG.trace("Emitted tuple [{}] for record [{}]", tuple, record);
                    }
                } else {
                    //track the message as emitted until it is acked or failed
                    emitted.add(msgId);
                    offsetManagers.get(tp).addToEmitMsgs(msgId.offset());
                    if (isScheduled) { // Was scheduled for retry and re-emitted, so remove from schedule.
                        retryService.remove(msgId);
                    }
                    collector.emit(stream, tuple, msgId);
                    tupleListener.onEmit(tuple, msgId);
                    LOG.trace("Emitted tuple [{}] for record [{}] with msgId [{}]", tuple, record, msgId);
                }
                return true;
            }
        } else {
            /* If a null tuple is not configured to be emitted, it should be marked as emitted and acked immediately
             * to allow its offset to be committed to Kafka. */
            LOG.debug("Not emitting null tuple for record [{}] as defined in configuration.", record);
            if (isAtLeastOnceProcessing()) {
                msgId.setNullTuple(true);
                offsetManagers.get(tp).addToEmitMsgs(msgId.offset());
                ack(msgId);
            }
        }
    }
    return false;
}
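The tuple itself is produced by the configured translator (the kafkaSpoutConfig.getTranslator().apply(record) call above). A hedged sketch of supplying a custom one through the config builder (the field names and the lambda body are illustrative):

import org.apache.storm.kafka.spout.KafkaSpoutConfig;
import org.apache.storm.tuple.Fields;
import org.apache.storm.tuple.Values;

KafkaSpoutConfig<String, String> spoutConfig = KafkaSpoutConfig
    .builder("localhost:9092", "events")
    .setRecordTranslator(
        //map each ConsumerRecord to the fields of the output tuple
        record -> new Values(record.topic(), record.partition(), record.offset(), record.key(), record.value()),
        new Fields("topic", "partition", "offset", "key", "value"))
    .build();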
Acking data
Acking simply removes the msgId from the emitted set. In JStorm's threading model, unless configured otherwise, ack and nextTuple run in different threads; since KafkaConsumer is not thread-safe, this can cause problems with Kafka, so the spout must be pinned to single-thread mode:
conf.put("spout.single.thread", true);
@Override
public void ack(Object messageId) {
    if (!isAtLeastOnceProcessing()) {
        return;
    }
    // Only need to keep track of acked tuples if commits to Kafka are controlled by
    // tuple acks, which happens only for at-least-once processing semantics
    final KafkaSpoutMessageId msgId = (KafkaSpoutMessageId) messageId;
    if (msgId.isNullTuple()) {
        //a null tuple should be added to the ack list since by definition it is a direct ack
        offsetManagers.get(msgId.getTopicPartition()).addToAckMsgs(msgId);
        LOG.debug("Received direct ack for message [{}], associated with null tuple", msgId);
        tupleListener.onAck(msgId);
        return;
    }
    if (!emitted.contains(msgId)) {
        LOG.debug("Received ack for message [{}], associated with tuple emitted for a ConsumerRecord that "
            + "came from a topic-partition that this consumer group instance is no longer tracking "
            + "due to rebalance/partition reassignment. No action taken.", msgId);
    } else {
        Validate.isTrue(!retryService.isScheduled(msgId), "The message id " + msgId + " is queued for retry while being acked."
            + " This should never occur barring errors in the RetryService implementation or the spout code.");
        offsetManagers.get(msgId.getTopicPartition()).addToAckMsgs(msgId);
        emitted.remove(msgId);
    }
    tupleListener.onAck(msgId);
}
On failure, the message is scheduled for re-emission.
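For completeness, a simplified sketch of what the fail path presumably looks like (assuming the usual retryService-based flow hinted at by emitOrRetryTuple above; not a verbatim copy of the original source):

@Override
public void fail(Object messageId) {
    //retries only make sense with at-least-once processing
    if (!isAtLeastOnceProcessing()) {
        return;
    }
    final KafkaSpoutMessageId msgId = (KafkaSpoutMessageId) messageId;
    msgId.incrementNumFails();
    if (!retryService.schedule(msgId)) {
        //retry limit exhausted: give up and ack, so the offset can still be committed
        LOG.debug("Reached maximum number of retries. Message [{}] being marked as failed.", msgId);
        ack(msgId);
    } else {
        //scheduled for retry: remove from emitted so that emitOrRetryTuple will re-emit it
        emitted.remove(msgId);
    }
}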