Spark Streaming Data Source: Kafka

Import the dependency:

<dependency>
    <groupId>org.apache.spark</groupId>
    <artifactId>spark-streaming-kafka-0-8_2.11</artifactId>
    <version>2.1.1</version>
</dependency>
  • After importing this dependency, the KafkaUtils object it provides can create a DStream from your Kafka messages, working with a StreamingContext (Scala) or a JavaStreamingContext (Java); see the sketch below.
  • Two core classes: KafkaUtils and KafkaCluster
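For reference, here is a minimal sketch of the older receiver-based API in Scala (KafkaUtils.createStream). The broker hosts, group and topic match the examples below, but the ZooKeeper port (2181) and the assumption that ZooKeeper runs on the same hosts are placeholders; the direct API used in the following examples is usually preferred:

import org.apache.spark.SparkConf
import org.apache.spark.streaming.dstream.ReceiverInputDStream
import org.apache.spark.streaming.kafka.KafkaUtils
import org.apache.spark.streaming.{Seconds, StreamingContext}

object KafkaReceiverSketch {
    def main(args: Array[String]): Unit = {
        val conf = new SparkConf().setAppName("KafkaReceiverSketch").setMaster("local[*]")
        val ssc = new StreamingContext(conf, Seconds(3))

        // Receiver-based stream: KafkaUtils builds a DStream of (key, value) pairs
        // from a ZooKeeper quorum, a consumer group and a topic -> thread-count map.
        // The ZooKeeper addresses below are assumptions, not values from this article.
        val stream: ReceiverInputDStream[(String, String)] = KafkaUtils.createStream(
            ssc,
            "hadoop001:2181,hadoop002:2181,hadoop003:2181", // assumed ZooKeeper quorum
            "bigdata",                                      // consumer group
            Map("atguigu" -> 1)                             // topic -> number of receiver threads
        )

        stream.map(_._2).print()

        ssc.start()
        ssc.awaitTermination()
    }
}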

The typical way to consume Kafka:

import kafka.serializer.StringDecoder
import org.apache.kafka.clients.consumer.ConsumerConfig
import org.apache.spark.SparkConf
import org.apache.spark.streaming.dstream.InputDStream
import org.apache.spark.streaming.kafka.KafkaUtils
import org.apache.spark.streaming.{Seconds, StreamingContext}

object test11 {
    def main(args: Array[String]): Unit = {

        val sparkConf: SparkConf = new SparkConf().setAppName("hah").setMaster("local[*]")
        val ssc = new StreamingContext(sparkConf,Seconds(3))

        // Kafka parameters
        val brokers = "hadoop001:9092,hadoop002:9092,hadoop003:9092"
        val topic = "atguigu"
        val group = "bigdata"
        val deserialization = "org.apache.kafka.common.serialization.StringDeserializer"

        val kafkaParams  = Map(
            ConsumerConfig.GROUP_ID_CONFIG -> group,
            ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG -> brokers,
            ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG -> deserialization,
            ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG -> deserialization
        )

        val dStream: InputDStream[(String, String)] = KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder](
            ssc, kafkaParams, Set(topic)
        )
        dStream.print()
        // Start the collector (streaming computation)
        ssc.start()
        // Block the driver and wait for the collector to terminate
        ssc.awaitTermination()


    }
}
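The example above only prints the raw (key, value) pairs. As a minimal sketch of further processing, a word count over the message values could be placed before ssc.start() (splitting on spaces is an assumption about the message format):

        // Word count over the Kafka message values (the second element of each pair)
        val counts = dStream
            .map(_._2)               // keep only the message value
            .flatMap(_.split(" "))   // assume space-separated words
            .map((_, 1))
            .reduceByKey(_ + _)
        counts.print()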

Kafka low-level API: manually maintaining offsets

package com.atguigu.bigdata.spark.streaming

import kafka.common.TopicAndPartition
import kafka.message.MessageAndMetadata
import kafka.serializer.StringDecoder
import org.apache.kafka.clients.consumer.ConsumerConfig
import org.apache.spark.SparkConf
import org.apache.spark.streaming.kafka.KafkaCluster.Err
import org.apache.spark.streaming.kafka.{HasOffsetRanges, KafkaCluster, KafkaUtils, OffsetRange}
import org.apache.spark.streaming.{Seconds, StreamingContext}

object SparkStreaming10_Kafka_LowAPI {

  def main(args: Array[String]): Unit = {


    // Consume Kafka with the low-level (direct) API, maintaining offsets manually

    val sparkConf = new SparkConf().setAppName("SparkStreaming10_Kafka_LowAPI").setMaster("local[2]")

    // Enable graceful shutdown
    sparkConf.set("spark.streaming.stopGracefullyOnShutdown", "true")

    // TODO Create the streaming context
    val ssc = new StreamingContext(sparkConf, Seconds(3))

    // Maintain the consumption offsets ourselves:
    // group => topic => partition => offset


    // Kafka parameters
    val brokers = "linux1:9092,linux2:9092,linux3:9092"
    val topic = "atguigu190513"
    val group = "bigdata"
    val deserialization = "org.apache.kafka.common.serialization.StringDeserializer"
    val kafkaParams = Map(
      "zookeeper.connect" -> "linux1:2181,linux2:2181,linux3:2181",
      ConsumerConfig.GROUP_ID_CONFIG -> group,
      ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG -> brokers,
      ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG -> deserialization,
      ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG -> deserialization
    )
    // Create a KafkaCluster object for talking to the Kafka cluster
    val kafkaCluster = new KafkaCluster(kafkaParams)

    var topicAndPartition2Offset: Map[TopicAndPartition, Long] = Map[TopicAndPartition, Long]()

    // Get the partition info for the topic
    val topicMetadataEither: Either[Err, Set[TopicAndPartition]] = kafkaCluster.getPartitions(Set(topic))

    if (topicMetadataEither.isRight) {
      // The topic exists and has partitions
      val topicAndPartitions: Set[TopicAndPartition] = topicMetadataEither.right.get

      // Fetch the consumer group's offsets for these partitions
      val topicAndPartition2OffsetEither: Either[Err, Map[TopicAndPartition, Long]] =
        kafkaCluster.getConsumerOffsets(group, topicAndPartitions)

      if (topicAndPartition2OffsetEither.isLeft) {
        // The consumer group has never consumed; start every partition from offset 0
        topicAndPartitions.foreach {
          topicAndPartition => topicAndPartition2Offset = topicAndPartition2Offset + (topicAndPartition -> 0)
        }
      } else {
        // Otherwise resume from the offsets previously committed by this group
        val current: Map[TopicAndPartition, Long] = topicAndPartition2OffsetEither.right.get
        topicAndPartition2Offset ++= current
      }
    }

    // Create the direct stream starting from the offsets looked up above;
    // the message handler keeps only the value of each record
    val dStream = KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder, String](
      ssc,
      kafkaParams,
      topicAndPartition2Offset,
      (message: MessageAndMetadata[String, String]) => message.message()
    )

    dStream.print()

    dStream.foreachRDD(
      rdd => {
        var map: Map[TopicAndPartition, Long] = Map[TopicAndPartition, Long]()

        // The RDDs of a direct stream expose their Kafka offset ranges
        val hasOffsetRanges: HasOffsetRanges = rdd.asInstanceOf[HasOffsetRanges]

        // Get the offset range of every partition in this batch
        val ranges: Array[OffsetRange] = hasOffsetRanges.offsetRanges
        // Walk through every partition and record its latest offset
        ranges.foreach(range => {
          // untilOffset is the next offset to read for this partition
          map += range.topicAndPartition() -> range.untilOffset
        })
        // Commit the updated offsets back through the KafkaCluster object
        kafkaCluster.setConsumerOffsets(group, map)
      }
    )



    // TODO Start the collector
    ssc.start()

    // TODO The driver waits for the collector to finish
    ssc.awaitTermination()


  }
}
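To check that the manual commit took effect, a small snippet like the one below could be appended at the end of the foreachRDD block above (this is only a sketch for verification); it reads the offsets back through the same KafkaCluster object and prints them:

        // Read the just-committed offsets back for this group and print them
        val committed: Either[Err, Map[TopicAndPartition, Long]] =
          kafkaCluster.getConsumerOffsets(group, map.keySet)
        if (committed.isRight) {
          committed.right.get.foreach {
            case (tp, offset) => println(s"${tp.topic}-${tp.partition} => $offset")
          }
        }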

 
