1. Maven dependencies
<!-- The Spark core/sql/streaming artifacts are marked provided: the cluster supplies
     them at runtime. Their <version> tags are omitted here, presumably managed by a
     parent POM; they should match the 2.1.1 connector below. -->
<dependency>
    <groupId>org.apache.spark</groupId>
    <artifactId>spark-core_2.11</artifactId>
    <scope>provided</scope>
</dependency>
<dependency>
    <groupId>org.apache.spark</groupId>
    <artifactId>spark-sql_2.11</artifactId>
    <scope>provided</scope>
</dependency>
<dependency>
    <groupId>org.apache.spark</groupId>
    <artifactId>spark-streaming_2.11</artifactId>
    <scope>provided</scope>
</dependency>
<dependency>
    <groupId>org.apache.spark</groupId>
    <artifactId>spark-streaming-kafka-0-8_2.11</artifactId>
    <version>2.1.1</version>
</dependency>
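Note that only the kafka-0-8 connector lacks provided scope, so it has to ship with the application jar. A minimal sketch of one way to do that with the maven-shade-plugin (the plugin version is an assumption; passing --packages at submit time, shown at the end, works too):
<build>
    <plugins>
        <!-- Sketch: bundles the non-provided Kafka connector into the job jar -->
        <plugin>
            <groupId>org.apache.maven.plugins</groupId>
            <artifactId>maven-shade-plugin</artifactId>
            <version>3.1.0</version>
            <executions>
                <execution>
                    <phase>package</phase>
                    <goals>
                        <goal>shade</goal>
                    </goals>
                </execution>
            </executions>
        </plugin>
    </plugins>
</build>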
2. Straight to the code
import kafka.common.TopicAndPartition
import kafka.message.MessageAndMetadata
import kafka.serializer.StringDecoder
import org.apache.kafka.clients.consumer.ConsumerConfig
import org.apache.log4j.{Level, Logger}
import org.apache.spark.SparkConf
import org.apache.spark.rdd.RDD
import org.apache.spark.streaming.dstream.InputDStream
import org.apache.spark.streaming.kafka.KafkaCluster.Err
import org.apache.spark.streaming.kafka.{HasOffsetRanges, KafkaCluster, KafkaUtils, OffsetRange}
import org.apache.spark.streaming.{Seconds, StreamingContext}

import scala.collection.immutable.HashMap

object BIStreamingZK {
  def main(args: Array[String]): Unit = {
    // Quiet Spark's verbose INFO logging
    Logger.getLogger("org").setLevel(Level.WARN)
    val sparkConf = new SparkConf().setAppName("bi_stream_analyse_zk")
    // Enable backpressure so the ingestion rate adapts to processing speed
    sparkConf.set("spark.streaming.backpressure.enabled", "true")
    // Per-batch cap = partitions * batch seconds * rate: 3 * 5 * 100 = 1500 records
    sparkConf.set("spark.streaming.kafka.maxRatePerPartition", "100")
    val ssc = new StreamingContext(sparkConf, Seconds(5))
    // Topics to consume
    val topics = Set("topic1", "topic2")
    // Consumer group id
    val groupId = "bi_stream_analyse_bill_state_zk"
    val kfkParams = Map[String, String](
      "zookeeper.connect" -> "server1:21810,server2:21810,server3:21810",
      ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG -> "kafkaserver1:9092,kafkaserver2:9092,kafkaserver3:9092",
      // The 0.8 consumer expects "largest"/"smallest", not "latest"/"earliest"
      ConsumerConfig.AUTO_OFFSET_RESET_CONFIG -> "largest",
      ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG -> "org.apache.kafka.common.serialization.StringDeserializer",
      ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG -> "org.apache.kafka.common.serialization.StringDeserializer",
      ConsumerConfig.GROUP_ID_CONFIG -> groupId
    )
    // KafkaCluster maintains the consumer offsets (stored in ZooKeeper)
    val kafkaCluster = new KafkaCluster(kfkParams)
    // Recover the stored offsets (0 for a group that has never consumed)
    val fromOffset: Map[TopicAndPartition, Long] = getOffset(kafkaCluster, topics, groupId)
    // Create the direct DStream, starting from the recovered offsets
    val kafkaDStream: InputDStream[String] = KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder, String](
      ssc,
      kfkParams,
      fromOffset,
      (message: MessageAndMetadata[String, String]) => message.message())
    // Business logic
    kafkaDStream.foreachRDD(rdd => {
      if (!rdd.isEmpty()) {
        dealRdd(rdd)
      }
    })
    // Persist the offsets back after each batch
    setOffset(kafkaCluster, kafkaDStream, groupId)
    ssc.start()
    ssc.awaitTermination()
  }
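  // Hypothetical stub, not in the original post: the business logic behind
  // dealRdd is elided there, so this placeholder only keeps the example compilable.
  def dealRdd(rdd: RDD[String]): Unit = {
    rdd.foreachPartition(partition => partition.foreach(println))
  }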
  def getOffset(kafkaCluster: KafkaCluster, topics: Set[String], groupId: String): Map[TopicAndPartition, Long] = {
    var partitionToLong = new HashMap[TopicAndPartition, Long]
    // Look up every partition of the requested topics
    val topicAndPartitionsEither: Either[Err, Set[TopicAndPartition]] = kafkaCluster.getPartitions(topics)
    // Right means the topics exist
    if (topicAndPartitionsEither.isRight) {
      val topicAndPartitions: Set[TopicAndPartition] = topicAndPartitionsEither.right.get
      val topicAndPartitionToLongEither: Either[Err, Map[TopicAndPartition, Long]] = kafkaCluster.getConsumerOffsets(groupId, topicAndPartitions)
      if (topicAndPartitionToLongEither.isLeft) {
        // This group has never consumed: start every partition at 0. If Kafka's
        // earliest retained offset is already greater than 0 this throws
        // OffsetOutOfRange; see the sketch after this method.
        for (topicAndPartition <- topicAndPartitions) {
          partitionToLong += (topicAndPartition -> 0L)
        }
      } else {
        // Offsets exist, but if they have aged out of Kafka's retention window
        // (7 days by default) the same OffsetOutOfRange exception is thrown
        val value: Map[TopicAndPartition, Long] = topicAndPartitionToLongEither.right.get
        partitionToLong ++= value
      }
    }
    partitionToLong
  }
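  // --- Hedged sketch, not in the original post ---
  // The comments above mention handling OffsetOutOfRange later without showing it.
  // One way, using KafkaCluster.getEarliestLeaderOffsets, is to clamp every saved
  // offset to the earliest offset Kafka still retains before creating the stream:
  def clampOffsets(kafkaCluster: KafkaCluster, fromOffset: Map[TopicAndPartition, Long]): Map[TopicAndPartition, Long] = {
    val earliestEither = kafkaCluster.getEarliestLeaderOffsets(fromOffset.keySet)
    if (earliestEither.isRight) {
      val earliest = earliestEither.right.get
      fromOffset.map { case (tp, offset) =>
        // A saved offset below the earliest retained one has aged out
        val floor = earliest.get(tp).map(_.offset).getOrElse(offset)
        tp -> math.max(offset, floor)
      }
    } else {
      fromOffset
    }
  }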
  def setOffset(kafkaCluster: KafkaCluster, kafkaDstream: InputDStream[String], groupId: String): Unit = {
    kafkaDstream.foreachRDD(rdd => {
      var partitionToLong = new HashMap[TopicAndPartition, Long]
      // RDDs from the direct stream carry the offset range of each partition
      val offsetRanges: HasOffsetRanges = rdd.asInstanceOf[HasOffsetRanges]
      val ranges: Array[OffsetRange] = offsetRanges.offsetRanges
      for (range <- ranges) {
        partitionToLong += (range.topicAndPartition() -> range.untilOffset)
      }
      // Commit the end offset of each range back to ZooKeeper
      kafkaCluster.setConsumerOffsets(groupId, partitionToLong)
    })
  }
}
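To submit the job: the provided Spark artifacts come from the cluster, and the connector can be pulled in with --packages if it was not shaded into the jar. A sketch of the command, where the master and jar name are assumptions:
spark-submit \
  --master yarn \
  --class BIStreamingZK \
  --packages org.apache.spark:spark-streaming-kafka-0-8_2.11:2.1.1 \
  bi-streaming.jar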