SparkStreaming简易实现Kafka偏移量保存

import kafka.common.TopicAndPartition
import kafka.message.MessageAndMetadata
import kafka.serializer.StringDecoder
import org.apache.spark.streaming.dstream.InputDStream
import org.apache.spark.streaming.kafka.KafkaCluster.Err
import org.apache.spark.streaming.kafka.{HasOffsetRanges, KafkaCluster, KafkaUtils, OffsetRange}
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.streaming.{Seconds, StreamingContext}


/**
  * Spark Streaming word count over a Kafka topic that manages consumer offsets by hand.
  *
  * Starting offsets are read through [[KafkaCluster]] before the direct stream is created,
  * and after every batch the `untilOffset` of each consumed range is committed back under
  * `groupId`.
  */
object WordCount3 {
  val topics = Set("wordcount")
  val groupId = "wc"

  // The key must be spelled "group.id"; a misspelled key is not picked up as the group id.
  private val params: Map[String, String] = Map[String, String](
    "bootstrap.servers" -> "node-01:9092,node-02:9092,node-03:9092",
    "group.id" -> groupId
  )
  // KafkaCluster is used both to read and to commit consumer offsets.
  private val cluster = new KafkaCluster(params)

  /**
    * Builds the streaming job: creates the direct stream from the stored offsets, prints the
    * word counts of each batch, registers the offset commit and blocks until termination.
    *
    * @param args command-line arguments (unused)
    */
  def main(args: Array[String]): Unit = {
    val conf: SparkConf = new SparkConf().setMaster("local[*]").setAppName("WordCount1")
    val sc = new SparkContext(conf)
    sc.setLogLevel("ERROR")
    val ssc = new StreamingContext(sc, Seconds(3))

    val sourceStream: InputDStream[String] = KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder, String](
      ssc,
      params,
      readOffsets,
      (handler: MessageAndMetadata[String, String]) => handler.message()
    )
    sourceStream.flatMap(_.split("\\W+")).map((_, 1)).reduceByKey(_ + _).print(1000)

    // Registered after print(), so the commit is a later output operation of the same batch.
    saveOffsets(sourceStream)
    ssc.start()
    ssc.awaitTermination()
  }

  /**
    * Reads the starting offset of every partition of `topics`.
    *
    * If the committed offsets of `groupId` can be fetched they are returned as-is. Otherwise
    * (typically the first run of this group) every partition starts at its earliest offset.
    * If the partitions themselves cannot be looked up, the error is reported on stderr and an
    * empty map is returned.
    *
    * @return starting offset per topic/partition; empty when partition lookup failed
    */
  def readOffsets: Map[TopicAndPartition, Long] =
    cluster.getPartitions(topics) match {
      case Right(partitions) =>
        cluster.getConsumerOffsets(groupId, partitions) match {
          // Every partition already has a committed offset.
          case Right(committed) => committed
          // No usable committed offsets: start from the beginning of each partition.
          case Left(_) => earliestOffsets(partitions)
        }
      case Left(errors) =>
        // Kept best-effort (empty map) as before, but no longer silent.
        System.err.println(
          s"Could not fetch partitions for topics $topics: ${errors.mkString("; ")}")
        Map.empty[TopicAndPartition, Long]
    }

  /**
    * Asks the brokers for the earliest offset of each partition.
    *
    * Falls back to 0 for every partition only when the lookup fails; 0 may be out of range if
    * the beginning of the log has already been removed.
    */
  private def earliestOffsets(partitions: Set[TopicAndPartition]): Map[TopicAndPartition, Long] =
    cluster.getEarliestLeaderOffsets(partitions) match {
      case Right(leaderOffsets) =>
        leaderOffsets.map { case (topicAndPartition, leaderOffset) =>
          topicAndPartition -> leaderOffset.offset
        }
      case Left(errors) =>
        System.err.println(
          s"Could not fetch earliest offsets, falling back to 0: ${errors.mkString("; ")}")
        partitions.map(topicAndPartition => topicAndPartition -> 0L).toMap
    }

  /**
    * Commits, after each batch, the end offset of every consumed range under `groupId`.
    *
    * @param sourceStream the stream returned by `createDirectStream`; its RDDs must still
    *                     implement [[HasOffsetRanges]], so pass the untransformed stream
    */
  def saveOffsets(sourceStream: InputDStream[String]): Unit = {
    // The function passed to foreachRDD is invoked once per batch.
    sourceStream.foreachRDD { rdd =>
      // The RDDs of the direct stream carry the offset ranges they were built from.
      val ranges: Array[OffsetRange] = rdd.asInstanceOf[HasOffsetRanges].offsetRanges
      val untilOffsets: Map[TopicAndPartition, Long] =
        ranges.map(range => range.topicAndPartition() -> range.untilOffset).toMap
      cluster.setConsumerOffsets(groupId, untilOffsets) match {
        case Left(errors) =>
          // A failed commit is reported instead of being dropped on the floor.
          System.err.println(
            s"Failed to commit offsets for group $groupId: ${errors.mkString("; ")}")
        case Right(_) => ()
      }
    }
  }
}
  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 1
    评论
评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值