Two implementations are shown below: one stores the consumer offsets back through Kafka/ZooKeeper (via KafkaCluster), the other stores them in MySQL.
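The code targets the legacy low-level direct consumer from the spark-streaming-kafka-0-8 connector. As a minimal sketch, the sbt dependencies could look like this (the version numbers are assumptions, not taken from the original; adjust them to your cluster):

// build.sbt (sketch; versions are placeholders)
libraryDependencies ++= Seq(
  "org.apache.spark" %% "spark-streaming"           % "2.1.1",
  "org.apache.spark" %% "spark-streaming-kafka-0-8" % "2.1.1",
  "com.mchange"      %  "c3p0"                      % "0.9.5.2",
  "mysql"            %  "mysql-connector-java"      % "5.1.47"
)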
import java.sql.{Connection, PreparedStatement, ResultSet}

import kafka.common.TopicAndPartition
import kafka.message.MessageAndMetadata
import kafka.serializer.StringDecoder
import org.apache.kafka.clients.consumer.ConsumerConfig
import org.apache.spark.SparkConf
import org.apache.spark.streaming.dstream.InputDStream
import org.apache.spark.streaming.kafka.KafkaCluster.Err
import org.apache.spark.streaming.kafka.{HasOffsetRanges, KafkaCluster, KafkaUtils, OffsetRange}
import org.apache.spark.streaming.{Seconds, StreamingContext}
// C3p0Utils is a project-local helper; a sketch of it is given at the end of this post

/**
 * @author wade
 * @create 2019-03-14 9:16
 */
object LowerKafkaSource {
  def fromOffsets(kafkaCluster: KafkaCluster, groupId: String, topic: String): Map[TopicAndPartition, Long] = {
    var partitionToLong: Map[TopicAndPartition, Long] = Map[TopicAndPartition, Long]()
    // Fetch all partitions of the topic from the cluster
    val topicAndPartitionEither: Either[Err, Set[TopicAndPartition]] = kafkaCluster.getPartitions(Set(topic))
    if (topicAndPartitionEither.isRight) {
      val partitions: Set[TopicAndPartition] = topicAndPartitionEither.right.get
      // Fetch the offsets this consumer group has committed for those partitions
      val topicAndPartitionToLongEither: Either[Err, Map[TopicAndPartition, Long]] = kafkaCluster.getConsumerOffsets(groupId, partitions)
      // Also fetch the earliest offset still available on each partition's leader
      // (.right.get assumes the leaders are reachable; it throws otherwise)
      val earliest: Either[Err, Map[TopicAndPartition, KafkaCluster.LeaderOffset]] = kafkaCluster.getEarliestLeaderOffsets(partitions)
      val earliestTopicAndPartitionToLong: Map[TopicAndPartition, Long] = earliest.right.get.map(t => (t._1, t._2.offset))
      if (topicAndPartitionToLongEither.isRight) {
        val topicAndPartitionToLong: Map[TopicAndPartition, Long] = topicAndPartitionToLongEither.right.get
        // The group has committed offsets before. A committed offset can still be
        // smaller than the partition's earliest available offset (Kafka may have
        // deleted expired segments since the last commit), so merge the two maps
        // and take the larger value per partition.
        println("topicAndPartitionToLong" + topicAndPartitionToLong)
        println("earliestTopicAndPartitionToLong" + earliestTopicAndPartitionToLong)
        var partitionToLongFinal: Map[TopicAndPartition, Long] = Map()
        for (current <- topicAndPartitionToLong) {
          for (earliest <- earliestTopicAndPartitionToLong) {
            if (current._1 == earliest._1) {
              partitionToLongFinal += (current._1 -> math.max(earliest._2, current._2))
            }
          }
        }
        println("current offsets: " + partitionToLongFinal)
        println("earliest offsets: " + earliestTopicAndPartitionToLong)
        // ++= builds a new map and reassigns it to the var
        partitionToLong ++= partitionToLongFinal
      } else {
        // No committed offsets for this group: seed from the earliest available offsets
        // (seeding from the latest offsets would also be valid, depending on requirements)
        println(earliestTopicAndPartitionToLong)
        println("no committed offsets; seeding from the earliest available offsets")
        partitionToLong ++= earliestTopicAndPartitionToLong
        // Do NOT seed with 0L here, e.g.
        //   for (topicAndPartition <- partitions) partitionToLong += (topicAndPartition -> 0L)
        // A group that has never committed must start from the current earliest offset:
        // data expires over time, so some offsets after 0 may already be deleted, and
        // starting at 0L would raise an OffsetOutOfRangeException.
      }
    } else {
      throw new RuntimeException("partitions of topic not found")
    }
    partitionToLong
  }
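  // Note: for a group that has never committed anything, the offset fetch typically
  // comes back as Left rather than Right, which is what routes execution into the
  // else branch above and seeds the group from the earliest offsets.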
  def setOffset(kafkaCluster: KafkaCluster, groupId: String, kafkaDirectStream: InputDStream[String]) = {
    kafkaDirectStream.foreachRDD(rdd => {
      // All of this must stay INSIDE foreachRDD: the offset ranges are per batch
      var offsets: Map[TopicAndPartition, Long] = Map()
      val ranges: HasOffsetRanges = rdd.asInstanceOf[HasOffsetRanges]
      val offsetRanges: Array[OffsetRange] = ranges.offsetRanges
      for (elem <- offsetRanges) {
        offsets += (elem.topicAndPartition() -> elem.untilOffset)
      }
      kafkaCluster.setConsumerOffsets(groupId, offsets)
    })
  }
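  // Design note: output operations run in registration order per batch, so the commit
  // in this foreachRDD happens after the print() registered earlier in main. That makes
  // the pipeline at-least-once: if the job dies between output and commit, the batch is
  // reprocessed on restart. Exactly-once would additionally need an idempotent or
  // transactional sink, which is outside the scope of this example.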
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setMaster("local[*]").setAppName("lower")
    val ssc = new StreamingContext(conf, Seconds(3))
    val topic = "my"
    val groupId = "big10151"
    val brokers = "hadoop103:9092,hadoop104:9092,hadoop105:9092"
    val deserializer = "org.apache.kafka.common.serialization.StringDeserializer"
    val kafkaParams = Map(
      //"zookeeper.connect" -> "hadoop103:2181,hadoop104:2181,hadoop105:2181",
      ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG -> brokers,
      ConsumerConfig.GROUP_ID_CONFIG -> groupId,
      ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG -> deserializer,
      ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG -> deserializer
    )
    val kafkaCluster = new KafkaCluster(kafkaParams)
    // The fifth type parameter is the return type of the message handler passed last: message.message()
    val kafkaDirectStream = KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder, String](
      ssc,
      kafkaParams,
      //fromOffsets(kafkaCluster, groupId, topic),
      fromOffsetsMysql(kafkaCluster, groupId, topic),
      // The handler decides what each record becomes; it can return the key, the value, or both
      (message: MessageAndMetadata[String, String]) => message.message()
    )
    kafkaDirectStream.print()
    // Commit the latest offsets after each batch
    //setOffset(kafkaCluster, groupId, kafkaDirectStream)
    setOffsetMysql(kafkaCluster, groupId, kafkaDirectStream)
    ssc.start()
    ssc.awaitTermination()
  }
  def fromOffsetsMysql(kafkaCluster: KafkaCluster, groupId: String, topic: String): Map[TopicAndPartition, Long] = {
    // 1. Look up all partitions of the topic through KafkaCluster
    val topicAndPartitionsEither: Either[Err, Set[TopicAndPartition]] = kafkaCluster.getPartitions(Set(topic))
    var topicAndPartitionsToLong = Map[TopicAndPartition, Long]()
    if (topicAndPartitionsEither.isRight) {
      // 2. The MySQL primary key is topic + group + partition,
      //    stored in table tbl_offset as (topic_group_partition, offset_info)
      val topicAndPartitions: Set[TopicAndPartition] = topicAndPartitionsEither.right.get
      val offsetKeys = topicAndPartitions.map(tap => (tap.topic + "_" + groupId + "_" + tap.partition, tap))
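      // Sketch of the assumed table definition (the original does not show it; offsets
      // are Long, so offset_info should be BIGINT rather than INT):
      //   CREATE TABLE tbl_offset (
      //     topic_group_partition VARCHAR(200) PRIMARY KEY,
      //     offset_info           BIGINT NOT NULL
      //   );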
      /*
        First fetch the earliest available offset of every partition from the Kafka cluster:
        1. if MySQL holds no offset for a partition, default to this earliest offset;
        2. if the offset read from MySQL is smaller than it (the data has expired),
           use the earliest one instead, otherwise an exception would be thrown.
      */
      val topicAndPartitionEither: Either[Err, Map[TopicAndPartition, KafkaCluster.LeaderOffset]] = kafkaCluster.getEarliestLeaderOffsets(topicAndPartitions)
      val earliestTopicAndPartitions: Map[TopicAndPartition, Long] = topicAndPartitionEither.right.get.map(t => (t._1, t._2.offset))
      val conn: Connection = C3p0Utils.getConnection
      offsetKeys.foreach(key => {
        // Look up the committed offset for this topic_group_partition key
        val sql = "SELECT offset_info FROM tbl_offset WHERE topic_group_partition = ?"
        val ps: PreparedStatement = conn.prepareStatement(sql)
        ps.setString(1, key._1)
        val rs: ResultSet = ps.executeQuery()
        if (rs.next()) {
          // Offsets are Long; read them as Long to avoid truncation
          val offset: Long = rs.getLong(1)
          // Compare the offset from MySQL with the earliest available offset
          val earliestOffset = earliestTopicAndPartitions(key._2)
          if (offset < earliestOffset) {
            topicAndPartitionsToLong += (key._2 -> earliestOffset)
          } else {
            topicAndPartitionsToLong += (key._2 -> offset)
          }
        } else {
          // Nothing stored yet: fall back to the earliest offset
          topicAndPartitionsToLong += (key._2 -> earliestTopicAndPartitions(key._2))
        }
        rs.close()
        ps.close()
      })
      conn.close()
    }
    topicAndPartitionsToLong
  }
  def setOffsetMysql(kafkaCluster: KafkaCluster, groupId: String, kafkaDirectStream: InputDStream[String]) = {
    kafkaDirectStream.foreachRDD(rdd => {
      val offsetRanges: HasOffsetRanges = rdd.asInstanceOf[HasOffsetRanges]
      val ranges: Array[OffsetRange] = offsetRanges.offsetRanges
      val conn: Connection = C3p0Utils.getConnection
      for (range <- ranges) {
        val tap: TopicAndPartition = range.topicAndPartition()
        val key = tap.topic + "_" + groupId + "_" + tap.partition
        // Upsert: insert the key on first sight, otherwise overwrite the stored offset
        val sql = "INSERT INTO tbl_offset VALUES (?,?) ON DUPLICATE KEY UPDATE offset_info = ?"
        val ps: PreparedStatement = conn.prepareStatement(sql)
        ps.setString(1, key)
        // untilOffset is a Long; do not truncate it to Int
        ps.setLong(2, range.untilOffset)
        ps.setLong(3, range.untilOffset)
        ps.executeUpdate()
        ps.close()
      }
      conn.close()
    })
  }
}
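C3p0Utils is referenced above but never shown. A minimal sketch of what such a helper could look like, assuming a c3p0 ComboPooledDataSource in front of the MySQL instance used here (the JDBC URL, database name, user, and password are placeholders, not taken from the original):

import java.sql.Connection
import com.mchange.v2.c3p0.ComboPooledDataSource

object C3p0Utils {
  // The pool is created once per JVM; c3p0 also picks up settings from
  // c3p0-config.xml on the classpath if one is present
  private val dataSource = new ComboPooledDataSource()
  dataSource.setDriverClass("com.mysql.jdbc.Driver")
  dataSource.setJdbcUrl("jdbc:mysql://hadoop103:3306/streaming") // placeholder URL
  dataSource.setUser("root")                                     // placeholder user
  dataSource.setPassword("123456")                               // placeholder password

  def getConnection: Connection = dataSource.getConnection
}

Callers such as fromOffsetsMysql and setOffsetMysql borrow a Connection from the pool and close() it when done, which returns it to the pool rather than tearing down the TCP connection each batch.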