sparkStreaming 实现kafka offset自定义保存

12 篇文章 1 订阅
6 篇文章 0 订阅

KafkaUtils.createDirectStream

区别Receiver接收数据,这种方式定期地从kafka的topic+partition中查询最新的偏移量,再根据偏移量范围在每个batch里面处理数据,使用的是kafka的简单消费者api 
优点: 
A、 简化并行,不需要多个kafka输入流,该方法将会创建和kafka分区一样的rdd个数,而且会从kafka并行读取。 
B、高效,这种方式并不需要WAL,WAL模式需要对数据复制两次,第一次是被kafka复制,另一次是写到wal中 
C、恰好一次语义(Exactly-once-semantics),传统的读取kafka数据是通过kafka高层次api把偏移量写入zookeeper中,存在数据丢失的可能性是zookeeper中和ssc的偏移量不一致。EOS通过实现kafka低层次api,偏移量仅仅被ssc保存在checkpoint中,消除了zk和ssc偏移量不一致的问题。缺点是无法使用基于zookeeper的kafka监控工具

offset保存于Redis中,首先需要RedisUtil的编写:

import org.apache.commons.pool2.impl.GenericObjectPoolConfig
import redis.clients.jedis.JedisPool

/**
 * @author https://blog.csdn.net/qq_38704184
 * @package pool
 * @date 2019/11/25 10:09
 * @version 1.0
 */
/**
 * Lazily-initialized, process-wide JedisPool holder.
 *
 * Marked Serializable so the object reference can travel inside Spark closures;
 * the pool itself is @transient and is re-created on each JVM via makePool.
 */
object InternalRedisClient extends Serializable {
  // Not serialized with the object; each JVM must call makePool before getPool.
  @transient private var pool: JedisPool = null

  /**
   * Create the pool with default connection-test flags (testOnBorrow = true,
   * testOnReturn = false) and a 10s max borrow wait.
   */
  def makePool(redisHost: String, redisPort: Int, redisTimeout: Int,
               maxTotal: Int, maxIdle: Int, minIdle: Int): Unit = {
    makePool(redisHost, redisPort, redisTimeout, maxTotal, maxIdle, minIdle, true, false, 10000)
  }

  /**
   * Create the singleton pool if it does not exist yet.
   *
   * Synchronized to close the check-then-act race the original had: two threads
   * could both observe pool == null and each build a JedisPool, leaking one.
   */
  def makePool(redisHost: String, redisPort: Int, redisTimeout: Int,
               maxTotal: Int, maxIdle: Int, minIdle: Int, testOnBorrow: Boolean,
               testOnReturn: Boolean, maxWaitMillis: Long): Unit = synchronized {
    if (pool == null) {
      val poolConfig = new GenericObjectPoolConfig()
      poolConfig.setMaxTotal(maxTotal)
      poolConfig.setMaxIdle(maxIdle)
      poolConfig.setMinIdle(minIdle)
      poolConfig.setTestOnBorrow(testOnBorrow)
      poolConfig.setTestOnReturn(testOnReturn)
      poolConfig.setMaxWaitMillis(maxWaitMillis)
      pool = new JedisPool(poolConfig, redisHost, redisPort, redisTimeout)

      // Release all pooled connections on JVM shutdown. sys.addShutdownHook
      // already takes a by-name body, so no explicit Thread wrapper is needed.
      sys.addShutdownHook {
        pool.destroy()
      }
    }
  }

  /** Return the initialized pool; fails fast if makePool was never called. */
  def getPool: JedisPool = {
    assert(pool != null, "InternalRedisClient.makePool must be called before getPool")
    pool
  }
}

初始化Redis Pool:

/**
 * Initialize the shared Redis connection pool with fixed local-dev settings
 * (localhost:6379, 30s socket timeout, up to 20 connections).
 *
 * Side-effecting (mutates InternalRedisClient's singleton), hence the explicit
 * Unit return type. Idempotent: makePool is a no-op once the pool exists.
 */
def initRedisPool: Unit = {
    // redis configuration
    val maxTotal = 20
    val maxIdle = 10
    val minIdle = 1
    val redisHost = "127.0.0.1"
    val redisPort = 6379
    val redisTimeout = 30000
    InternalRedisClient.makePool(redisHost, redisPort, redisTimeout, maxTotal, maxIdle, minIdle)
  }

获取上次提交的offset:

/**
 * Restore the last committed offset for every partition of a topic from Redis.
 *
 * Keys have the form "<groupId>_<topicName>_<partition>"; a missing key means the
 * partition was never committed and its offset defaults to 0.
 *
 * @param groupId    consumer group whose offsets are being restored
 * @param topicName  kafka topic name
 * @param partitions total number of partitions of the topic
 * @return map of every TopicPartition to its starting offset
 */
def getLastCommittedOffsets(groupId: String, topicName: String, partitions: Int): Map[TopicPartition, Long] = {
    if (LOG.isInfoEnabled())
      LOG.info("||--Topic:{},getLastCommittedOffsets from Redis--||", topicName)

    val jedis: Jedis = InternalRedisClient.getPool.getResource
    try {
      (0 until partitions).map { partition =>
        val key = groupId + "_" + topicName + "_" + partition
        val lastSavedOffset = jedis.get(key)
        // Redis returns null for an absent key -> start this partition at 0.
        val lastOffset = if (lastSavedOffset == null) 0L else lastSavedOffset.toLong
        new TopicPartition(topicName, partition) -> lastOffset
      }.toMap
    } finally {
      // Always return the connection to the pool, even if a Redis call throws
      // (the original leaked the connection on failure).
      jedis.close()
    }
  }

开启streaming处理:

/**
 * Entry point: consume a kafka topic with manually-managed offsets.
 *
 * Flow per batch: restore start offsets from Redis -> process records ->
 * write each partition's untilOffset back to Redis inside a MULTI/EXEC
 * pipeline, so a restart resumes exactly where the last batch ended.
 */
def main(args: Array[String]): Unit = {
    Logger.getLogger("org").setLevel(Level.WARN)

    // Initialize the shared Redis pool before any offsets are read.
    initRedisPool

    val conf: SparkConf = new SparkConf().setAppName(this.getClass.getSimpleName).setMaster("local[*]")
    val ssc = new StreamingContext(conf, Seconds(60))

    val topic: String = "mysql_store_offset"
    val group: String = "mysql_offset"
    // The maximum number of records returned in a single call to poll.
    val maxPoll = 2000

    val kafkaParams = Map(
      ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG -> "cm01:9092,cm02:9092,cm03:9092",
      ConsumerConfig.GROUP_ID_CONFIG -> group,
      // Offsets are committed manually to Redis, so kafka auto-commit must stay off.
      ConsumerConfig.ENABLE_AUTO_COMMIT_CONFIG -> "false",
      ConsumerConfig.MAX_POLL_RECORDS_CONFIG -> maxPoll.toString,
      ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG -> classOf[StringDeserializer],
      ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG -> classOf[StringDeserializer]
    )
    // The topic's total partition count (3) is hard-coded here.
    val topicPartitionToLong: Map[TopicPartition, Long] = getLastCommittedOffsets(group, topic, 3)
    // Build the direct stream starting from the offsets restored above.
    val kafkaTopicDS: InputDStream[ConsumerRecord[String, String]] = KafkaUtils.createDirectStream(
      ssc,
      LocationStrategies.PreferConsistent,
      ConsumerStrategies.Assign[String, String](topicPartitionToLong.keys.toList, kafkaParams, topicPartitionToLong)
    )

    kafkaTopicDS.foreachRDD(rdd => {
      val offsetRanges: Array[OffsetRange] = rdd.asInstanceOf[HasOffsetRanges].offsetRanges
      if (!rdd.isEmpty()) {
        val jedis: Jedis = InternalRedisClient.getPool.getResource
        try {
          val p: Pipeline = jedis.pipelined()
          // Open a MULTI transaction so all partition offsets commit atomically.
          p.multi()

          // Process the batch (here: just print every record).
          rdd.foreach(record => {
            println(s"${record.topic()},${record.partition()},${record.offset()},${record.value()}")
          })
          // Persist the end offset of every partition handled in this batch.
          offsetRanges.foreach(offsetRange => {
            println(s"partition:${offsetRange.partition},fromOffset:${offsetRange.fromOffset},untilOffset:${offsetRange.untilOffset}")
            // Key MUST match the one read in getLastCommittedOffsets:
            // <group>_<topic>_<partition>. The original appended the partition
            // twice, so saved offsets were never found on restart and the job
            // always replayed from offset 0.
            val groupTopicPartitionKey = s"${group}_${offsetRange.topic}_${offsetRange.partition}"
            p.set(groupTopicPartitionKey, offsetRange.untilOffset.toString)
          })
          // Commit the transaction, then flush the pipeline.
          p.exec()
          p.sync()
        } finally {
          // Return the connection to the pool even if the batch fails
          // (the original leaked it on any exception inside the block).
          jedis.close()
        }
      }
    })
    ssc.start()
    ssc.awaitTermination()
  }

 

  • 0
    点赞
  • 2
    收藏
    觉得还不错? 一键收藏
  • 1
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值