The direct_offset_zookeeper object:
/**
 *
 * @description: Spark/Kafka 0-10 integration with manually committed offsets, maintained in ZooKeeper
 * @author: wanjintao
 * @time: 2020/6/27 13:48
 *
 */
package com.offsetzookeeper
import org.apache.kafka.common.TopicPartition
import org.apache.kafka.common.serialization.StringDeserializer
import org.apache.log4j.{Level, Logger}
import org.apache.spark.streaming.kafka010.ConsumerStrategies.Subscribe
import org.apache.spark.streaming.kafka010.LocationStrategies.PreferConsistent
import org.apache.spark.streaming.kafka010.{HasOffsetRanges, KafkaUtils, OffsetRange}
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.{SparkConf, SparkContext, TaskContext}
// Manually control the offsets at which Spark consumes from Kafka,
// so that no data is lost when the job restarts after an unexpected exit
object direct_offset_zookeeper {
  Logger.getLogger("org").setLevel(Level.WARN)
  // Handle to the ZkWork singleton, for the ZooKeeper operations below
  val zk = ZkWork
  def main(args: Array[String]): Unit = {
    // args: broker list, topic, consumer group id, batch interval in seconds
    val Array(brokers, topic, group, sec) = args
    val conf = new SparkConf().setAppName("direct_offset_zookeeper").setMaster("local[*]")
    val sc = new SparkContext(conf)
    val ssc = new StreamingContext(sc, Seconds(sec.toInt))
    val topics = Array(topic)
    val kafkaParams = Map[String, Object](
      "bootstrap.servers" -> brokers,
      "key.deserializer" -> classOf[StringDeserializer],
      "value.deserializer" -> classOf[StringDeserializer],
      "group.id" -> group,
      "auto.offset.reset" -> "earliest", // or "latest"
      "enable.auto.commit" -> (false: java.lang.Boolean)
      // If batches take too long to process, you can raise the maximum poll interval
      // (max.poll.interval.ms) or lower the maximum number of records returned by one poll() (max.poll.records)
      //"max.poll.interval.ms" -> "KAFKA_MAX_POLL_INTERVAL_MS",
      //"max.poll.records" -> "KAFKA_MAX_POLL_RECORDS"
    )
    // Check whether ZooKeeper already holds saved offsets for this job.
    // If not, start the stream without an offsets argument and save the offsets once a batch completes.
    // The crux is the KafkaUtils.createDirectStream call:
    // by default it is KafkaUtils.createDirectStream[String, String](ssc, PreferConsistent, Subscribe[String, String](topics, kafkaParams)), with no offsets argument,
    // but digging through the source shows that Subscribe also accepts a map of starting offsets
    val stream = if (zk.znodeIsExists(s"${topic}offset")) {
      val nor = zk.znodeDataGet(s"${topic}offset")
      // Build a map keyed by (topic, partition) with the offset as value
      val newOffset = Map(new TopicPartition(nor(0).toString, nor(1).toInt) -> nor(2).toLong)
      println(s"[ DealFlowBills2 ] --------------------------------------------------------------------")
      println(s"[ DealFlowBills2 ] topic ${nor(0).toString}")
      println(s"[ DealFlowBills2 ] Partition ${nor(1).toInt}")
      println(s"[ DealFlowBills2 ] offset ${nor(2).toLong}")
      println(s"[ DealFlowBills2 ] Kafka offsets read back from zk ★★★ $newOffset")
      println(s"[ DealFlowBills2 ] --------------------------------------------------------------------")
      KafkaUtils.createDirectStream[String, String](ssc, PreferConsistent, Subscribe[String, String](topics, kafkaParams, newOffset))
    } else {
      println(s"[ DealFlowBills2 ] --------------------------------------------------------------------")
      println(s"[ DealFlowBills2 ] First run, no offset znode in zk")
      println(s"[ DealFlowBills2 ] Creating the znode ${topic}offset manually; computation defaults to partition 0, offset 0")
      println(s"[ DealFlowBills2 ] --------------------------------------------------------------------")
      zk.znodeCreate(s"${topic}offset", s"$topic,0,0")
      val nor = zk.znodeDataGet(s"${topic}offset")
      val newOffset = Map(new TopicPartition(nor(0).toString, nor(1).toInt) -> nor(2).toLong)
      KafkaUtils.createDirectStream[String, String](ssc, PreferConsistent, Subscribe[String, String](topics, kafkaParams, newOffset))
    }
    // Business logic: a simple word count
    val lines = stream.map(_.value)
    val words = lines.flatMap(_.split(" "))
    val wordCounts = words.map(x => (x, 1L)).reduceByKey(_ + _)
    wordCounts.print()
    // Saving the offsets:
    // if a batch fails, the new offsets are not saved, so the job resumes
    // from the previous offsets on restart; offsets are saved only after a batch succeeds
    stream.foreachRDD { rdd =>
      val offsetRanges = rdd.asInstanceOf[HasOffsetRanges].offsetRanges
      rdd.foreachPartition { iter =>
        // Each task looks up the OffsetRange of its own partition
        val o: OffsetRange = offsetRanges(TaskContext.get.partitionId)
        println(s"[ DealFlowBills2 ] --------------------------------------------------------------------")
        println(s"[ DealFlowBills2 ] topic: ${o.topic}")
        println(s"[ DealFlowBills2 ] partition: ${o.partition} ")
        println(s"[ DealFlowBills2 ] fromOffset (start offset): ${o.fromOffset} ")
        println(s"[ DealFlowBills2 ] untilOffset (end offset): ${o.untilOffset} -- the offset to save for the next read ★★★")
        println(s"[ DealFlowBills2 ] --------------------------------------------------------------------")
        // Write to ZooKeeper
        // (note: one znode per topic, so with several partitions the tasks overwrite
        // each other -- see the sketch after the ZkWork listing below)
        zk.offsetWork(s"${o.topic}offset", s"${o.topic},${o.partition},${o.untilOffset}")
        // Or write to the local file system (requires java.io.{File, FileWriter}):
        // val fw = new FileWriter(new File("/home/hadoop1/testjar/test.log"), true)
        // fw.write(s"${o.topic},${o.partition},${o.untilOffset}")
        // fw.close()
      }
    }
    // Finally, the results could be saved to HDFS:
    //result.saveAsTextFiles(output + s"/output/" + "010")
    // Start Spark Streaming
    ssc.start()
    ssc.awaitTermination()
  }
}
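
The driver takes four positional arguments: the broker list, the topic, the consumer group id, and the batch interval in seconds. A hypothetical local invocation (broker address, topic, group, and jar name are placeholders, not from the original post) would look like:

spark-submit --class com.offsetzookeeper.direct_offset_zookeeper offset-zookeeper.jar localhost:9092 testtopic testgroup 5

Since setMaster("local[*]") is hard-coded in the SparkConf, no --master flag is needed for a local test.
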
The ZkWork object:
/**
 *
 * @description: Spark/Kafka 0-10 integration with manually committed offsets, maintained in ZooKeeper
 * @author: wanjintao
 * @time: 2020/6/27 14:56
 *
 */
package com.offsetzookeeper
import org.apache.zookeeper.ZooDefs.Ids
import org.apache.zookeeper.{CreateMode, WatchedEvent, Watcher, ZooKeeper}
object ZkWork {
  val TIME_OUT = 5000
  var zooKeeper: ZooKeeper = _
  def watcher = new Watcher() {
    def process(event: WatchedEvent) {
      println(s"[ ZkWork ] process : " + event.getType)
    }
  }
  /** ***************************************************************************************************************
   * Basic methods:
   * connect to zk, create a znode, update a znode's data
   */
  def connect() {
    println(s"[ ZkWork ] zk connect")
    // Note: the ZooKeeper constructor returns before the session is actually
    // established -- see the blocking-connect sketch after this listing
    zooKeeper = new ZooKeeper("10.103.104.181:2181,10.103.104.182:2181,10.103.104.183:2181", TIME_OUT, watcher)
  }
  def znodeCreate(znode: String, data: String) {
    println(s"[ ZkWork ] zk create /$znode , $data")
    zooKeeper.create(s"/$znode", data.getBytes(), Ids.OPEN_ACL_UNSAFE, CreateMode.PERSISTENT)
  }
  def znodeDataSet(znode: String, data: String) {
    println(s"[ ZkWork ] zk data set /$znode")
    zooKeeper.setData(s"/$znode", data.getBytes(), -1)
  }
  /** ***************************************************************************************************************
   * Worker methods:
   * get a znode's data, check whether a znode exists, save the offsets
   */
  def znodeDataGet(znode: String): Array[String] = {
    connect()
    println(s"[ ZkWork ] zk data get /$znode")
    try {
      // getData synchronously fetches the data stored at this path; the boolean flag says whether
      // to register the default Watcher defined above, and the Stat argument (null here) could be
      // used to retrieve the node's version and other metadata
      new String(zooKeeper.getData(s"/$znode", true, null), "utf-8").split(",")
    } catch {
      case _: Exception => Array() // if the read fails (e.g. the znode does not exist), return an empty array
    }
  }
  def znodeIsExists(znode: String): Boolean = {
    connect()
    println(s"[ ZkWork ] zk znode is exists /$znode")
    zooKeeper.exists(s"/$znode", true) match {
      case null => false
      case _ => true
    }
  }
  def offsetWork(znode: String, data: String) {
    connect()
    println(s"[ ZkWork ] offset work /$znode")
    zooKeeper.exists(s"/$znode", true) match {
      case null => znodeCreate(znode, data)
      case _ => znodeDataSet(znode, data)
    }
    println(s"[ ZkWork ] zk close ★★★")
    zooKeeper.close()
  }
}
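
A caveat on ZkWork.connect: the ZooKeeper constructor returns before the session handshake completes, so a getData or exists call issued immediately afterwards can fail with a connection-loss error. A minimal sketch of a blocking variant, assuming the same quorum string (ZkConnect and connectBlocking are hypothetical names, not part of the original code):

import java.util.concurrent.CountDownLatch
import org.apache.zookeeper.Watcher.Event.KeeperState
import org.apache.zookeeper.{WatchedEvent, Watcher, ZooKeeper}

object ZkConnect {
  // Returns a ZooKeeper handle only after the session reaches SyncConnected
  def connectBlocking(quorum: String, timeoutMs: Int): ZooKeeper = {
    val connected = new CountDownLatch(1)
    val zk = new ZooKeeper(quorum, timeoutMs, new Watcher {
      override def process(event: WatchedEvent): Unit =
        if (event.getState == KeeperState.SyncConnected) connected.countDown()
    })
    connected.await() // block until the watcher sees the SyncConnected event
    zk
  }
}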
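
Also note that the scheme above keeps a single topic,partition,offset triple per topic: the first run seeds the znode with partition 0 only, and inside foreachPartition every task overwrites the same ${topic}offset znode, so for a multi-partition topic only one partition's offset survives a restart. A minimal sketch of a per-partition layout, reusing the ZkWork helpers from this post (PerPartitionOffsets, znodeFor, saveOffsets, and readOffsets are hypothetical names, not from the original code):

import org.apache.kafka.common.TopicPartition
import org.apache.spark.streaming.kafka010.OffsetRange

object PerPartitionOffsets {
  // One znode per (topic, partition), e.g. /mytopicoffset-0, /mytopicoffset-1, ...
  def znodeFor(topic: String, partition: Int): String = s"${topic}offset-$partition"

  // Driver side, after a successful batch: persist each partition's untilOffset
  def saveOffsets(offsetRanges: Array[OffsetRange]): Unit =
    offsetRanges.foreach { o =>
      ZkWork.offsetWork(znodeFor(o.topic, o.partition), s"${o.topic},${o.partition},${o.untilOffset}")
    }

  // Driver side, on startup: rebuild the fromOffsets map across all partitions
  def readOffsets(topic: String, numPartitions: Int): Map[TopicPartition, Long] =
    (0 until numPartitions).flatMap { p =>
      val znode = znodeFor(topic, p)
      if (ZkWork.znodeIsExists(znode)) {
        val Array(t, part, off) = ZkWork.znodeDataGet(znode)
        Some(new TopicPartition(t, part.toInt) -> off.toLong)
      } else None
    }.toMap
}

Calling saveOffsets from the driver inside foreachRDD, after the batch's work completes, also keeps all znode writes in one place instead of racing across executor tasks.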