package com.demo.cn.streaming
import kafka.utils.ZkUtils
import org.apache.hadoop.hbase.filter.PrefixFilter
import org.apache.hadoop.hbase.util.Bytes
import org.apache.hadoop.hbase.{HBaseConfiguration, TableName}
import org.apache.hadoop.hbase.client.{ConnectionFactory, Put, Scan}
import org.apache.kafka.clients.consumer.ConsumerRecord
import org.apache.kafka.common.TopicPartition
import org.apache.kafka.common.serialization.StringDeserializer
import org.apache.spark.sql.SparkSession
import org.apache.spark.streaming.kafka010.ConsumerStrategies._
import org.apache.spark.streaming.kafka010.{HasOffsetRanges, KafkaUtils, OffsetRange}
import org.apache.spark.streaming.kafka010.LocationStrategies._
import org.apache.spark.streaming.{Seconds, StreamingContext}
/**
* Created by codeman
*/
object KafkaOffsetsBlogStreamingDriver {
  def main(args: Array[String]): Unit = {
    /*
    if (args.length < 6) {
      System.err.println("Usage: KafkaDirectStreamTest <batch-duration-in-seconds> <kafka-bootstrap-servers> " +
        "<kafka-topics> <kafka-consumer-group-id> <hbase-table-name> <kafka-zookeeper-quorum>")
      System.exit(1)
    }
    */
    // batch interval in seconds
    val batchDuration = 1
    val bootstrapServers = "10.1.69.11:6667,10.1.69.12:6667,10.1.69.13:6667"
    // Kafka topics (comma-separated); left as a placeholder here
    val topicsSet = "".split(",").toSet
    // Kafka consumer group id; left as a placeholder here
    val consumerGroupID = ""
    val hbaseTableName = "stream_kafka_offsets"
    val zkQuorum = "10.1.69.11,10.1.69.12,10.1.69.13"
    val zkKafkaRootDir = "kafka"
    val zkSessionTimeOut = 10000
    val zkConnectionTimeOut = 10000
    // directory used for Spark Streaming checkpoints
    val checkDir = "E:\\file\\SparkCheckpoint"
    val topics = topicsSet.toArray
    val topic = topics(0)
    val kafkaParams = Map[String, Object](
      "bootstrap.servers" -> bootstrapServers,
      "key.deserializer" -> classOf[StringDeserializer],
      "value.deserializer" -> classOf[StringDeserializer],
      "group.id" -> consumerGroupID,
      "auto.offset.reset" -> "earliest",
      "enable.auto.commit" -> (false: java.lang.Boolean)
    )
    // recover the StreamingContext from the checkpoint directory if one exists, otherwise create a new one
    val ssc = StreamingContext.getOrCreate(checkDir, () => functionToCreateContext(topic, consumerGroupID, hbaseTableName, zkQuorum, zkKafkaRootDir,
      batchDuration, zkSessionTimeOut, zkConnectionTimeOut, kafkaParams, checkDir))
    ssc.start()
    ssc.awaitTermination()
  }
  def functionToCreateContext(topic: String, consumerGroupID: String, hbaseTableName: String, zkQuorum: String, zkKafkaRootDir: String,
                              batchDuration: Int, zkSessionTimeOut: Int, zkConnectionTimeOut: Int, kafkaParams: Map[String, Object], checkDir: String): StreamingContext = {
    val spark = SparkSession.builder()
      .appName("KafkaOffsetsBlogStreamingDriver")
      .master("local[*]")
      .getOrCreate()
    val sc = spark.sparkContext
    val ssc = new StreamingContext(sc, Seconds(batchDuration.toLong))
    // set the checkpoint directory
    ssc.checkpoint(checkDir)
    // read the last committed offsets for this topic/consumer group from HBase
    val fromOffsets = getLastCommittedOffsets(topic, consumerGroupID, hbaseTableName, zkQuorum, zkKafkaRootDir,
      zkSessionTimeOut, zkConnectionTimeOut)
    // create a direct stream that starts consuming from the recovered offsets
    val inputDStream = KafkaUtils.createDirectStream[String, String](ssc, PreferConsistent, Assign[String, String](
      fromOffsets.keys, kafkaParams, fromOffsets))
    inputDStream.foreachRDD((rdd, batchTime) => {
      val offsetRanges = rdd.asInstanceOf[HasOffsetRanges].offsetRanges
      offsetRanges.foreach(offset => println(offset.topic, offset.partition, offset.fromOffset, offset.untilOffset))
      val newRDD = rdd.map(message => processMessage(message))
      newRDD.count()
      // after the batch has been processed, persist its offsets to HBase
      saveOffsets(topic, consumerGroupID, offsetRanges, hbaseTableName, batchTime)
    })
    ssc
  }
  /**
   * Process a single message; this sample simply passes the record through unchanged
   * @param message the Kafka record to process
   * @return the (possibly transformed) record
   */
  def processMessage(message: ConsumerRecord[String, String]): ConsumerRecord[String, String] = {
    message
  }
  /*
   Save the offsets of one batch into HBase
   */
  def saveOffsets(TOPIC_NAME: String, GROUP_ID: String, offsetRanges: Array[OffsetRange], hbaseTableName: String,
                  batchTime: org.apache.spark.streaming.Time): Unit = {
    val hbaseConf = HBaseConfiguration.create()
    hbaseConf.addResource("src/main/resources/hbase-site.xml")
    val conn = ConnectionFactory.createConnection(hbaseConf)
    val table = conn.getTable(TableName.valueOf(hbaseTableName))
    // row key layout: <topic>:<group>:<batch time in ms>
    val rowKey = TOPIC_NAME + ":" + GROUP_ID + ":" + String.valueOf(batchTime.milliseconds)
    val put = new Put(rowKey.getBytes)
    // one cell per partition in the "offsets" column family, value = untilOffset of that partition
    for (offset <- offsetRanges) {
      put.addColumn(Bytes.toBytes("offsets"), Bytes.toBytes(offset.partition.toString),
        Bytes.toBytes(offset.untilOffset.toString))
    }
    table.put(put)
    conn.close()
  }
  /*
   Returns the last committed offsets for all partitions of a given topic from HBase, covering three cases:
   - CASE 1: the Spark Streaming job is started for the first time. The function reads the number of topic partitions from
     ZooKeeper and returns 0 as the last committed offset for every partition.
   - CASE 2: the job is restarted and the number of partitions in the topic has not changed. The last committed offset for
     each topic partition is returned exactly as stored in HBase.
   - CASE 3: the job is restarted and the number of partitions in the topic has increased. For the existing partitions the
     last committed offsets are returned as stored in HBase; for the newly added partitions the function returns 0.
   */
  def getLastCommittedOffsets(TOPIC_NAME: String, GROUP_ID: String, hbaseTableName: String, zkQuorum: String,
                              zkRootDir: String, sessionTimeout: Int, connectionTimeOut: Int): Map[TopicPartition, Long] = {
    val hbaseConf = HBaseConfiguration.create()
    hbaseConf.addResource("src/main/resources/hbase-site.xml")
    val zkUrl = zkQuorum + "/" + zkRootDir
    val zkClientAndConnection = ZkUtils.createZkClientAndConnection(zkUrl, sessionTimeout, connectionTimeOut)
    val zkUtils = new ZkUtils(zkClientAndConnection._1, zkClientAndConnection._2, false)
    // number of partitions for the topic as registered in ZooKeeper
    val zKNumberOfPartitionsForTopic = zkUtils.getPartitionsForTopics(Seq(TOPIC_NAME)).get(TOPIC_NAME).toList.head.size
    // connect to HBase to retrieve the last committed offsets
    val conn = ConnectionFactory.createConnection(hbaseConf)
    val table = conn.getTable(TableName.valueOf(hbaseTableName))
    // a reversed scan from "now" down to 0 returns the most recently written offsets row first
    val startRow = TOPIC_NAME + ":" + GROUP_ID + ":" + String.valueOf(System.currentTimeMillis())
    val stopRow = TOPIC_NAME + ":" + GROUP_ID + ":" + 0
    val scan = new Scan()
    val scanner = table.getScanner(scan.setStartRow(startRow.getBytes).setStopRow(stopRow.getBytes).setReversed(true))
    val result = scanner.next()
    // number of partitions recorded in HBase; 0 means no offsets have been committed yet
    var hbaseNumberOfPartitionsForTopic = 0
    if (result != null) {
      // saveOffsets writes one cell per partition, so the cell count equals the partition count
      hbaseNumberOfPartitionsForTopic = result.listCells().size()
    }
    val fromOffsets = collection.mutable.Map[TopicPartition, Long]()
    if (hbaseNumberOfPartitionsForTopic == 0) {
      // CASE 1: first run, start every partition from offset 0
      for (partition <- 0 to zKNumberOfPartitionsForTopic - 1) {
        fromOffsets += (new TopicPartition(TOPIC_NAME, partition) -> 0L)
      }
    } else if (zKNumberOfPartitionsForTopic > hbaseNumberOfPartitionsForTopic) {
      // CASE 3: new partitions have been added to the existing Kafka topic
      for (partition <- 0 to hbaseNumberOfPartitionsForTopic - 1) {
        val fromOffset = Bytes.toString(result.getValue(Bytes.toBytes("offsets"), Bytes.toBytes(partition.toString)))
        fromOffsets += (new TopicPartition(TOPIC_NAME, partition) -> fromOffset.toLong)
      }
      for (partition <- hbaseNumberOfPartitionsForTopic to zKNumberOfPartitionsForTopic - 1) {
        fromOffsets += (new TopicPartition(TOPIC_NAME, partition) -> 0L)
      }
    } else {
      // CASE 2: resume every partition from the offsets committed during the last run
      for (partition <- 0 to hbaseNumberOfPartitionsForTopic - 1) {
        val fromOffset = Bytes.toString(result.getValue(Bytes.toBytes("offsets"), Bytes.toBytes(partition.toString)))
        fromOffsets += (new TopicPartition(TOPIC_NAME, partition) -> fromOffset.toLong)
      }
    }
    scanner.close()
    conn.close()
    fromOffsets.toMap
  }
}
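
The offsets table has to exist in HBase before the job runs, with a column family named offsets (saveOffsets writes one cell per partition into that family, and getLastCommittedOffsets reads it back). In the HBase shell this is simply create 'stream_kafka_offsets', 'offsets'. Below is a minimal sketch of a hypothetical CreateOffsetsTable helper that does the same thing with the HBase 1.x admin API, assuming the same hbase-site.xml is on the classpath as in the streaming job:

import org.apache.hadoop.hbase.{HBaseConfiguration, HColumnDescriptor, HTableDescriptor, TableName}
import org.apache.hadoop.hbase.client.ConnectionFactory

object CreateOffsetsTable {
  def main(args: Array[String]): Unit = {
    val conf = HBaseConfiguration.create()
    // assumption: hbase-site.xml is available at the same path the streaming job uses
    conf.addResource("src/main/resources/hbase-site.xml")
    val conn = ConnectionFactory.createConnection(conf)
    val admin = conn.getAdmin
    // table name and column family must match what saveOffsets/getLastCommittedOffsets expect
    val tableName = TableName.valueOf("stream_kafka_offsets")
    if (!admin.tableExists(tableName)) {
      val descriptor = new HTableDescriptor(tableName)
      descriptor.addFamily(new HColumnDescriptor("offsets"))
      admin.createTable(descriptor)
    }
    admin.close()
    conn.close()
  }
}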