Spark reads Kafka data, storing consumer offsets in HBase

The two Scala sources below consume Kafka with Spark's direct (receiver-less) API and manage consumer offsets themselves in an HBase table: each batch writes a row keyed by topic:group:batchTime, with one column per partition under a column family named "offsets".

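The code targets the pre-0.10 "direct stream" Kafka integration (kafka.serializer.StringDecoder, org.apache.spark.streaming.kafka) and the HBase 1.x client API. As a rough guide, a build.sbt along these lines should compile it; the versions below are assumptions rather than anything pinned by this post, so adjust them to your cluster:

// build.sbt: dependency sketch, all versions are assumptions
scalaVersion := "2.10.6"

libraryDependencies ++= Seq(
  "org.apache.spark" %% "spark-core"            % "1.6.3" % "provided",
  "org.apache.spark" %% "spark-sql"             % "1.6.3" % "provided",
  "org.apache.spark" %% "spark-hive"            % "1.6.3" % "provided",
  "org.apache.spark" %% "spark-streaming"       % "1.6.3" % "provided",
  "org.apache.spark" %% "spark-streaming-kafka" % "1.6.3",   // 0.8-style direct API
  "org.apache.kafka" %% "kafka"                 % "0.9.0.1", // ZkUtils.createZkClientAndConnection needs the 0.9 client
  "org.apache.hbase"  % "hbase-client"          % "1.2.6",
  "org.apache.hbase"  % "hbase-common"          % "1.2.6"
)

The first file, KafkaHbaseManager.scala, saves offsets to HBase after every batch and restores them on startup:
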
package spark88.utils

import kafka.common.TopicAndPartition
import kafka.message.MessageAndMetadata
import kafka.serializer.StringDecoder
import kafka.utils.ZkUtils
import org.apache.hadoop.hbase.client.{ConnectionFactory, Put, Scan}
import org.apache.hadoop.hbase.util.Bytes
import org.apache.hadoop.hbase.{HBaseConfiguration, TableName}
import org.apache.spark.SparkConf
import org.apache.spark.streaming.dstream.InputDStream
import org.apache.spark.streaming.kafka.{HasOffsetRanges, KafkaUtils, OffsetRange}
import org.apache.spark.streaming.{Seconds, StreamingContext}

/**
  * Created by hadoop at 4:52 PM.
  */
object KafkaHbaseManager {

  def saveOffsets(TOPIC_NAME:String,GROUP_ID:String,offsetRanges:Array[OffsetRange],
                  hbaseTableName:String,batchTime: String) ={
    val hbaseConf = HBaseConfiguration.create()
    hbaseConf.addResource("/etc/hbase/conf/hbase-site.xml")
    val conn = ConnectionFactory.createConnection(hbaseConf)
    val table = conn.getTable(TableName.valueOf(hbaseTableName))
    val rowKey = TOPIC_NAME + ":" + GROUP_ID + ":" + batchTime
    val put = new Put(rowKey.getBytes)
    for(offset <- offsetRanges){
      put.addColumn(Bytes.toBytes("offsets"),Bytes.toBytes(offset.partition.toString),
        Bytes.toBytes(offset.untilOffset.toString))
    }
    table.put(put)  // a single Put is atomic at the row level, so all partition offsets are committed together
    table.close()
    conn.close()
  }


  def getNumberOfPartitionsForTopicFromZK(TOPIC_NAME:String,GROUP_ID:String,
                                          zkQuorum:String,zkRootDir:String,sessTimeout:Int,connTimeOut:Int): Int ={
    val zkUrl = zkQuorum+"/"+zkRootDir
    val zkClientAndConn = ZkUtils.createZkClientAndConnection(zkUrl, sessTimeout, connTimeOut)
    val zkUtils = new ZkUtils(zkClientAndConn._1, zkClientAndConn._2, false)
    val zKPartitions = zkUtils.getPartitionsForTopics(Seq(TOPIC_NAME))
      .get(TOPIC_NAME).toList.head.size
    println(zKPartitions)
    zkClientAndConn._1.close()
    zkClientAndConn._2.close()
    zKPartitions
  }


  def getFromOffsetResults(TOPIC_NAME:String, GROUP_ID:String, hTableName:String,
                           zkQuorum:String, zkRootDir:String, sessTimeout:Int, connTimeOut:Int): (String, Map[TopicAndPartition, Long]) = {


    val zKNumberOfPartitions = getNumberOfPartitionsForTopicFromZK(TOPIC_NAME, GROUP_ID, zkQuorum, zkRootDir, sessTimeout, connTimeOut)

    val hbaseConf = HBaseConfiguration.create()

    // Read the most recently committed offsets from HBase
    val conn = ConnectionFactory.createConnection(hbaseConf)
    val table = conn.getTable(TableName.valueOf(hTableName))
    val startRow = TOPIC_NAME + ":" + GROUP_ID + ":" + String.valueOf(System.currentTimeMillis())
    val stopRow = TOPIC_NAME + ":" + GROUP_ID + ":" + 0
    val scan = new Scan()
    val scanner = table.getScanner(scan.setStartRow(startRow.getBytes).setStopRow(stopRow.getBytes)
      .setReversed(true))
    val result = scanner.next()
    var hbaseNumberOfPartitions = 0 // number of partitions recorded in HBase
    var BATCH_ID = TOPIC_NAME + "-" + GROUP_ID + "-1"
    if (result != null){
      // use the number of cells (columns) in the latest row as the partition count
      hbaseNumberOfPartitions = result.listCells().size()
      val rowkey = new String(result.getRow)
      BATCH_ID = rowkey.replaceAll(":", "-")
    }

    val fromOffsets = collection.mutable.Map[TopicAndPartition,Long]()
    if(hbaseNumberOfPartitions == 0){
      // No offsets saved yet: start every partition from the beginning
      for (partition <- 0 until zKNumberOfPartitions){
        fromOffsets += ((TopicAndPartition(TOPIC_NAME, partition), 0L))
      }
    } else if(zKNumberOfPartitions > hbaseNumberOfPartitions){
      // Partitions were added to the topic since the last run:
      // resume the known partitions and start the new ones from offset 0
      for (partition <- 0 until hbaseNumberOfPartitions){
        val fromOffset = Bytes.toString(result.getValue(Bytes.toBytes("offsets"),
          Bytes.toBytes(partition.toString)))
        fromOffsets += ((TopicAndPartition(TOPIC_NAME, partition), fromOffset.toLong))
      }
      for (partition <- hbaseNumberOfPartitions until zKNumberOfPartitions){
        fromOffsets += ((TopicAndPartition(TOPIC_NAME, partition), 0L))
      }
    } else {
      // Resume from the offsets saved by the previous run
      for (partition <- 0 until hbaseNumberOfPartitions){
        val fromOffset = Bytes.toString(result.getValue(Bytes.toBytes("offsets"),
          Bytes.toBytes(partition.toString)))
        fromOffsets += ((TopicAndPartition(TOPIC_NAME, partition), fromOffset.toLong))
      }
    }

    scanner.close()
    table.close()
    conn.close()
    (BATCH_ID, fromOffsets.toMap)
  }

  def main(args: Array[String]): Unit = {
   // getLastCommittedOffsets("mytest1", "testp", "stream_kafka_offsets", "spark123:12181", "kafka0.9", 30000, 30000)

    val processingInterval = 2
    val brokers = "spark123:9092"
    val topics = "mytest1"
    // Create context with 2 second batch interval
    val sparkConf = new SparkConf().setAppName("kafkaHbase").setMaster("local[2]")
    // Create direct kafka stream with brokers and topics
    val topicsSet = topics.split(",").toSet
    val kafkaParams = Map[String, String]("metadata.broker.list" -> brokers,
      "auto.offset.reset" -> "smallest")


    val ssc = new StreamingContext(sparkConf, Seconds(processingInterval))
    val groupId = "testp"
    val hbaseTableName = "spark_kafka_offsets"

    // Build the Kafka direct stream, resuming from the offsets recovered from HBase
    //val kafkaStream = createMyDirectKafkaStream(ssc, kafkaParams, zkClient, topicsSet, "testp")
    val messageHandler = (mmd : MessageAndMetadata[String, String]) => (mmd.topic, mmd.message())
    val fromOffsetResults = getFromOffsetResults("mytest1", groupId, hbaseTableName, "spark123:12181", "kafka0.9", 30000, 30000)
    val batchid = fromOffsetResults._1
    val fromOffsets = fromOffsetResults._2

    val kafkaStream: InputDStream[(String, String)] =
      KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder, (String, String)](ssc, kafkaParams, fromOffsets, messageHandler)


    kafkaStream.foreachRDD((rdd, btime) => {
      if(!rdd.isEmpty()){
        println("==========================count:" + rdd.count())
        println("==========================btime:" + btime.toString())
        // persist the offset ranges consumed in this batch so the next run can resume from them
        saveOffsets(topics, groupId, rdd.asInstanceOf[HasOffsetRanges].offsetRanges, hbaseTableName, btime.toString())
      }
    })

    //val offsetsRanges:Array[OffsetRange] = null

    ssc.start()
    ssc.awaitTermination()


  }
}
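
Both the streaming job above and the ETL job below assume that the spark_kafka_offsets table already exists in HBase with a column family named "offsets" (the family written by saveOffsets). A minimal setup sketch using the HBase 1.x admin API (equivalent to create 'spark_kafka_offsets', 'offsets' in the hbase shell):

import org.apache.hadoop.fs.Path
import org.apache.hadoop.hbase.client.ConnectionFactory
import org.apache.hadoop.hbase.{HBaseConfiguration, HColumnDescriptor, HTableDescriptor, TableName}

object CreateOffsetsTable {
  def main(args: Array[String]): Unit = {
    val conf = HBaseConfiguration.create()
    conf.addResource(new Path("/etc/hbase/conf/hbase-site.xml"))
    val conn = ConnectionFactory.createConnection(conf)
    val admin = conn.getAdmin
    val tableName = TableName.valueOf("spark_kafka_offsets")
    if (!admin.tableExists(tableName)) {
      // one column family "offsets"; each Kafka partition becomes a column in it
      val desc = new HTableDescriptor(tableName)
      desc.addFamily(new HColumnDescriptor("offsets"))
      admin.createTable(desc)
    }
    admin.close()
    conn.close()
  }
}

The second file, KafkaETL.scala, reads a bounded slice of the topic as an RDD, writes it out as partitioned ORC files, and only then commits the consumed offsets to HBase:
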
package spark88

import kafka.common.TopicAndPartition
import kafka.message.MessageAndMetadata
import kafka.serializer.StringDecoder
import kafka4.utils.MyKafkaUtils.getResetOffsets
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.spark.sql.SaveMode
import org.apache.spark.sql.hive.HiveContext
import org.apache.spark.streaming.kafka.{Broker, KafkaUtils, OffsetRange}
import org.apache.spark.{SparkConf, SparkContext, TaskContext}
import spark88.utils.KafkaHbaseManager.getFromOffsetResults
import spark88.utils.{KafkaHbaseManager, ParseUtils}

import scala.collection.mutable


/**
  * Created by hadoop at 1:28 AM.
  * Shell commands to create and inspect the topic:
  * kafka-topics.sh --zookeeper localhost:12181/kafka0.9 --create --topic myat --partitions 2 --replication-factor 1
  * kafka-topics.sh --zookeeper localhost:12181/kafka0.9 --describe --topic myat
  * kafka-console-producer.sh --broker-list localhost:9092 --topic myat
  * kafka-console-consumer.sh --zookeeper localhost:12181/kafka0.9 --topic myat
  */
object KafkaETL {
  def main(args: Array[String]): Unit = {
    val sparkConf = new SparkConf().setAppName("test").setMaster("local[2]")
    val sc = new SparkContext(sparkConf)
    val sqlContext = new HiveContext(sc)

    val brokers = "spark1234:9092"
    val topic = "myat"
    val topics = topic.split(",").toSet
    val groupName = "testg"
    val zQuorum = "spark123:12181"
    val zkRootDir = "kafka0.9"
    val sessionTimeOut = 3000
    val connTimeOut = 3000
    val hbaseTableName = "spark_kafka_offsets"
    val outputPath = "/hadoop/kafka22/"



    // Starting offsets: resume from HBase, or start from the beginning if nothing has been saved
    val kafkaParams = Map[String, String]("metadata.broker.list" -> brokers, "auto.offset.reset" -> "smallest")
    val fromOffsetResults = KafkaHbaseManager.getFromOffsetResults(topic, groupName, hbaseTableName, zQuorum, zkRootDir, sessionTimeOut, connTimeOut)
    val batchid = fromOffsetResults._1
    val fromOffsets = fromOffsetResults._2

    // Latest offsets currently available in the topic
    val kafkaParamsLargest = Map[String, String]("metadata.broker.list" -> brokers, "auto.offset.reset" -> "largest")
    val largestOffsets = getResetOffsets(kafkaParamsLargest, topics)


    // Maximum number of messages to consume per partition in one run
    val maxMsgNumPerPartition = 100000L
    val offsetRanges = fromOffsets.keys.map(tp => {
      val fromOffset = fromOffsets(tp)
      val largestOffset = largestOffsets(tp)
      val untilOffset = Math.min(largestOffset, fromOffset + maxMsgNumPerPartition)
      OffsetRange(tp, fromOffset, untilOffset)
    }).toArray


    val messageHandler = (mmd: MessageAndMetadata[String, String]) => {
      (mmd.offset, mmd.topic, mmd.partition, mmd.message())
    }


    // empty leader map: KafkaUtils will look up the partition leaders itself
    val leaders: Map[TopicAndPartition, Broker] = Map()
    val rdd = KafkaUtils.createRDD[String, String, StringDecoder, StringDecoder,
      (Long, String, Int, String)](sc, kafkaParams, offsetRanges, leaders, messageHandler)


    val rowRDD = rdd.map(x => ParseUtils.getRow(x))
    //rowRDD.collect().foreach(println)
    if (!rdd.isEmpty()) {
      val logDF = sqlContext.createDataFrame(rowRDD, ParseUtils.struct)
      logDF.show()
      logDF.printSchema()

      val outputTempLocation = outputPath + "tmp/" + batchid
      logDF.write.format("orc").mode(SaveMode.Overwrite).
        partitionBy("houseid", "dayid", "hourid").save(outputTempLocation)

      moveTempFilesToData(outputPath, batchid)

      val curTimeMil = String.valueOf(System.currentTimeMillis())
      KafkaHbaseManager.saveOffsets(topic, groupName, offsetRanges, hbaseTableName, curTimeMil)
    }


  }

  def moveTempFilesToData(outputPath:String, batchid:String) = {
    val conf = new Configuration
    conf.addResource(new Path("/etc/hadoop/conf/core-site.xml"))
    conf.addResource(new Path("/etc/hadoop/conf/hdfs-site.xml"))
    val fileSystem = FileSystem.get(conf)


    // get Partitions
    val partitionsSet = new mutable.HashSet[String]()
    fileSystem.globStatus(new Path(outputPath + "tmp/" + batchid + "/houseid=*/dayid=*/hourid=*/*.orc")).
      foreach(x=>{
        val fileAbsolutePath = x.getPath.toString
        val fileWithPartition = fileAbsolutePath.replace(fileAbsolutePath.substring(0, fileAbsolutePath.indexOf("/tmp/")) + "/tmp/" + batchid, "")
        val partition = fileWithPartition.substring(0, fileWithPartition.lastIndexOf("/") )
        partitionsSet.add(partition)
      })
    println("partitionsSet:" + partitionsSet)

    // Delete any data files already written for this batchid, so re-running a batch is idempotent
    partitionsSet.foreach(p=>{
      fileSystem.globStatus(new Path(outputPath + "data" + p + "/" + batchid + "*.orc")).foreach(
        f=>{
          fileSystem.delete(f.getPath(), false)
        }
      )
    })

    // Move the temp files into the data directory, prefixing each file name with the batchid
    fileSystem.globStatus(new Path(outputPath + "tmp/" + batchid + "/houseid=*/dayid=*/hourid=*/*.orc")).
      foreach(x=>{
        val fileAbsolutePath = x.getPath.toString
        val fileDir = fileAbsolutePath.substring(0,fileAbsolutePath.lastIndexOf("/"))
        val fileName = fileAbsolutePath.substring(fileAbsolutePath.lastIndexOf("/") + 1)
        val dataDir = fileDir.replace("tmp/" + batchid, "data")
        if(!fileSystem.exists(new Path(dataDir))){
          fileSystem.mkdirs(new Path(dataDir))
        }
        fileSystem.rename(new Path(fileAbsolutePath), new Path(dataDir + "/" + batchid + "-" + fileName))
      })

  }
}
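
KafkaETL imports two helpers that are not included in this post: kafka4.utils.MyKafkaUtils.getResetOffsets, which returns the current earliest or latest offset for every partition of the given topics, and spark88.utils.ParseUtils, whose getRow and struct must turn each (offset, topic, partition, message) tuple into a Row with a schema that includes the houseid, dayid and hourid columns used for partitioning. A hypothetical sketch of getResetOffsets built on org.apache.spark.streaming.kafka.KafkaCluster is shown below; note that KafkaCluster is private[spark] in some Spark 1.x releases (a common workaround was to keep a copy of it under your own package), so treat this purely as an illustration:

package kafka4.utils

import kafka.common.TopicAndPartition
import org.apache.spark.streaming.kafka.KafkaCluster

object MyKafkaUtils {
  // For every partition of the given topics, return either the earliest or the latest
  // available offset, depending on "auto.offset.reset" in kafkaParams
  // ("smallest" -> earliest offsets, anything else -> latest offsets).
  def getResetOffsets(kafkaParams: Map[String, String],
                      topics: Set[String]): Map[TopicAndPartition, Long] = {
    val kc = new KafkaCluster(kafkaParams)
    val partitions = kc.getPartitions(topics).right.get
    val leaderOffsets =
      if (kafkaParams.get("auto.offset.reset").exists(_ == "smallest"))
        kc.getEarliestLeaderOffsets(partitions).right.get
      else
        kc.getLatestLeaderOffsets(partitions).right.get
    leaderOffsets.map { case (tp, lo) => (tp, lo.offset) }
  }
}

With the latest offsets in hand, the ETL caps each partition at maxMsgNumPerPartition messages per run (untilOffset = min(largest, from + max)), so a single invocation never reads more than a bounded slice of the topic.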

 
