Saving Spark Streaming offsets in ZooKeeper, HBase, and Redis

1. Saving offsets in ZooKeeper


import cn.qphone.spark.`trait`.LoggerTrait
import cn.qphone.spark.utils.{CommonUtils, SparkUtils}
import kafka.common.TopicAndPartition
import kafka.message.MessageAndMetadata
import kafka.serializer.StringDecoder
import org.apache.curator.framework.CuratorFrameworkFactory
import org.apache.curator.retry.ExponentialBackoffRetry
import org.apache.spark.streaming.StreamingContext
import org.apache.spark.streaming.dstream.InputDStream
import org.apache.spark.streaming.kafka.{HasOffsetRanges, KafkaUtils, OffsetRange}

import scala.collection.{JavaConversions, mutable}

object Demo6_SparkStreaming_Kafka_Zookeeper extends LoggerTrait{

    // ZooKeeper client (Curator)
    val client = {
        val client = CuratorFrameworkFactory.builder()
            .connectString("hbase1,hbase2,hbase3")
            .retryPolicy(new ExponentialBackoffRetry(1000, 3))
            .namespace("kafka/consumers/offsets")
            .build()
        client.start()
        client
    }


    def main(args: Array[String]): Unit = {
        //1. Entry point: streaming context, Kafka params and topics
        val ssc = SparkUtils.getLocalStreamingContext("Demo6_SparkStreaming_Kafka_Zookeeper", 2)
        val kafkaParams = CommonUtils.toMap("demo6.properties")
        val topics = "bjbigdata1909-1".split(",").toSet

        //2. Load the data from Kafka
        val messages:InputDStream[(String, String)] = createMsg(ssc, kafkaParams, topics)

        //3. Process each batch of messages
        messages.foreachRDD((rdd, btime) => {
            if (!rdd.isEmpty()) {
                //3.1 Print some information about this batch
                println("-"*100)
                println(s"bTime = ${btime}")
                println("#"*50 + "     " + rdd.count())
                //3.2 Save the latest offsets back to ZooKeeper
                storeOffsets(rdd.asInstanceOf[HasOffsetRanges].offsetRanges, kafkaParams("group.id"))
            }
        })

        ssc.start()
        ssc.awaitTermination()
    }

    /**
     * Read the manually saved offsets from ZooKeeper, then consume from Kafka starting at those offsets.
     * If no offsets are found, start consuming from the beginning.
     */
    def createMsg(ssc:StreamingContext, kafkaParams:Map[String, String], topics:Set[String]):InputDStream[(String, String)] = {
        //1. Read the offsets from ZooKeeper
        val fromOffsets:Map[TopicAndPartition, Long] = getFromOffsets(topics, kafkaParams("group.id"))
        //2. Create the input stream
        var messages:InputDStream[(String, String)] = null
        //2.1 Decide how to create it
        if (fromOffsets.isEmpty) { // no offsets were found, so nothing has been saved before: start from the beginning
            messages = KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder](ssc, kafkaParams, topics)
        } else { // offsets were found: start from the saved positions
            //2.2 Create the messageHandler
            val messageHandler = (msgHandler:MessageAndMetadata[String, String]) => (msgHandler.key(), msgHandler.message())
            //2.3 Read data starting from the saved offsets
            messages = KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder, (String, String)](ssc, kafkaParams, fromOffsets, messageHandler)
        }
        messages
    }

    /**
     * Get the saved offsets for the given topics and consumer group.
     * Offsets live under the Curator namespace (kafka/consumers/offsets) at /<topic>/<group>/<partition>;
     * the data stored in each partition node is the offset itself.
     */
    def getFromOffsets(topics:Set[String], group:String) : Map[TopicAndPartition, Long] = {
        //1. Define a structure to hold the offsets
        val offsets = mutable.Map[TopicAndPartition, Long]()
        //2. Iterate over the topics
        for(topic <- topics) {
            //2.1 Path of this topic/group's offsets under the namespace
            val path = s"/${topic}/${group}"
            //2.2 Make sure the path exists in ZooKeeper
            isExists(path)
            //2.3 Iterate over the partitions (converting the Java list to a Scala collection)
            for(partition <- JavaConversions.asScalaBuffer(client.getChildren.forPath(path))) {
                //2.3.1 Each partition node stores a single offset
                val fullPath = s"${path}/${partition}"
                //2.3.2 Read the offset
                val offset = new String(client.getData.forPath(fullPath)).toLong
                //2.3.3 Put it into the map
                offsets.put(TopicAndPartition(topic, partition.toInt), offset)
            }
        }
        offsets.toMap
    }

    /**
     * Check whether a node exists; create it (with parents) if it does not.
     */
    def isExists(path:String):Unit = {
        if (client.checkExists().forPath(path) == null) { // the path does not exist yet
            client.create().creatingParentsIfNeeded().forPath(path)
        }
    }

    /**
     * Save the offsets back to ZooKeeper.
     */
    def storeOffsets(offsetRanges:Array[OffsetRange], group:String) = {
        //1. Iterate over the offset ranges of this batch
        for(offsetRange <- offsetRanges) {
            //2. Get the topic, partition and the end offset of the batch
            val topic = offsetRange.topic
            val partition = offsetRange.partition
            val untilOffset = offsetRange.untilOffset
            //3. Node that stores this partition's offset
            val path = s"/${topic}/${group}/${partition}"
            isExists(path)
            //4. Save the offset into the partition node
            client.setData().forPath(path, untilOffset.toString.getBytes())
        }
    }
}
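
The kafkaParams map is loaded from demo6.properties, which is not shown in the post. With the old Kafka 0.8 direct API used above it essentially needs the broker list and the consumer group id; a minimal sketch of an equivalent map follows (the broker hosts and group id are assumptions, adjust them to your cluster):

// Hypothetical equivalent of what CommonUtils.toMap("demo6.properties") returns
val kafkaParams: Map[String, String] = Map(
    "metadata.broker.list" -> "hbase1:9092,hbase2:9092,hbase3:9092", // Kafka 0.8 broker list (assumed hosts)
    "group.id"             -> "bjbigdata1909-group",                 // consumer group, also used in the ZooKeeper path
    "auto.offset.reset"    -> "smallest"                             // where to start when no offset has been saved yet
)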


2. Saving offsets in HBase

2.1 The utility class


import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.Cell;
import org.apache.hadoop.hbase.CellUtil;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.*;
import org.apache.hadoop.hbase.filter.BinaryComparator;
import org.apache.hadoop.hbase.filter.CompareFilter;
import org.apache.hadoop.hbase.filter.Filter;
import org.apache.hadoop.hbase.filter.RowFilter;

import java.io.IOException;
import java.util.HashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;

/**
 * Client utility class that pools HBase connections.
 */
public class HBaseUtils {

    // pool holds the pooled HBase Connection objects
    private static LinkedList<Connection> pool = new LinkedList<>();

    // initialize the pool with 5 HBase connections
    static {
        try {
            Configuration conf = HBaseConfiguration.create();
            conf.set("hbase.rootdir", "hdfs://hbase1:9000");
            conf.set("hbase.cluster.distributed", "true");
            conf.set("hbase.zookeeper.quorum", "hbase1,hbase2,hbase3");
            conf.set("hbase.regionserver.wal.codec", "org.apache.hadoop.hbase.regionserver.wal.IndexedWALEditCodec");
            for (int i = 0;i < 5;i++) {
                pool.push(ConnectionFactory.createConnection(conf));
            }
        }catch (Exception e) {
            e.printStackTrace();
        }
    }

    // borrow a connection from the pool (blocks while the pool is empty)
    public static Connection getConnection() {
        while (pool.isEmpty()) {
            try {
                System.out.println("connection pool is empty, please wait for a moment~~~");
                Thread.sleep(1000);
            }catch (Exception e) {
                e.printStackTrace();
            }
        }
        return pool.poll();
    }

    // release a connection, returning it to the pool
    public static void release(Connection connection) {
        pool.push(connection);
    }

    // read the partition -> offset mapping stored in the row identified by rk
    public static Map<Integer, Long> getColValue(Connection connection, TableName tableName, byte[] rk, byte[] cf) {
        //1. Map that holds the final result
        Map<Integer, Long> partition2Offset = new HashMap<>();
        try {
            //2. Get the table object
            Table table = connection.getTable(tableName);
            Scan scan = new Scan();
            //3. Only scan the row whose key equals rk
            Filter filter = new RowFilter(CompareFilter.CompareOp.EQUAL, new BinaryComparator(rk));
            scan.setFilter(filter);
            //4. Create the scanner
            ResultScanner scanner = table.getScanner(scan);
            //5. Iterate over the cells: qualifier = partition, value = offset
            for (Result result : scanner) {
                List<Cell> cells = result.listCells(); // every cell is one (partition, offset) pair
                for (Cell cell : cells) {
                    // column qualifier
                    byte[] column = CellUtil.cloneQualifier(cell);
                    // cell value
                    byte[] values = CellUtil.cloneValue(cell);

                    int partition = Integer.valueOf(new String(column));
                    long offset = Long.valueOf(new String(values));

                    partition2Offset.put(partition, offset);
                }
            }
            scanner.close();
            table.close();
            return partition2Offset;

        }catch (Exception e) {
            e.printStackTrace();
        }
        return null;
    }

    // write a single column/value cell under rk into HBase
    public static void set(Connection connection, TableName tableName, byte[] rk, byte[] cf, byte[] col, byte[] value) {
        try {
            Table table = connection.getTable(tableName);
            Put put = new Put(rk);
            put.addColumn(cf, col, value);
            table.put(put);
            table.close();
        }catch (Exception e) {
            e.printStackTrace();
        }
    }
}
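
Before wiring the utility class into the streaming job, it can be sanity-checked on its own by writing and reading back a single offset cell. The sketch below is only an illustration: it assumes the spark-topic-offset table from the driver code below has already been created, and the rowkey and values are made up.

import cn.qphone.hbase.utils.HBaseUtils
import org.apache.hadoop.hbase.TableName

object HBaseUtilsSmokeTest {
    def main(args: Array[String]): Unit = {
        val connection = HBaseUtils.getConnection
        val tableName  = TableName.valueOf("spark-topic-offset")
        val cf = "cf".getBytes()
        val rk = "bjbigdata1909-1-test-group".getBytes()   // rowkey: topic-group (hypothetical group)
        // write partition 0 -> offset 42, then read the whole row back
        HBaseUtils.set(connection, tableName, rk, cf, "0".getBytes(), "42".getBytes())
        println(HBaseUtils.getColValue(connection, tableName, rk, cf))   // expect {0=42}
        HBaseUtils.release(connection)
    }
}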

2.2 The driver code

import cn.qphone.hbase.utils.HBaseUtils
import cn.qphone.spark.`trait`.LoggerTrait
import cn.qphone.spark.utils.{CommonUtils, SparkUtils}
import kafka.common.TopicAndPartition
import kafka.message.MessageAndMetadata
import kafka.serializer.StringDecoder
import org.apache.hadoop.hbase.TableName
import org.apache.hadoop.hbase.util.Bytes
import org.apache.spark.streaming.StreamingContext
import org.apache.spark.streaming.dstream.InputDStream
import org.apache.spark.streaming.kafka.{HasOffsetRanges, KafkaUtils, OffsetRange}

import scala.collection.{JavaConversions, mutable}

/**
 * 1. Manage offsets manually in HBase so that data is consumed in order:
 *    - offsets found:   consume from the saved positions
 *    - no offsets yet:  consume from offset 0
 * 2. Pull data from Kafka starting at those offsets
 * 3. Run the business logic on the fetched data
 * 4. Write the new offsets back to HBase
 *
 * HBase table:  create 'spark-topic-offset', 'cf'
 *
 * rowkey:  topic-group
 * column:  partition -> offset
 * */
object Demo7_SparkStreaming_Kafka_HBase extends LoggerTrait{

    def main(args: Array[String]): Unit = {
        //1. Entry point: streaming context, Kafka params and topics
        val ssc = SparkUtils.getLocalStreamingContext("Demo7_SparkStreaming_Kafka_HBase", 2)
        val kafkaParams = CommonUtils.toMap("demo6.properties")
        val topics = "bjbigdata1909-1".split(",").toSet

        //2. Load the data from Kafka
        val messages:InputDStream[(String, String)] = createMsg(ssc, kafkaParams, topics)

        //3. Process each batch of messages
        messages.foreachRDD((rdd, btime) => {
            if (!rdd.isEmpty()) {
                //3.1 Print some information about this batch
                println("-"*100)
                println(s"bTime = ${btime}")
                println("#"*50 + "     " + rdd.count())
                //3.2 Save the latest offsets back to HBase
                storeOffsets(rdd.asInstanceOf[HasOffsetRanges].offsetRanges, kafkaParams("group.id"))
            }
        })

        ssc.start()
        ssc.awaitTermination()
    }

    /**
     * Read the manually saved offsets from HBase, then consume from Kafka starting at those offsets.
     * If no offsets are found, start consuming from the beginning.
     */
    def createMsg(ssc:StreamingContext, kafkaParams:Map[String, String], topics:Set[String]):InputDStream[(String, String)] = {
        //1. Read the offsets from HBase
        val fromOffsets:Map[TopicAndPartition, Long] = getFromOffsets(topics, kafkaParams("group.id"))
        //2. Create the input stream
        var messages:InputDStream[(String, String)] = null
        //2.1 Decide how to create it
        if (fromOffsets.isEmpty) { // no offsets were found, so nothing has been saved before: start from the beginning
            messages = KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder](ssc, kafkaParams, topics)
        } else { // offsets were found: start from the saved positions
            //2.2 Create the messageHandler
            val messageHandler = (msgHandler:MessageAndMetadata[String, String]) => (msgHandler.key(), msgHandler.message())
            //2.3 Read data starting from the saved offsets
            messages = KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder, (String, String)](ssc, kafkaParams, fromOffsets, messageHandler)
        }
        messages
    }

    /**
     * Get the saved offsets for the given topics and consumer group from HBase.
     * rowkey = topic-group; each column qualifier is a partition and its value is the offset.
     */
    def getFromOffsets(topics:Set[String], group:String) : Map[TopicAndPartition, Long] = {
        //1. Define a structure to hold the offsets
        val offsets = mutable.Map[TopicAndPartition, Long]()
        //1.1 Get an HBase connection and the table/column-family handles
        val connection = HBaseUtils.getConnection
        val tableName = TableName.valueOf("spark-topic-offset")
        val cf = Bytes.toBytes("cf")

        //2. Iterate over the topics
        for(topic <- topics) {
            //2.1 Build the rowkey
            val rk = s"${topic}-${group}".getBytes()
            //2.2 Read the partitions and their offsets for this rowkey
            val partition2Offsets = HBaseUtils.getColValue(connection, tableName, rk, cf)
            val partition2Offsets2 = JavaConversions.mapAsScalaMap(partition2Offsets)
            //2.3 Copy them into the Scala map (converting the Java map to a Scala one)
            for ((k, v) <- partition2Offsets2) {
                offsets.put(TopicAndPartition(topic, (k+"").toInt), v)
            }
        }
        HBaseUtils.release(connection)
        offsets.toMap
    }

    /**
     * Save the offsets back to HBase.
     */
    def storeOffsets(offsetRanges:Array[OffsetRange], group:String) = {
        //0. Get an HBase connection and the table/column-family handles
        val connection = HBaseUtils.getConnection
        val tableName = TableName.valueOf("spark-topic-offset")
        val cf = Bytes.toBytes("cf")
        //1. Iterate over the offset ranges of this batch
        for(offsetRange <- offsetRanges) {
            //2. Build the rowkey from topic and group, and get the partition and end offset
            val rk = s"${offsetRange.topic}-${group}".getBytes()
            val partition = offsetRange.partition
            val untilOffset = offsetRange.untilOffset

            //3. Write the result to HBase
            HBaseUtils.set(connection, tableName, rk, cf, (partition+"").getBytes(), (untilOffset+"").getBytes())
        }
        //4. Return the connection to the pool
        HBaseUtils.release(connection)
    }

}

3. Saving offsets in Redis

package sparkStrreaming.day3

import java.util

import kafka.common.TopicAndPartition
import kafka.message.MessageAndMetadata
import kafka.serializer.StringDecoder
import org.apache.spark.SparkConf
import org.apache.spark.streaming.dstream.InputDStream
import org.apache.spark.streaming.{Seconds, StreamingContext}
import utils.CommonUtils
import org.apache.spark.streaming.kafka.{HasOffsetRanges, KafkaUtils, OffsetRange}
import redis.clients.jedis.Jedis

import scala.collection.{JavaConversions, mutable}


object Demo8_SparkStreaming_Kafka_Redis {


  // Jedis client used to store the offsets; "mini1" is the Redis host
  val jedis = new Jedis("mini1", 6379)

  def main(args: Array[String]): Unit = {


    val ssc = new StreamingContext(new SparkConf().setMaster("local[*]").setAppName("redis"),Seconds(2))
    val kafkaParams: Map[String, String] = CommonUtils.toMap("demo6.properties")
    val topics="test1".split("\\s+").toSet

    val messages: InputDStream[(String, String)] = createMsg(ssc,kafkaParams,topics)

    messages.foreachRDD((rdd, btime) => {
      if (!rdd.isEmpty()) {
        println("-" * 100)
        println(s"btime=${btime}")
        println("#" * 50)
        println(rdd.count())
        updateOffsets(rdd.asInstanceOf[HasOffsetRanges].offsetRanges, kafkaParams("group.id"))
      }
    })


    ssc.start()
    ssc.awaitTermination()
  }



  def createMsg(ssc: StreamingContext, kafkaParams: Map[String, String], topics: Set[String]): InputDStream[(String, String)] = {
    val fromOffsets: Map[TopicAndPartition, Long] = getFromOffsets(topics, kafkaParams("group.id"))
    var messages: InputDStream[(String, String)] = null
    if (fromOffsets.isEmpty) { // nothing saved yet: start from the beginning
      messages = KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder](ssc, kafkaParams, topics)
    } else { // start from the offsets saved in Redis
      val msgHandler = (msgHandler: MessageAndMetadata[String, String]) => (msgHandler.key(), msgHandler.message())
      messages = KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder, (String, String)](ssc, kafkaParams, fromOffsets, msgHandler)
    }
    messages
  }


  def getFromOffsets(topics: Set[String], group: String): Map[TopicAndPartition, Long] = {
    val offsets = mutable.Map[TopicAndPartition, Long]()
    for (topic <- topics) {
      // offsets for each topic/group are stored in a Redis hash keyed by "topic_group"
      val key = s"${topic}_${group}"
      val partitionToOffset: util.Map[String, String] = jedis.hgetAll(key)
      for (partition <- JavaConversions.mapAsScalaMap(partitionToOffset)) {
        offsets.put(TopicAndPartition(topic, partition._1.toInt), partition._2.toLong)
      }
    }
    offsets.toMap
  }


  def updateOffsets(offsetRanges: Array[OffsetRange], group: String) = {
    for (offsetRange <- offsetRanges) {
      val topic = offsetRange.topic
      val partition = offsetRange.partition
      val offset = offsetRange.untilOffset
      // hash field = partition id, value = the batch's untilOffset
      jedis.hset(s"${topic}_${group}", partition.toString, offset.toString)
    }
  }

}
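
Finally, the offsets written by Demo8 can be inspected independently of the streaming job, since they are just fields of a Redis hash keyed by "topic_group". A small sketch follows; the host mini1 and topic test1 mirror the code above, while the group id is a placeholder for whatever group.id is set in demo6.properties:

import redis.clients.jedis.Jedis
import scala.collection.JavaConversions

object DumpRedisOffsets {
  def main(args: Array[String]): Unit = {
    val jedis = new Jedis("mini1", 6379)
    // key layout used by Demo8: hash "topic_group", field = partition, value = untilOffset
    val key = "test1_" + "my-group"   // replace "my-group" with the actual group.id
    for ((partition, offset) <- JavaConversions.mapAsScalaMap(jedis.hgetAll(key))) {
      println(s"partition=${partition} offset=${offset}")
    }
    jedis.close()
  }
}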

