Saving Spark Streaming Kafka offsets in ZooKeeper, HBase, and Redis
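All three approaches below share the same driver skeleton: the Kafka parameters come from CommonUtils.toMap("demo6.properties") and, in the first two examples, the StreamingContext comes from SparkUtils.getLocalStreamingContext. Neither helper is shown in this post. The sketch below is a plausible minimal equivalent, not the original code: the object name HelpersSketch is made up, and the assumption that demo6.properties supplies at least metadata.broker.list and group.id is mine.

import java.util.Properties
import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}
import scala.collection.JavaConverters._

// Hypothetical stand-in for cn.qphone.spark.utils.{CommonUtils, SparkUtils}, which the post does not show.
object HelpersSketch {

    // Load a properties file from the classpath into an immutable Map[String, String]
    // (demo6.properties is assumed to hold at least metadata.broker.list and group.id).
    def toMap(resource: String): Map[String, String] = {
        val props = new Properties()
        val in = Thread.currentThread().getContextClassLoader.getResourceAsStream(resource)
        try props.load(in) finally in.close()
        props.asScala.toMap
    }

    // Build a local StreamingContext with the given app name and batch interval in seconds.
    def getLocalStreamingContext(appName: String, batchSeconds: Int): StreamingContext = {
        val conf = new SparkConf().setMaster("local[*]").setAppName(appName)
        new StreamingContext(conf, Seconds(batchSeconds))
    }
}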

1. ZooKeeper


    
    
import cn.qphone.spark.`trait`.LoggerTrait
import cn.qphone.spark.utils.{CommonUtils, SparkUtils}
import kafka.common.TopicAndPartition
import kafka.message.MessageAndMetadata
import kafka.serializer.StringDecoder
import org.apache.curator.framework.CuratorFrameworkFactory
import org.apache.curator.retry.ExponentialBackoffRetry
import org.apache.spark.streaming.StreamingContext
import org.apache.spark.streaming.dstream.InputDStream
import org.apache.spark.streaming.kafka.{HasOffsetRanges, KafkaUtils, OffsetRange}
import scala.collection.{JavaConversions, mutable}

object Demo6_SparkStreaming_Kafka_Zookeeper extends LoggerTrait {

    // ZooKeeper client (Curator)
    val client = {
        val client = CuratorFrameworkFactory.builder()
            .connectString("hbase1,hbase2,hbase3")
            .retryPolicy(new ExponentialBackoffRetry(1000, 3))
            .namespace("kafka/consumers/offsets")
            .build()
        client.start()
        client
    }

    def main(args: Array[String]): Unit = {
        //1. Entry point
        val ssc = SparkUtils.getLocalStreamingContext("Demo6_SparkStreaming_Kafka_Zookeeper", 2)
        val kafkaParams = CommonUtils.toMap("demo6.properties")
        val topics = "bjbigdata1909-1".split(",").toSet
        //2. Load the data
        val messages: InputDStream[(String, String)] = createMsg(ssc, kafkaParams, topics)
        //3. Process each batch
        messages.foreachRDD((rdd, btime) => {
            if (!rdd.isEmpty()) {
                //3.1 Print the batch time and record count
                println("-" * 100)
                println(s"bTime = ${btime}")
                println("#" * 50 + " " + rdd.count())
                //3.2 Save the latest offsets back to ZooKeeper
                storeOffsets(rdd.asInstanceOf[HasOffsetRanges].offsetRanges, kafkaParams("group.id"))
            }
        })
        ssc.start()
        ssc.awaitTermination()
    }

    /**
     * Read the manually saved offsets from ZooKeeper and start consuming from those positions;
     * if no offsets are found, start consuming from the beginning.
     */
    def createMsg(ssc: StreamingContext, kafkaParams: Map[String, String], topics: Set[String]): InputDStream[(String, String)] = {
        //1. Read the offsets from ZooKeeper
        val fromOffsets: Map[TopicAndPartition, Long] = getFromOffsets(topics, kafkaParams("group.id"))
        //2. Pull the data from Kafka
        var messages: InputDStream[(String, String)] = null
        //2.1 Decide where to start
        if (fromOffsets.isEmpty) { // no offsets saved yet: read from the beginning
            messages = KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder](ssc, kafkaParams, topics)
        } else { // offsets found: read from the saved positions
            //2.2 Build the messageHandler
            val messageHandler = (msgHandler: MessageAndMetadata[String, String]) => (msgHandler.key(), msgHandler.message())
            //2.3 Read starting at the saved offsets
            messages = KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder, (String, String)](ssc, kafkaParams, fromOffsets, messageHandler)
        }
        messages
    }

    /**
     * Look up the offsets for a topic and consumer group.
     * With the namespace kafka/consumers/offsets set on the client, the offsets live under
     * the znode path /kafka/consumers/offsets/<topic>/<group>/<partition>,
     * and each partition node stores the offset itself.
     */
    def getFromOffsets(topics: Set[String], group: String): Map[TopicAndPartition, Long] = {
        //1. A map to collect the offsets
        val offsets = mutable.Map[TopicAndPartition, Long]()
        //2. Iterate over the topics
        for (topic <- topics) {
            //2.1 The ZooKeeper path for this topic/group
            val path = s"${topic}/${group}"
            //2.2 Make sure the path exists
            isExists(path)
            //2.3 Iterate over the partitions (converting the Java list to a Scala collection)
            for (partition <- JavaConversions.asScalaBuffer(client.getChildren.forPath(path))) {
                //2.3.1 The full path that holds this partition's offset
                val fullPath = s"${path}/${partition}"
                //2.3.2 Read the offset
                val offset = new String(client.getData.forPath(fullPath)).toLong
                //2.3.3 Collect it
                offsets.put(TopicAndPartition(topic, partition.toInt), offset)
            }
        }
        offsets.toMap
    }

    /**
     * Check whether a znode exists and create it (with parents) if it does not.
     */
    def isExists(path: String): Unit = {
        if (client.checkExists().forPath(path) == null) { // the path does not exist
            client.create().creatingParentsIfNeeded().forPath(path)
        }
    }

    /**
     * Save the offsets back to ZooKeeper.
     */
    def storeOffsets(offsetRanges: Array[OffsetRange], group: String) = {
        //1. Iterate over the offset ranges
        for (offsetRange <- offsetRanges) {
            //2. Topic, partition, and end offset of this range
            val topic = offsetRange.topic
            val partition = offsetRange.partition
            val untilOffset = offsetRange.untilOffset
            //3. The znode that stores this partition's offset
            val path = s"${topic}/${group}/${partition}"
            isExists(path)
            //4. Write the offset to the partition node
            client.setData().forPath(path, untilOffset.toString.getBytes())
        }
    }
}
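Once a few batches have run, the saved offsets can be read back with the same getFromOffsets helper to confirm they advance. A quick check, assuming it lives in the same package as the object above; the group id "demo6-group" is a placeholder for whatever group.id is set in demo6.properties:

object Demo6_CheckZkOffsets {
    def main(args: Array[String]): Unit = {
        // "demo6-group" is a placeholder: use the group.id configured in demo6.properties
        val offsets = Demo6_SparkStreaming_Kafka_Zookeeper.getFromOffsets(Set("bjbigdata1909-1"), "demo6-group")
        offsets.foreach { case (tp, offset) => println(s"${tp.topic}/${tp.partition} -> ${offset}") }
    }
}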

 

 

2. HBase

1. Utility class


    
    
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.Cell;
import org.apache.hadoop.hbase.CellUtil;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.*;
import org.apache.hadoop.hbase.filter.BinaryComparator;
import org.apache.hadoop.hbase.filter.CompareFilter;
import org.apache.hadoop.hbase.filter.Filter;
import org.apache.hadoop.hbase.filter.RowFilter;

import java.util.HashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;

/**
 * Client utility class for HBase connections.
 */
public class HBaseUtils {

    // pool holds the pre-created HBase connection objects
    private static LinkedList<Connection> pool = new LinkedList<>();

    // initialize the pool with 5 HBase connections
    static {
        try {
            Configuration conf = HBaseConfiguration.create(); // load the HBase defaults, then override below
            conf.set("hbase.rootdir", "hdfs://hbase1:9000");
            conf.set("hbase.cluster.distributed", "true");
            conf.set("hbase.zookeeper.quorum", "hbase1,hbase2,hbase3");
            conf.set("hbase.regionserver.wal.codec", "org.apache.hadoop.hbase.regionserver.wal.IndexedWALEditCodec");
            for (int i = 0; i < 5; i++) {
                pool.push(ConnectionFactory.createConnection(conf));
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    // borrow a connection from the pool
    public static Connection getConnection() {
        while (pool.isEmpty()) {
            try {
                System.out.println("connection pool is null, please wait for a moment~~~");
                Thread.sleep(1000);
            } catch (Exception e) {
                e.printStackTrace();
            }
        }
        return pool.poll();
    }

    // return a connection to the pool
    public static void release(Connection connection) {
        pool.push(connection);
    }

    // read the partition -> offset values stored under the given row key and column family
    public static Map<Integer, Long> getColValue(Connection connection, TableName tableName, byte[] rk, byte[] cf) {
        //1. map that holds the final result
        Map<Integer, Long> partition2Offset = new HashMap<>();
        try {
            //2. get the table
            Table table = connection.getTable(tableName);
            Scan scan = new Scan();
            //3. only scan the row whose key equals rk
            Filter filter = new RowFilter(CompareFilter.CompareOp.EQUAL, new BinaryComparator(rk));
            scan.setFilter(filter);
            //4. create the scanner
            ResultScanner scanner = table.getScanner(scan);
            //5. iterate over the results
            for (Result result : scanner) {
                List<Cell> cells = result.listCells(); // each cell is a (column, value) pair
                for (Cell cell : cells) {
                    // column qualifier = partition
                    byte[] column = CellUtil.cloneQualifier(cell);
                    // cell value = offset
                    byte[] values = CellUtil.cloneValue(cell);
                    int partition = Integer.valueOf(new String(column));
                    long offset = Long.valueOf(new String(values));
                    partition2Offset.put(partition, offset);
                }
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
        // return the (possibly empty) map rather than null so callers do not have to null-check
        return partition2Offset;
    }

    // write a single column/value (partition/offset) under the given row key and column family
    public static void set(Connection connection, TableName tableName, byte[] rk, byte[] cf, byte[] col, byte[] value) {
        try {
            Table table = connection.getTable(tableName);
            Put put = new Put(rk);
            put.addColumn(cf, col, value);
            table.put(put);
            table.close();
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}
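The driver in the next subsection expects a table created with create 'spark-topic-offset', 'cf' in the HBase shell. For reference, a sketch of creating the same table through the HBase 1.x Admin API used by the utility class above; the object name and structure are illustrative only, and HBaseUtils is assumed to live in cn.qphone.hbase.utils as the driver below imports it.

import cn.qphone.hbase.utils.HBaseUtils
import org.apache.hadoop.hbase.{HColumnDescriptor, HTableDescriptor, TableName}

object CreateOffsetTableSketch {
    def main(args: Array[String]): Unit = {
        val connection = HBaseUtils.getConnection        // borrow a pooled connection from HBaseUtils
        val admin = connection.getAdmin
        val tableName = TableName.valueOf("spark-topic-offset")
        try {
            if (!admin.tableExists(tableName)) {
                // same effect as the shell command: create 'spark-topic-offset', 'cf'
                val desc = new HTableDescriptor(tableName)
                desc.addFamily(new HColumnDescriptor("cf"))
                admin.createTable(desc)
            }
        } finally {
            admin.close()
            HBaseUtils.release(connection)               // return the connection to the pool
        }
    }
}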

2. Driver code


    
    
import cn.qphone.hbase.utils.HBaseUtils
import cn.qphone.spark.`trait`.LoggerTrait
import cn.qphone.spark.utils.{CommonUtils, SparkUtils}
import kafka.common.TopicAndPartition
import kafka.message.MessageAndMetadata
import kafka.serializer.StringDecoder
import org.apache.hadoop.hbase.TableName
import org.apache.hadoop.hbase.util.Bytes
import org.apache.spark.streaming.StreamingContext
import org.apache.spark.streaming.dstream.InputDStream
import org.apache.spark.streaming.kafka.{HasOffsetRanges, KafkaUtils, OffsetRange}
import scala.collection.{JavaConversions, mutable}

/**
 * 1. Use HBase to manage the offsets manually so the stream resumes where it left off:
 *    - offsets found: start consuming from the saved positions
 *    - no offsets: start consuming from offset 0
 * 2. Pull data from Kafka starting at those offsets
 * 3. Run the business logic on each batch
 * 4. Write the new offsets back to HBase
 *
 * create 'spark-topic-offset', 'cf'
 *
 * rowkey:  topic-group
 * column:  partition:offset
 */
object Demo7_SparkStreaming_Kafka_HBase extends LoggerTrait {

    def main(args: Array[String]): Unit = {
        //1. Entry point
        val ssc = SparkUtils.getLocalStreamingContext("Demo7_SparkStreaming_Kafka_HBase", 2)
        val kafkaParams = CommonUtils.toMap("demo6.properties")
        val topics = "bjbigdata1909-1".split(",").toSet
        //2. Load the data
        val messages: InputDStream[(String, String)] = createMsg(ssc, kafkaParams, topics)
        //3. Process each batch
        messages.foreachRDD((rdd, btime) => {
            if (!rdd.isEmpty()) {
                //3.1 Print the batch time and record count
                println("-" * 100)
                println(s"bTime = ${btime}")
                println("#" * 50 + " " + rdd.count())
                //3.2 Save the latest offsets to HBase
                storeOffsets(rdd.asInstanceOf[HasOffsetRanges].offsetRanges, kafkaParams("group.id"))
            }
        })
        ssc.start()
        ssc.awaitTermination()
    }

    /**
     * Read the manually saved offsets from HBase and start consuming from those positions;
     * if no offsets are found, start consuming from the beginning.
     */
    def createMsg(ssc: StreamingContext, kafkaParams: Map[String, String], topics: Set[String]): InputDStream[(String, String)] = {
        //1. Read the offsets from HBase
        val fromOffsets: Map[TopicAndPartition, Long] = getFromOffsets(topics, kafkaParams("group.id"))
        //2. Pull the data from Kafka
        var messages: InputDStream[(String, String)] = null
        //2.1 Decide where to start
        if (fromOffsets.isEmpty) { // no offsets saved yet: read from the beginning
            messages = KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder](ssc, kafkaParams, topics)
        } else { // offsets found: read from the saved positions
            //2.2 Build the messageHandler
            val messageHandler = (msgHandler: MessageAndMetadata[String, String]) => (msgHandler.key(), msgHandler.message())
            //2.3 Read starting at the saved offsets
            messages = KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder, (String, String)](ssc, kafkaParams, fromOffsets, messageHandler)
        }
        messages
    }

    /**
     * Look up the offsets for a topic and consumer group.
     * Each row key is "<topic>-<group>"; the column qualifier is the partition and the cell value is the offset.
     */
    def getFromOffsets(topics: Set[String], group: String): Map[TopicAndPartition, Long] = {
        //1. A map to collect the offsets
        val offsets = mutable.Map[TopicAndPartition, Long]()
        //1.1 Borrow an HBase connection
        val connection = HBaseUtils.getConnection
        val tableName = TableName.valueOf("spark-topic-offset")
        val cf = Bytes.toBytes("cf")
        //2. Iterate over the topics
        for (topic <- topics) {
            //2.1 Row key for this topic/group
            val rk = s"${topic}-${group}".getBytes()
            //2.2 Read the partition -> offset cells for this row
            val partition2Offsets = HBaseUtils.getColValue(connection, tableName, rk, cf)
            val partition2Offsets2 = JavaConversions.mapAsScalaMap(partition2Offsets)
            //2.3 Collect them (converting the Java map to a Scala map)
            for ((k, v) <- partition2Offsets2) {
                offsets.put(TopicAndPartition(topic, (k + "").toInt), v)
            }
        }
        HBaseUtils.release(connection)
        offsets.toMap
    }

    /**
     * Save the offsets back to HBase.
     */
    def storeOffsets(offsetRanges: Array[OffsetRange], group: String) = {
        //0. Borrow an HBase connection
        val connection = HBaseUtils.getConnection
        val tableName = TableName.valueOf("spark-topic-offset")
        val cf = Bytes.toBytes("cf")
        //1. Iterate over the offset ranges
        for (offsetRange <- offsetRanges) {
            //2. Row key, partition, and end offset of this range
            val rk = s"${offsetRange.topic}-${group}".getBytes()
            val partition = offsetRange.partition
            val untilOffset = offsetRange.untilOffset
            //3. Write the offset cell to HBase
            HBaseUtils.set(connection, tableName, rk, cf, (partition + "").getBytes(), (untilOffset + "").getBytes())
        }
        //4. Return the connection to the pool so it is not leaked on every batch
        HBaseUtils.release(connection)
    }
}
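As with the ZooKeeper version, the stored row can be dumped after a few batches to verify that the offsets move forward. A small check using HBaseUtils directly; again, the group id is a placeholder for the group.id in demo6.properties:

import cn.qphone.hbase.utils.HBaseUtils
import org.apache.hadoop.hbase.TableName
import org.apache.hadoop.hbase.util.Bytes
import scala.collection.JavaConversions

object Demo7_CheckHBaseOffsets {
    def main(args: Array[String]): Unit = {
        val group = "demo6-group"                       // placeholder: use the group.id from demo6.properties
        val connection = HBaseUtils.getConnection
        val tableName = TableName.valueOf("spark-topic-offset")
        val rk = s"bjbigdata1909-1-${group}".getBytes()
        val stored = HBaseUtils.getColValue(connection, tableName, rk, Bytes.toBytes("cf"))
        JavaConversions.mapAsScalaMap(stored).foreach { case (p, o) => println(s"partition ${p} -> offset ${o}") }
        HBaseUtils.release(connection)
    }
}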

3. Redis


    
    
package sparkStrreaming.day3

import java.util

import kafka.common.TopicAndPartition
import kafka.message.MessageAndMetadata
import kafka.serializer.StringDecoder
import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.streaming.dstream.InputDStream
import org.apache.spark.streaming.kafka.{HasOffsetRanges, KafkaUtils, OffsetRange}
import redis.clients.jedis.Jedis
import utils.CommonUtils

import scala.collection.{JavaConversions, mutable}

object Demo8_SparkStreaming_Kafka_Redis {

    // Redis client
    val jedis = new Jedis("mini1", 6379)

    def main(args: Array[String]): Unit = {
        val ssc = new StreamingContext(new SparkConf().setMaster("local[*]").setAppName("redis"), Seconds(2))
        val kafkaParams: Map[String, String] = CommonUtils.toMap("demo6.properties")
        val topics = "test1".split("\\s+").toSet
        val messages: InputDStream[(String, String)] = createMsg(ssc, kafkaParams, topics)
        messages.foreachRDD((rdd, btime) => {
            if (!rdd.isEmpty()) {
                println("-" * 100)
                println(s"btime=${btime}")
                println("#" * 50)
                println(rdd.count())
                // save the latest offsets back to Redis
                updateOffsets(rdd.asInstanceOf[HasOffsetRanges].offsetRanges, kafkaParams("group.id"))
            }
        })
        ssc.start()
        ssc.awaitTermination()
    }

    /**
     * Read the saved offsets from Redis and start consuming from those positions;
     * if none are found, start consuming from the beginning.
     */
    def createMsg(ssc: StreamingContext, kafkaParams: Map[String, String], topics: Set[String]): InputDStream[(String, String)] = {
        val fromOffsets: Map[TopicAndPartition, Long] = getFromOffsets(topics, kafkaParams("group.id"))
        var messages: InputDStream[(String, String)] = null
        if (fromOffsets.isEmpty) {
            messages = KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder](ssc, kafkaParams, topics)
        } else {
            val msgHandler = (msgHandler: MessageAndMetadata[String, String]) => (msgHandler.key(), msgHandler.message())
            messages = KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder, (String, String)](ssc, kafkaParams, fromOffsets, msgHandler)
        }
        messages
    }

    /**
     * Look up the offsets for a topic and consumer group.
     * The offsets are stored in a Redis hash whose key is "<topic>_<group>";
     * each field is a partition and each value is the offset.
     */
    def getFromOffsets(topics: Set[String], group: String): Map[TopicAndPartition, Long] = {
        val offsets = mutable.Map[TopicAndPartition, Long]()
        for (topic <- topics) {
            val key = s"${topic}_${group}"
            // val str: String = jedis.get(key).toString
            val string: util.Map[String, String] = jedis.hgetAll(key)
            for (partition <- JavaConversions.mapAsScalaMap(string)) {
                offsets.put(TopicAndPartition(topic, partition._1.toInt), partition._2.toLong)
            }
        }
        offsets.toMap
    }

    /**
     * Save the offsets back to Redis.
     */
    def updateOffsets(offsetRanges: Array[OffsetRange], group: String) = {
        for (offsetRange <- offsetRanges) {
            val topic = offsetRange.topic
            val partition = offsetRange.partition
            val offset = offsetRange.untilOffset
            jedis.hset(s"${topic}_${group}", partition.toString, offset.toString)
        }
    }
}
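The offsets end up in a Redis hash whose key is "<topic>_<group>", with one field per partition. A quick read-back sketch with Jedis, using the same host mini1 and port 6379 as above; the group id is a placeholder for the group.id in demo6.properties:

import java.util

import redis.clients.jedis.Jedis
import scala.collection.JavaConversions

object Demo8_CheckRedisOffsets {
    def main(args: Array[String]): Unit = {
        val jedis = new Jedis("mini1", 6379)
        val group = "demo6-group"                                  // placeholder: use the group.id from demo6.properties
        val stored: util.Map[String, String] = jedis.hgetAll(s"test1_${group}")  // field = partition, value = untilOffset
        JavaConversions.mapAsScalaMap(stored).foreach { case (p, o) => println(s"partition ${p} -> offset ${o}") }
        jedis.close()
    }
}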

 

 

 
