Kafka-Streaming-Redis (offset)

This article shows how to combine a Kafka-based Spark Streaming application with Redis to manage consumer offsets: how the offsets are stored and read back, and why this integration is useful for real-time data processing.
package kafka2

import java.util

import org.apache.kafka.common.TopicPartition
import utils.Jpools
import scala.collection.mutable


/**
  * Read the offsets stored in Redis for a consumer group.
  * The hash key is the group id; each field is "topic:partition" and
  * each value is the offset to resume from.
  */
object RedisOffset {
  def apply(groupid: String): mutable.Map[TopicPartition, Long] = {
    val fromRedisOffset = mutable.Map[TopicPartition, Long]()
    val jedis = Jpools.getJedis
    val tpOffset: util.Map[String, String] = jedis.hgetAll(groupid)
    jedis.close()
    import scala.collection.JavaConverters._
    for ((tp, offset) <- tpOffset.asScala) {
      val Array(topic, partition) = tp.split(":")
      fromRedisOffset += new TopicPartition(topic, partition.toInt) -> offset.toLong
    }
    fromRedisOffset
  }
}
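For reference, a minimal sketch (in the same kafka2 package, using hypothetical topic, partition, and offset values) of how an entry ends up in this hash and is read back:

package kafka2

import utils.Jpools

object SeedOffsetExample {
  def main(args: Array[String]): Unit = {
    val jedis = Jpools.getJedis
    // field is "topic:partition", value is the next offset to consume (hypothetical values)
    jedis.hset("day01_01", "wordcount:0", "128")
    jedis.close()
    // prints Map(wordcount-0 -> 128)
    println(RedisOffset("day01_01"))
  }
}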
package utils

import org.apache.commons.pool2.impl.GenericObjectPoolConfig
import redis.clients.jedis.{Jedis, JedisPool}

/**
  * Jedis connection pool.
  */
object Jpools {
  private val poolConfig = new GenericObjectPoolConfig()
  poolConfig.setMaxIdle(5)     // maximum number of idle connections
  poolConfig.setMaxTotal(2000) // maximum number of connections in the pool
  // the pool itself is not exposed outside this object
  private lazy val jedisPool = new JedisPool(poolConfig, "hadoop01")

  /**
    * Hand out a connection from the pool.
    * @return a Jedis connection with database 2 selected
    */
  def getJedis: Jedis = {
    val jedis = jedisPool.getResource
    jedis.select(2)
    jedis
  }
}
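One detail worth noting: when a Jedis instance comes from a pool, close() returns the connection to the pool rather than closing the socket, so every getJedis call should be paired with close(). A minimal usage sketch (the key "demo" is a made-up example):

package utils

object JpoolsUsageExample {
  def main(args: Array[String]): Unit = {
    val jedis = Jpools.getJedis
    try {
      jedis.set("demo", "value") // any Redis command, executed against database 2
    } finally {
      jedis.close() // hands the connection back to the pool
    }
  }
}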

 

package kafka2

import java.lang

import org.apache.kafka.common.TopicPartition
import org.apache.kafka.common.serialization.StringDeserializer
import org.apache.spark.SparkConf
import org.apache.spark.streaming.kafka010.{ConsumerStrategies, HasOffsetRanges, KafkaUtils, LocationStrategies}
import org.apache.spark.streaming.{Seconds, StreamingContext}
import utils.Jpools

import scala.collection.mutable

/**
  * Consume data from Kafka into Redis, with the offsets managed in Redis.
  */
object KafkaAndStreamingToRedis {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("KafkaAndStreamingToRedis").setMaster("local[*]")
    // Spark Streaming tuning parameters
    conf.set("spark.streaming.backpressure.enabled", "true")
    conf.set("spark.streaming.stopGracefullyOnShutdown", "true") // stop gracefully on shutdown
    conf.set("spark.streaming.kafka.maxRatePerPartition", "5")   // max records per partition per second (3 partitions * 5 * 2s batch)

    val ssc = new StreamingContext(conf,Seconds(2))
    val groupid="day01_01"

    // Kafka connection parameters for the consumer
    val kafkaParams: Map[String, Object] = Map[String, Object](
      "bootstrap.servers" -> "hadoop01:9092,hadoop02:9092,hadoop03:9092",
      "key.deserializer" -> classOf[StringDeserializer],
      "value.deserializer" -> classOf[StringDeserializer],
      "group.id" -> groupid,
      "auto.offset.reset" -> "earliest",
      //"auto.commit.interval.ms"-> "1000",设置为1秒提交一次offset,默认是5秒
      "enable.auto.commit" -> (false: lang.Boolean) //是否自动递交偏移量
    )
    // topic to subscribe to
    val topic="wordcount"
    val topics= Array(topic)

    // If Redis already holds offsets for this group, resume from them;
    // otherwise start according to auto.offset.reset ("earliest")
    val offsetManage: mutable.Map[TopicPartition, Long] = RedisOffset(groupid)
    val stream = if (offsetManage.nonEmpty) {
      KafkaUtils.createDirectStream[String, String](
        ssc,
        LocationStrategies.PreferConsistent,
        ConsumerStrategies.Subscribe[String, String](topics, kafkaParams, offsetManage)
      )
    } else {
      KafkaUtils.createDirectStream[String, String](
        ssc,
        LocationStrategies.PreferConsistent,
        ConsumerStrategies.Subscribe[String, String](topics, kafkaParams)
      )
    }
    stream.foreachRDD(rdd => {
      val jedis = Jpools.getJedis
      val offsetRanges = rdd.asInstanceOf[HasOffsetRanges].offsetRanges

      val reduced: Array[(String, Int)] = rdd.map(crv => (crv.value(), 1)).reduceByKey(_ + _).collect()
      // Open a transaction so the word counts and the offsets are written atomically
      // (all commands of a transaction must go through the same connection)
      val transaction = jedis.multi()
      try {
        for (tp <- reduced) {
          transaction.hincrBy("wordcount", tp._1, tp._2)
        }
        for (o <- offsetRanges) {
          // store the offset under "topic:partition"
          transaction.hset(groupid, o.topic + ":" + o.partition, o.untilOffset.toString)
        }
        transaction.exec()
      } catch {
        case e: Exception =>
          println("Write failed, discarding the transaction: " + e.getMessage)
          // discard() drops the queued commands; nothing has been applied yet
          transaction.discard()
      }
      jedis.close()
      // Requirement: count the strings coming from Kafka and write the counts to Redis
      /**
        * 1. reduceByKey first
        * 2. foreachPartition: get one Redis connection per partition, then
        *    foreach over the partition's records and write them to Redis
        * Problem: the process may crash after the data is written but before
        * the offsets are saved, which is why the transaction above is needed
        */
  /*    rdd.map(crv => (crv.value(), 1)).reduceByKey(_ + _).foreachPartition(partition => {
        val jedis = Jpools.getJedis
        partition.foreach(tp => {
          jedis.hincrBy("wordcount", tp._1, tp._2)
        })
        jedis.close()
      })*/

    })

    ssc.start()
    ssc.awaitTermination()
  }
}
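Once a few batches have run, both hashes can be inspected to confirm that the counts and the offsets were committed together (a sketch, assuming the key names used above):

package kafka2

import utils.Jpools

object CheckResults {
  def main(args: Array[String]): Unit = {
    val jedis = Jpools.getJedis
    println(jedis.hgetAll("wordcount")) // word -> accumulated count
    println(jedis.hgetAll("day01_01"))  // "topic:partition" -> untilOffset
    jedis.close()
  }
}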

 
