package kafka2

import java.util

import org.apache.kafka.common.TopicPartition
import utils.Jpools

import scala.collection.mutable

/**
 * Reads the offsets stored in Redis for a given consumer group.
 */
object RedisOffset {
  def apply(groupid: String): mutable.Map[TopicPartition, Long] = {
    val fromRedisOffset = mutable.Map[TopicPartition, Long]()
    val jedis = Jpools.getJedis
    // Hash layout: key = group id, field = "topic:partition", value = offset
    val tpOffset: util.Map[String, String] = jedis.hgetAll(groupid)
    jedis.close() // return the connection to the pool as soon as the data is read
    import scala.collection.JavaConverters._
    val tpOffsetList: List[(String, String)] = tpOffset.asScala.toList
    for (topicPL <- tpOffsetList) {
      val split = topicPL._1.split("[:]")
      fromRedisOffset += (new TopicPartition(split(0), split(1).toInt) -> topicPL._2.toLong)
    }
    fromRedisOffset
  }
}
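
// A minimal inspection sketch, not part of the original job: it reads back the
// stored offsets using the group id from KafkaAndStreamingToRedis below, and
// prints nothing on a first run, before the hash exists.
object RedisOffsetDemo {
  def main(args: Array[String]): Unit = {
    val offsets = RedisOffset("day01_01")
    offsets.foreach { case (tp, offset) =>
      println(s"${tp.topic}:${tp.partition} -> $offset")
    }
  }
}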
package utils

import org.apache.commons.pool2.impl.GenericObjectPoolConfig
import redis.clients.jedis.{Jedis, JedisPool}

/**
 * Creates a Jedis connection pool.
 */
object Jpools {
  private val poolConfig = new GenericObjectPoolConfig()
  poolConfig.setMaxIdle(5)     // maximum number of idle connections
  poolConfig.setMaxTotal(2000) // maximum number of connections the pool supports
  // The pool itself does not need to be exposed outside this object
  private lazy val jedisPool = new JedisPool(poolConfig, "hadoop01")

  /**
   * Hands out a connection taken from the pool.
   * @return a Jedis connection pointed at database 2
   */
  def getJedis: Jedis = {
    val jedis = jedisPool.getResource
    jedis.select(2) // all reads and writes in this project use database index 2
    jedis
  }
}
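
// A minimal usage sketch, not part of the original code: take a connection from
// the pool and always hand it back in a finally block; close() on a pooled Jedis
// returns it to the pool rather than closing the socket. The key is hypothetical.
object JpoolsDemo {
  def main(args: Array[String]): Unit = {
    val jedis = Jpools.getJedis
    try {
      jedis.set("demo:key", "hello") // hypothetical key, for illustration only
      println(jedis.get("demo:key"))
    } finally {
      jedis.close()
    }
  }
}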
package kafka2

import java.lang

import org.apache.kafka.common.TopicPartition
import org.apache.kafka.common.serialization.StringDeserializer
import org.apache.spark.SparkConf
import org.apache.spark.streaming.kafka010.{ConsumerStrategies, HasOffsetRanges, KafkaUtils, LocationStrategies}
import org.apache.spark.streaming.{Seconds, StreamingContext}
import utils.Jpools

import scala.collection.mutable

/**
 * Consumes data from Kafka into Redis, with the offsets managed in Redis.
 */
object KafkaAndStreamingToRedis {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("KafkaAndStreamingToRedis").setMaster("local[*]")
    // Spark Streaming tuning parameters
    conf.set("spark.streaming.backpressure.enabled", "true")
    conf.set("spark.streaming.stopGracefullyOnShutdown", "true") // graceful shutdown
    conf.set("spark.streaming.kafka.maxRatePerPartition", "5")   // max records per batch = partitions * rate * batch seconds (3 * 5 * 2)
    val ssc = new StreamingContext(conf, Seconds(2))
    val groupid = "day01_01"
    // Kafka connection parameters for the consumer
    val kafkaParams: Map[String, Object] = Map[String, Object](
      "bootstrap.servers" -> "hadoop01:9092,hadoop02:9092,hadoop03:9092",
      "key.deserializer" -> classOf[StringDeserializer],
      "value.deserializer" -> classOf[StringDeserializer],
      "group.id" -> groupid,
      "auto.offset.reset" -> "earliest",
      // "auto.commit.interval.ms" -> "1000", // would commit offsets every second; the default is 5 seconds
      "enable.auto.commit" -> (false: lang.Boolean) // do not let Kafka auto-commit offsets; Redis manages them
    )
    // Topic(s) to subscribe to
    val topic = "wordcount"
    val topics = Array(topic)
    val offsetManage: mutable.Map[TopicPartition, Long] = RedisOffset(groupid)
    // If Redis already holds offsets for this group, resume from them;
    // otherwise fall back to auto.offset.reset
    val stream = if (offsetManage.nonEmpty) {
      KafkaUtils.createDirectStream[String, String](
        ssc,
        LocationStrategies.PreferConsistent,
        ConsumerStrategies.Subscribe[String, String](topics, kafkaParams, offsetManage)
      )
    } else {
      KafkaUtils.createDirectStream[String, String](
        ssc,
        LocationStrategies.PreferConsistent,
        ConsumerStrategies.Subscribe[String, String](topics, kafkaParams)
      )
    }
    stream.foreachRDD(rdd => {
      val jedis = Jpools.getJedis
      val offsetRanges = rdd.asInstanceOf[HasOffsetRanges].offsetRanges
      // Count words in this batch and collect to the driver, so the results and
      // the offsets can be written over a single Redis connection
      val reduced: Array[(String, Int)] = rdd.map(crv => (crv.value(), 1)).reduceByKey(_ + _).collect()
      // Open a transaction; every command must be issued on this Transaction
      // object over the same connection for the transaction to apply
      val transaction = jedis.multi()
      try {
        for (tp <- reduced) {
          transaction.hincrBy("wordcount", tp._1, tp._2)
        }
        for (o <- offsetRanges) {
          // Store the offset under field "topic:partition", the layout RedisOffset reads
          transaction.hset(groupid, o.topic + ":" + o.partition, o.untilOffset.toString)
        }
        // Results and offsets are committed together, or not at all
        transaction.exec()
      } catch {
        case e: Exception =>
          println("Batch failed, rolling back: " + e.getMessage)
          transaction.discard()
      } finally {
        jedis.close()
      }
      // Requirement: count the strings coming from Kafka and write the totals to Redis
      /**
       * 1. reduceByKey first
       * 2. foreachPartition: open a Redis connection per partition, then
       *    foreach over the partition's records and write each one to Redis
       * Problem: the job could crash after the counts are written but before the
       * offsets are stored, which is why the transaction above is needed
       */
      /* rdd.map(crv => (crv.value(), 1)).reduceByKey(_ + _).foreachPartition(partition => {
        val jedis = Jpools.getJedis
        partition.foreach(tp => {
          jedis.hincrBy("wordcount", tp._1, tp._2)
        })
        jedis.close()
      }) */
    })
    ssc.start()
    ssc.awaitTermination()
  }
}
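
// build.sbt — a hedged sketch, not from the original post: the dependencies this
// code relies on; the versions are assumptions and should be matched to the cluster.
libraryDependencies ++= Seq(
  "org.apache.spark" %% "spark-streaming" % "2.3.0",            // assumed Spark 2.x
  "org.apache.spark" %% "spark-streaming-kafka-0-10" % "2.3.0", // matches the kafka010 imports
  "redis.clients" % "jedis" % "2.9.0"                           // assumed Jedis 2.x
)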