package com.gc.sparkStreaming.day01
import kafka.common.TopicAndPartition
import kafka.message.MessageAndMetadata
import kafka.serializer.StringDecoder
import org.apache.kafka.clients.consumer.ConsumerConfig
import org.apache.spark.SparkConf
import org.apache.spark.streaming.dstream.{DStream, InputDStream}
import org.apache.spark.streaming.kafka.KafkaCluster.Err
import org.apache.spark.streaming.kafka.{HasOffsetRanges, KafkaCluster, KafkaUtils, OffsetRange}
import org.apache.spark.streaming.{Seconds, StreamingContext}
import scala.collection.immutable.HashMap
// Manually manage Kafka offsets for a direct stream: read the group's last committed offsets before
// creating the stream, then commit the new offsets back to Kafka after every batch.
object KafkaUpdateOffset {
private val group: String = "guochao"
private val brokers = "hadoop102:9092,hadoop103:9092,hadoop104:9092"
private val topic: String = "first"
private val kafkaParams = Map(
ConsumerConfig.GROUP_ID_CONFIG -> group,
ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG -> brokers
)
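// The same kafkaParams map is used twice: by KafkaCluster below for reading and committing the
// group's offsets, and by KafkaUtils.createDirectStream in main() for the stream itself.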
// Client for talking to the Kafka cluster: used to read and write this group's consumer offsets
private val kafkaCluster: KafkaCluster = new KafkaCluster(kafkaParams)
// Read the offsets to start consuming from: the group's last committed offsets,
// or 0 for every partition if the group has never consumed this topic
def getOffSet(): Map[TopicAndPartition, Long] = {
var resMap: Map[TopicAndPartition, Long] = HashMap[TopicAndPartition, Long]()
// Resolve all partitions of the topic
val kafkaClusterMessageEither: Either[Err, Set[TopicAndPartition]] = kafkaCluster.getPartitions(Set(topic))
// KafkaCluster calls return an Either: by convention Left carries the error, Right the successful result
kafkaClusterMessageEither match {
case Right(topicAndPartitions) => { // partitions resolved successfully
// Look up the offsets this consumer group last committed for those partitions
val offsetMap: Either[Err, Map[TopicAndPartition, Long]] = kafkaCluster.getConsumerOffsets(group, topicAndPartitions)
if (offsetMap.isRight) { // committed offsets exist, so the group has consumed this topic before
val map: Map[TopicAndPartition, Long] = offsetMap.right.get
resMap ++= map
} else { // never consumed: start every partition at offset 0
topicAndPartitions.foreach(t => {
resMap += t -> 0L
})
}
}
case _ => // could not resolve the partitions; fall through and return the empty map
}
resMap
}
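// For illustration only: with a two-partition topic, the map returned above might look like
//   Map(TopicAndPartition("first", 0) -> 42L, TopicAndPartition("first", 1) -> 17L)
// (the offset values are made up; they are simply whatever was last committed for the group).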
// Manually commit the per-partition offsets back to Kafka after every batch
def updateOffset(ds: InputDStream[String]): Unit = {
// Offsets have to be committed for every batch, so the commit lives inside foreachRDD.
// Committing each batch's upper offsets this way gives at-most-once semantics; the offsets
// themselves are stored in Kafka's own offset storage (its internal offsets topic).
ds.foreachRDD(rdd => {
var map: Map[TopicAndPartition, Long] = HashMap[TopicAndPartition, Long]()
val hasOffsetRanges: HasOffsetRanges = rdd.asInstanceOf[HasOffsetRanges]
val ranges: Array[OffsetRange] = hasOffsetRanges.offsetRanges
// println(ranges.mkString(","))
// Record, for each partition, the upper bound (untilOffset) of the offsets read in this batch
ranges.foreach(offsetRange => {
map += offsetRange.topicAndPartition() -> offsetRange.untilOffset
})
// Commit the collected offsets for this consumer group
kafkaCluster.setConsumerOffsets(group, map)
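// setConsumerOffsets returns an Either as well; this example ignores it, but a check for a
// failed commit (a Left result) could be added here if commit errors need to be surfaced.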
})
}
def main(args: Array[String]): Unit = {
val conf = new SparkConf().setMaster("local[2]").setAppName("KafkaUpdateOffset")
val streamingContext: StreamingContext = new StreamingContext(conf, Seconds(3))
// Signature of the KafkaUtils.createDirectStream overload used below (kept for reference):
// def createDirectStream[
// K: ClassTag,
// V: ClassTag,
// KD <: Decoder[K]: ClassTag,
// VD <: Decoder[V]: ClassTag,
// R: ClassTag] (
// ssc: StreamingContext,
// kafkaParams: Map[String, String],
// fromOffsets: Map[TopicAndPartition, Long],
// messageHandler: MessageAndMetadata[K, V] => R
// ): InputDStream[R] = {
// val cleanedHandler = ssc.sc.clean(messageHandler)
// new DirectKafkaInputDStream[K, V, KD, VD, R](
// ssc, kafkaParams, fromOffsets, cleanedHandler)
// }
val ds = KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder, String](
streamingContext,
kafkaParams,
getOffSet(), // fromOffsets: start from the offsets read above
(message: MessageAndMetadata[String, String]) => message.message() // messageHandler: keep only the record value
)
// Register the foreachRDD that commits the offsets (runs once per batch)
updateOffset(ds)
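// Output operations run in the order they are registered (by default), so the offset commit in
// updateOffset happens before the print below in every batch; committing before processing is
// what gives the at-most-once behavior noted in updateOffset.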
// Simple word count over the message values
val resDs: DStream[(String, Int)] = ds.flatMap(_.split("\\W+")).map((_, 1)).reduceByKey(_ + _)
resDs.print(100)
streamingContext.start()
streamingContext.awaitTermination()
}
}