package cn._51doit.spark.day13
import org.apache.kafka.clients.consumer.ConsumerRecord
import org.apache.kafka.common.serialization.StringDeserializer
import org.apache.spark.SparkConf
import org.apache.spark.streaming.dstream.{DStream, InputDStream}
import org.apache.spark.streaming.kafka010._
import org.apache.spark.streaming.{Seconds, StreamingContext}
object DStreamOffsetTest1 {
def main(args: Array[String]): Unit = {
// Create the StreamingContext for real-time computation
// (StreamingContext is an enhanced wrapper around SparkContext and holds a reference to it)
val conf = new SparkConf().setAppName("StreamingWordCount").setMaster("local[*]")
val ssc: StreamingContext = new StreamingContext(conf, Seconds(5))
ssc.sparkContext.setLogLevel("WARN")
// updateStateByKey (used below) requires a checkpoint directory to persist state;
// "./checkpoint" is only an example path, use a reliable location (e.g. HDFS) in production
ssc.checkpoint("./checkpoint")
// Kafka consumer parameters
val kafkaParams = Map[String, Object](
"bootstrap.servers" -> "node-1.51doit.com:9092,node-2.51doit.com:9092,node-3.51doit.com:9092",
"key.deserializer" -> classOf[StringDeserializer],
"value.deserializer" -> classOf[StringDeserializer],
"group.id" -> "g03",
"auto.offset.reset" -> "earliest",
"enable.auto.commit" -> (false: java.lang.Boolean) //让消费者不用自动提交偏移量
)
val topics = Array("wordcount")
// Integrate Spark Streaming with Kafka using the officially recommended direct approach,
// which consumes through Kafka's low-level consumer API and is more efficient
val kafkaDStream: InputDStream[ConsumerRecord[String, String]] = KafkaUtils.createDirectStream(
ssc, // the StreamingContext
LocationStrategies.PreferConsistent, // location strategy: distribute partitions evenly across executors
ConsumerStrategies.Subscribe[String, String](topics, kafkaParams) // consumer strategy: subscribe to the given topics
)
// The DirectKafkaInputDStream produces KafkaRDDs, and only the KafkaRDD carries offset information
//val lines = kafkaDStream.map(_.value())
//map would turn the KafkaRDD into a MapPartitionsRDD, which no longer carries offsets,
//so the offsets must be read from the original stream's RDDs, e.g.:
//kafkaDStream.foreachRDD(rdd => {
//  val offsetRanges = rdd.asInstanceOf[HasOffsetRanges].offsetRanges
//  for (elem <- offsetRanges) {
//    println(elem.untilOffset)
//  }
//})
// Declared on the Driver side so the offsets can be referenced later when committing
var offsetRanges: Array[OffsetRange] = null
// If you want to call DStream-specific methods such as updateStateByKey or window and still
// obtain the offsets, capture them inside transform before applying those operations
val transformedDStream: DStream[ConsumerRecord[String, String]] = kafkaDStream.transform(rdd => {
// Capture the offset ranges (this closure runs on the Driver)
offsetRanges = rdd.asInstanceOf[HasOffsetRanges].offsetRanges
rdd
})
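// Note: transform is evaluated once per batch, so offsetRanges above always holds the
// ranges of the batch that the foreachRDD below is about to process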
// Implement a real-time WordCount
// Transformations start
val words: DStream[String] = transformedDStream.map(_.value()).flatMap(_.split(" "))
val wordAndOne: DStream[(String, Int)] = words.map((_, 1))
//reduceByKey only aggregates within the current batch; it does not accumulate across historical batches
//val reduced: DStream[(String, Int)] = wordAndOne.reduceByKey(_ + _)
//To accumulate intermediate results (state) across batches, use updateStateByKey
//updateFunc: (Seq[V], Option[S]) => Option[S]
//first argument: the values of one key within the current batch
//second argument: the initial value or previously accumulated state for that key
//return value: the new state computed from the current batch data and the previous state
val reduced: DStream[(String, Int)] = wordAndOne.updateStateByKey((s: Seq[Int], o: Option[Int]) => {
Some(s.sum + o.getOrElse(0))
})
//Transformations end
reduced.foreachRDD(rdd => {
//Save the results to MySQL
rdd.foreachPartition(it => {
//create one connection per partition, write out the records in `it`, then close the connection
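//A minimal sketch, assuming a MySQL instance at localhost and a hypothetical table
//wordcount(word VARCHAR PRIMARY KEY, cnt INT); the URL, credentials and SQL are examples only:
//val conn = java.sql.DriverManager.getConnection(
//  "jdbc:mysql://localhost:3306/test", "user", "password")
//val ps = conn.prepareStatement(
//  "INSERT INTO wordcount (word, cnt) VALUES (?, ?) ON DUPLICATE KEY UPDATE cnt = ?")
//it.foreach { case (word, count) =>
//  ps.setString(1, word)
//  ps.setInt(2, count)
//  ps.setInt(3, count)
//  ps.executeUpdate()
//}
//ps.close()
//conn.close()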
})
//Commit the offsets asynchronously, always through the original Kafka DStream (not a transformed one)
kafkaDStream.asInstanceOf[CanCommitOffsets].commitAsync(offsetRanges)
})
//Start the streaming computation
ssc.start()
//Keep the application running; this blocks the Driver until the context is stopped
ssc.awaitTermination()
}
}