Spark Streaming: a simple implementation of Kafka offset saving

import kafka.common.TopicAndPartition
import kafka.message.MessageAndMetadata
import kafka.serializer.StringDecoder
import org.apache.spark.streaming.dstream.InputDStream
import org.apache.spark.streaming.kafka.KafkaCluster.Err
import org.apache.spark.streaming.kafka.{HasOffsetRanges, KafkaCluster, KafkaUtils, OffsetRange}
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.streaming.{Seconds, StreamingContext}
object WordCount3 {
  val topics = Set("wordcount")
  val groupId = "wc"
  private val params: Map[String, String] = Map[String, String](
    "bootstrap.servers" -> "node-01:9092,node-02:9092,node-03:9092",
    "group.id" -> groupId
  )
  // KafkaCluster can both read and save consumer offsets
  private val cluster = new KafkaCluster(params)
  def main(args: Array[String]): Unit = {
    val conf: SparkConf = new SparkConf().setMaster("local[*]").setAppName("WordCount3")
    val sc = new SparkContext(conf)
    sc.setLogLevel("ERROR")
    val ssc = new StreamingContext(sc, Seconds(3))
    val sourceStream: InputDStream[String] = KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder, String](
      ssc,
      params,
      readOffsets,
      (handler: MessageAndMetadata[String, String]) => handler.message()
    )
    // classic word count; print up to 1000 rows per batch
    sourceStream.flatMap(_.split("\\W+")).map((_, 1)).reduceByKey(_ + _).print(1000)
    // save this batch's offsets back to Kafka
    saveOffsets(sourceStream)
    ssc.start()
    ssc.awaitTermination()
  }
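  // Semantics note (my addition, not in the original post): print() and the
  // commit inside saveOffsets are two separate output operations that run one
  // after the other in each batch. If the job dies after results are printed
  // but before setConsumerOffsets succeeds, the batch is consumed again on
  // restart -- this gives at-least-once, not exactly-once, delivery.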
  /**
   * Read the stored consumer offsets from Kafka.
   * Note: a stored offset may no longer exist after log retention kicks in;
   * see the clamping sketch after the object for that edge case.
   *
   * @return the starting offset for every partition of `topics`
   */
  def readOffsets: Map[TopicAndPartition, Long] = {
    var resultMap = Map[TopicAndPartition, Long]()
    // 1. Get all partitions of these topics
    val topicAndPartitionSetEither: Either[Err, Set[TopicAndPartition]] = cluster.getPartitions(topics)
    topicAndPartitionSetEither match {
      // 2. The topic/partition metadata was found
      case Right(topicAndPartitionSet) =>
        // 3. Look up the stored offset for each partition
        val topicAndPartitionToLongEither: Either[Err, Map[TopicAndPartition, Long]] = cluster.getConsumerOffsets(groupId, topicAndPartitionSet)
        topicAndPartitionToLongEither match {
          // An offset has already been stored for every partition of every topic
          case Right(topicAndPartitionMap) =>
            resultMap ++= topicAndPartitionMap
          // These partitions are being consumed for the first time: start from 0
          case _ =>
            topicAndPartitionSet.foreach(topicAndPartition => {
              resultMap += (topicAndPartition -> 0L)
            })
        }
      case _ => // none of the topics exist
    }
    resultMap
  }
  /**
   * Save the offsets consumed by each batch back to Kafka.
   *
   * @param sourceStream the direct stream whose offsets are committed
   */
  def saveOffsets(sourceStream: InputDStream[String]) = {
    // the function passed to foreachRDD runs once per batch
    sourceStream.foreachRDD(rdd => {
      var map: Map[TopicAndPartition, Long] = Map[TopicAndPartition, Long]()
      // only the RDDs produced by a direct stream carry offset-range information
      val offsetRanges: HasOffsetRanges = rdd.asInstanceOf[HasOffsetRanges]
      val ranges: Array[OffsetRange] = offsetRanges.offsetRanges
      ranges.foreach(offsetRange => {
        // untilOffset is where the next batch should start reading
        val offset: Long = offsetRange.untilOffset
        map += (offsetRange.topicAndPartition() -> offset)
      })
      cluster.setConsumerOffsets(groupId, map)
    })
  }
}
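One gap in readOffsets worth noting: once Kafka's log retention deletes old segments, a stored offset can point below the earliest offset the brokers still hold, and the job then fails with an offset-out-of-range error on restart. Below is a minimal guard, sketched as if it were another method of WordCount3; the helper name clampToEarliest is my own, while getEarliestLeaderOffsets is an existing KafkaCluster call.

  // Sketch (not part of the original post): clamp each stored offset to the
  // earliest offset the brokers still retain.
  def clampToEarliest(offsets: Map[TopicAndPartition, Long]): Map[TopicAndPartition, Long] =
    cluster.getEarliestLeaderOffsets(offsets.keySet) match {
      case Right(earliest) =>
        offsets.map { case (tp, stored) =>
          // fall back to the earliest retained offset if the stored one was deleted
          tp -> math.max(stored, earliest(tp).offset)
        }
      case _ => offsets // leader lookup failed; keep the stored offsets as-is
    }

With this in place, readOffsets could simply return clampToEarliest(resultMap) instead of resultMap.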
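For completeness: the imports above come from the Kafka 0.8 integration artifact, and KafkaCluster has only been public since Spark 2.0, so a 2.x version is required for this code to compile. A build.sbt sketch follows; the version numbers are assumptions to align with your own Spark and Scala installation.

// build.sbt -- versions are assumptions, match them to your cluster
scalaVersion := "2.11.12"

libraryDependencies ++= Seq(
  "org.apache.spark" %% "spark-streaming" % "2.4.0" % "provided",
  // provides KafkaUtils, KafkaCluster, HasOffsetRanges, OffsetRange,
  // and pulls in kafka.serializer.StringDecoder transitively
  "org.apache.spark" %% "spark-streaming-kafka-0-8" % "2.4.0"
)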