Kafka + Spark Streaming: getting the offset range of each partition

The listing below uses the receiver-less direct Kafka stream and prints, for every micro-batch, the topic, partition, and from/until offsets consumed by each RDD partition.

package com.kafka.wordcount

import kafka.serializer.StringDecoder
import org.apache.spark.{SparkConf, TaskContext}
import org.apache.spark.streaming.kafka.{HasOffsetRanges, KafkaUtils, OffsetRange}
import org.apache.spark.streaming.{Seconds, StreamingContext}

object KafkaWordCount {

  def main(args: Array[String]): Unit = {
    // project helper that lowers Spark's streaming log level
    LoggerLevels.setStreamingLogLevels()

    val sparkConf = new SparkConf().setAppName("KafkaWordCount").setMaster("local[*]")
    val ssc = new StreamingContext(sparkConf, Seconds(5))

    // Kafka connection parameters
    val kafkaParams = Map[String, String](
      "bootstrap.servers" -> "m01:9092,m02:9092,m03:9092"
    )
    val topicsSet = "topic_test1".split(",").toSet

    // Create the direct (receiver-less) stream: each RDD partition maps 1:1 to a Kafka partition
    val data = KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder](
      ssc, kafkaParams, topicsSet)

    // The cast to HasOffsetRanges only succeeds on RDDs produced directly by the
    // direct stream, so capture the offset ranges in transform() before map()
    var offsetRanges = Array.empty[OffsetRange]
    val data2 = data.transform { rdd =>
      offsetRanges = rdd.asInstanceOf[HasOffsetRanges].offsetRanges
      rdd
    }.map { case (key, value) => key + value }

    data2.foreachRDD { rdd =>
      rdd.foreachPartition { _ =>
        // map() is a narrow transformation, so the task's partition id still
        // indexes the matching Kafka partition's offset range
        val o: OffsetRange = offsetRanges(TaskContext.get.partitionId)
        println(s"${o.topic} ${o.partition} ${o.fromOffset} ${o.untilOffset}")
      }
    }

    ssc.start()
    ssc.awaitTermination()
  }
}
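If the offset ranges are only needed on the driver (for example, to log them or hand them to an external store), they can also be read without foreachPartition and TaskContext: cast the direct stream's RDD to HasOffsetRanges inside foreachRDD itself, before any transformation. A minimal sketch, assuming the same ssc, kafkaParams, topicsSet, and imports as in the listing above:

    // Sketch only: reuses ssc, kafkaParams and topicsSet from the listing above
    val stream = KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder](
      ssc, kafkaParams, topicsSet)

    stream.foreachRDD { rdd =>
      // The RDD handed to foreachRDD comes straight from the direct stream,
      // so the cast to HasOffsetRanges is safe here
      val offsetRanges = rdd.asInstanceOf[HasOffsetRanges].offsetRanges
      offsetRanges.foreach { o =>
        // one OffsetRange per Kafka partition of the batch
        println(s"${o.topic} ${o.partition} ${o.fromOffset} ${o.untilOffset}")
      }
    }

Either way, the direct approach does not record consumed offsets in ZooKeeper, so capturing fromOffset/untilOffset like this is the starting point for tracking consumption progress yourself instead of relying only on checkpoints.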