import java.lang
import kafka.utils.{ZKGroupTopicDirs, ZkUtils}
import org.I0Itec.zkclient.ZkClient
import org.apache.kafka.common.TopicPartition
import org.apache.kafka.common.serialization.StringDeserializer
import org.apache.spark.SparkConf
import org.apache.spark.streaming.kafka010.{ConsumerStrategies, HasOffsetRanges, KafkaUtils, LocationStrategies}
import org.apache.spark.streaming.{Seconds, StreamingContext, kafka010}
import scala.collection.mutable._
// Manage Kafka consumer offsets in ZooKeeper (ZK管理偏移量)
/**
 * Kafka 0.10 direct stream with consumer offsets managed manually in ZooKeeper:
 * on startup, resume from the per-partition offsets stored under the group's
 * consumer-offset directory; after each batch, write the new offsets back.
 */
object SSCDirectKafka010_ZK_Offset {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setMaster("local[*]").setAppName("SSCDirectKafka010_ZK_Offset")
    // Max records pulled per partition per second; the effective batch size is
    // rate * partitionCount * batchIntervalSeconds, not a flat 2 records.
    conf.set("spark.streaming.kafka.maxRatePerPartition", "2")
    // Finish in-flight batches on shutdown so data is not lost when stopping.
    conf.set("spark.streaming.stopGracefullyOnShutdown", "true")
    val ssc = new StreamingContext(conf, Seconds(2))

    // Consumer group id; also determines the ZK offset directory below.
    val groupId = "day11_08"
    val kafkaParams = Map[String, Object](
      "bootstrap.servers" -> "hadoop01:9092,hadoop02:9092,hadoop03:9092",
      "key.deserializer" -> classOf[StringDeserializer],
      "value.deserializer" -> classOf[StringDeserializer],
      "group.id" -> groupId,
      // Only consulted when no stored offset exists: start from the beginning.
      "auto.offset.reset" -> "earliest",
      // Offsets are committed manually to ZK, so Kafka auto-commit must be off.
      "enable.auto.commit" -> (false: lang.Boolean)
    )
    val topic = "helloTopic"
    val topics = Array(topic)

    val zkTopicDirs: ZKGroupTopicDirs = new ZKGroupTopicDirs(groupId, topic)
    // ZK directory holding one child znode per partition, named by partition id.
    val offsetDir: String = zkTopicDirs.consumerOffsetDir
    // NOTE(review): "hadoop01" appears twice and port 2183 is unusual — this
    // likely should be "hadoop03:2181"; confirm against the cluster config.
    val zkClient = new ZkClient("hadoop01:2181,hadoop02:2181,hadoop01:2183")

    // One child znode per partition => child count == partitions with saved offsets.
    val childrenCount = zkClient.countChildren(offsetDir)

    // Resume from the saved offsets when any exist; otherwise subscribe fresh
    // and let auto.offset.reset decide the starting position.
    val stream = if (childrenCount > 0) {
      // Build the offset map immutably; was a mutable Map mutated inside a
      // foreach written with accidental named-argument syntax `foreach(f = ...)`.
      val savedOffsets: collection.Map[TopicPartition, Long] =
        (0 until childrenCount).map { partition =>
          val offset = zkClient.readData[String](s"$offsetDir/$partition")
          new TopicPartition(topic, partition) -> offset.toLong
        }.toMap
      KafkaUtils.createDirectStream[String, String](
        ssc,
        LocationStrategies.PreferConsistent,
        ConsumerStrategies.Subscribe[String, String](topics, kafkaParams, savedOffsets)
      )
    } else {
      KafkaUtils.createDirectStream[String, String](
        ssc,
        LocationStrategies.PreferConsistent,
        ConsumerStrategies.Subscribe[String, String](topics, kafkaParams)
      )
    }

    stream.foreachRDD { rdd =>
      // Direct-stream RDDs carry their offset ranges; this cast is the
      // documented way to retrieve them.
      val ranges = rdd.asInstanceOf[HasOffsetRanges].offsetRanges
      for (range <- ranges) {
        // Was printing the topic twice (i.topic ... + topic); log the range instead.
        println(s"${range.topic}-${range.partition}-${range.fromOffset}-${range.untilOffset}")
        // BUGFIX: was updateEphemeralPath — ephemeral znodes are deleted when the
        // ZK session ends, so the saved offsets would vanish on restart. Persistent
        // znodes survive restarts, which is the whole point of storing offsets here.
        ZkUtils(zkClient, false).updatePersistentPath(s"$offsetDir/${range.partition}", range.untilOffset.toString)
      }
    }

    ssc.start()
    ssc.awaitTermination()
  }
}
// ZK管理kafka偏移量 (ZK-managed Kafka offsets)
// Blog footer residue, not code — commented out so the file compiles:
// 最新推荐文章于 2024-05-18 00:45:00 发布