1. Maven dependencies
<!-- The Spark core/sql/streaming artifacts are marked provided: the cluster supplies
     them at runtime. Their <version> tags are omitted here, presumably managed by a
     parent POM; they should match the 2.1.1 connector below. -->
<dependency>
    <groupId>org.apache.spark</groupId>
    <artifactId>spark-core_2.11</artifactId>
    <scope>provided</scope>
</dependency>
<dependency>
    <groupId>org.apache.spark</groupId>
    <artifactId>spark-sql_2.11</artifactId>
    <scope>provided</scope>
</dependency>
<dependency>
    <groupId>org.apache.spark</groupId>
    <artifactId>spark-streaming_2.11</artifactId>
    <scope>provided</scope>
</dependency>
<dependency>
    <groupId>org.apache.spark</groupId>
    <artifactId>spark-streaming-kafka-0-8_2.11</artifactId>
    <version>2.1.1</version>
</dependency>
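Note that only the kafka-0-8 connector lacks provided scope, so it has to ship with the application jar. A minimal sketch of one way to do that with the maven-shade-plugin (the plugin version is an assumption; passing --packages at submit time, shown at the end, works too):
<build>
    <plugins>
        <!-- Sketch: bundles the non-provided Kafka connector into the job jar -->
        <plugin>
            <groupId>org.apache.maven.plugins</groupId>
            <artifactId>maven-shade-plugin</artifactId>
            <version>3.1.0</version>
            <executions>
                <execution>
                    <phase>package</phase>
                    <goals>
                        <goal>shade</goal>
                    </goals>
                </execution>
            </executions>
        </plugin>
    </plugins>
</build>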
2. Straight to the code
import kafka.common.TopicAndPartition
import kafka.message.MessageAndMetadata
import kafka.serializer.StringDecoder
import org.apache.kafka.clients.consumer.ConsumerConfig
import org.apache.log4j.{Level, Logger}
import org.apache.spark.SparkConf
import org.apache.spark.rdd.RDD
import org.apache.spark.streaming.dstream.InputDStream
import org.apache.spark.streaming.kafka.KafkaCluster.Err
import org.apache.spark.streaming.kafka.{HasOffsetRanges, KafkaCluster, KafkaUtils, OffsetRange}
import org.apache.spark.streaming.{Seconds, StreamingContext}

import scala.collection.immutable.HashMap

object BIStreamingZK {
  def main(args: Array[String]): Unit = {
    // Quiet Spark's verbose INFO logging
    Logger.getLogger("org").setLevel(Level.WARN)
    val sparkConf = new SparkConf().setAppName("bi_stream_analyse_zk")
    // Enable backpressure so the ingestion rate adapts to processing speed
    sparkConf.set("spark.streaming.backpressure.enabled", "true")
    // Per-batch cap = partitions * batch seconds * rate: 3 * 5 * 100 = 1500 records
    sparkConf.set("spark.streaming.kafka.maxRatePerPartition", "100")
    val ssc = new StreamingContext(sparkConf, Seconds(5))
    // Topics to consume
    val topics = Set("topic1", "topic2")
    // Consumer group id
    val groupId = "bi_stream_analyse_bill_state_zk"
    val kfkParams = Map[String, String](
      "zookeeper.connect" -> "server1:21810,server2:21810,server3:21810",
      ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG -> "kafkaserver1:9092,kafkaserver2:9092,kafkaserver3:9092",
      // The 0.8 consumer expects "largest"/"smallest", not "latest"/"earliest"
      ConsumerConfig.AUTO_OFFSET_RESET_CONFIG -> "largest",
      ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG -> "org.apache.kafka.common.serialization.StringDeserializer",
      ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG -> "org.apache.kafka.common.serialization.StringDeserializer",
      ConsumerConfig.GROUP_ID_CONFIG -> groupId
    )
    // KafkaCluster maintains the consumer offsets (stored in ZooKeeper)
    val kafkaCluster = new KafkaCluster(kfkParams)
    // Recover the stored offsets (0 for a group that has never consumed)
    val fromOffset: Map[TopicAndPartition, Long] = getOffset(kafkaCluster, topics, groupId)
    // Create the direct DStream, starting from the recovered offsets
    val kafkaDStream: InputDStream[String] = KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder, String](
      ssc,
      kfkParams,
      fromOffset,
      (message: MessageAndMetadata[String, String]) => message.message())
    // Business logic
    kafkaDStream.foreachRDD(rdd => {
      if (!rdd.isEmpty()) {
        dealRdd(rdd)
      }
    })
    // Persist the offsets back after each batch
    setOffset(kafkaCluster, kafkaDStream, groupId)
    ssc.start()
    ssc.awaitTermination()
  }
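  // Hypothetical stub, not in the original post: the business logic behind
  // dealRdd is elided there, so this placeholder only keeps the example compilable.
  def dealRdd(rdd: RDD[String]): Unit = {
    rdd.foreachPartition(partition => partition.foreach(println))
  }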
  def getOffset(kafkaCluster: KafkaCluster, topics: Set[String], groupId: String): Map[TopicAndPartition, Long] = {
    var partitionToLong = new HashMap[TopicAndPartition, Long]
    // Look up every partition of the requested topics
    val topicAndPartitionsEither: Either[Err, Set[TopicAndPartition]] = kafkaCluster.getPartitions(topics)
    // Right means the topics exist
    if (topicAndPartitionsEither.isRight) {
      val topicAndPartitions: Set[TopicAndPartition] = topicAndPartitionsEither.right.get
      val topicAndPartitionToLongEither: Either[Err, Map[TopicAndPartition, Long]] = kafkaCluster.getConsumerOffsets(groupId, topicAndPartitions)
      if (topicAndPartitionToLongEither.isLeft) {
        // This group has never consumed: start every partition at 0. If Kafka's
        // earliest retained offset is already greater than 0 this throws
        // OffsetOutOfRange; see the sketch after this method.
        for (topicAndPartition <- topicAndPartitions) {
          partitionToLong += (topicAndPartition -> 0L)
        }
      } else {
        // Offsets exist, but if they have aged out of Kafka's retention window
        // (7 days by default) the same OffsetOutOfRange exception is thrown
        val value: Map[TopicAndPartition, Long] = topicAndPartitionToLongEither.right.get
        partitionToLong ++= value
      }
    }
    partitionToLong
  }
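  // --- Hedged sketch, not in the original post ---
  // The comments above mention handling OffsetOutOfRange later without showing it.
  // One way, using KafkaCluster.getEarliestLeaderOffsets, is to clamp every saved
  // offset to the earliest offset Kafka still retains before creating the stream:
  def clampOffsets(kafkaCluster: KafkaCluster, fromOffset: Map[TopicAndPartition, Long]): Map[TopicAndPartition, Long] = {
    val earliestEither = kafkaCluster.getEarliestLeaderOffsets(fromOffset.keySet)
    if (earliestEither.isRight) {
      val earliest = earliestEither.right.get
      fromOffset.map { case (tp, offset) =>
        // A saved offset below the earliest retained one has aged out
        val floor = earliest.get(tp).map(_.offset).getOrElse(offset)
        tp -> math.max(offset, floor)
      }
    } else {
      fromOffset
    }
  }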
  def setOffset(kafkaCluster: KafkaCluster, kafkaDstream: InputDStream[String], groupId: String): Unit = {
    kafkaDstream.foreachRDD(rdd => {
      var partitionToLong = new HashMap[TopicAndPartition, Long]
      // RDDs from the direct stream carry the offset range of each partition
      val offsetRanges: HasOffsetRanges = rdd.asInstanceOf[HasOffsetRanges]
      val ranges: Array[OffsetRange] = offsetRanges.offsetRanges
      for (range <- ranges) {
        partitionToLong += (range.topicAndPartition() -> range.untilOffset)
      }
      // Commit the end offset of each range back to ZooKeeper
      kafkaCluster.setConsumerOffsets(groupId, partitionToLong)
    })
  }
}
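To submit the job: the provided Spark artifacts come from the cluster, and the connector can be pulled in with --packages if it was not shaded into the jar. A sketch of the command, where the master and jar name are assumptions:
spark-submit \
  --master yarn \
  --class BIStreamingZK \
  --packages org.apache.spark:spark-streaming-kafka-0-8_2.11:2.1.1 \
  bi-streaming.jar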