import kafka.common.TopicAndPartition
import kafka.message.MessageAndMetadata
import kafka.serializer.Decoder
import org.apache.spark.SparkException
import org.apache.spark.rdd.RDD
import org.apache.spark.streaming.StreamingContext
import org.apache.spark.streaming.dstream.InputDStream
import org.apache.spark.streaming.kafka.KafkaCluster.LeaderOffset
import org.apache.spark.streaming.kafka.{HasOffsetRanges, KafkaCluster, KafkaUtils}
import scala.reflect.ClassTag
class MyKafkaClient(val kafkaParams: Map[String, String]) extends Serializable {
  private val kc = new KafkaCluster(kafkaParams)

  def createDirectStream[K: ClassTag, V: ClassTag, KD <: Decoder[K]: ClassTag, VD <: Decoder[V]: ClassTag](
      ssc: StreamingContext, topic: String): InputDStream[(K, V)] = {
    val partitionsEither = kc.getPartitions(Set(topic))
    if (partitionsEither.isLeft) {
      throw new SparkException(s"get kafka partition failed: ${partitionsEither.left.get}")
    }
    val partitions = partitionsEither.right.get
    val groupId = kafkaParams("group.id")
    val offsets = setOrUpdateOffsets(partitions, groupId)
    KafkaUtils.createDirectStream[K, V, KD, VD, (K, V)](
      ssc, kafkaParams, offsets, (mmd: MessageAndMetadata[K, V]) => (mmd.key, mmd.message))
  }
  /**
   * If the streaming job throws kafka.common.OffsetOutOfRangeException, the offsets saved in
   * ZooKeeper are stale: Kafka's log retention has already deleted the segments that contained
   * them. To handle this, compare the consumer offsets in ZooKeeper with the earliest leader
   * offsets; whenever a consumer offset is smaller than the earliest leader offset it is out of
   * date, so update it to the earliest leader offset.
   */
  private def setOrUpdateOffsets(partitions: Set[TopicAndPartition], groupId: String): Map[TopicAndPartition, Long] = {
    val consumerOffsetsEither = kc.getConsumerOffsets(groupId, partitions)
    if (consumerOffsetsEither.isLeft) {
      // No offsets stored for this group yet: initialize them according to auto.offset.reset.
      val reset = kafkaParams.get("auto.offset.reset").map(_.toLowerCase)
      val leaderOffsets: Map[TopicAndPartition, LeaderOffset] =
        if (reset == Some("smallest")) {
          val leaderOffsetsEither = kc.getEarliestLeaderOffsets(partitions)
          if (leaderOffsetsEither.isLeft) {
            throw new SparkException(s"get earliest leader offsets failed: ${leaderOffsetsEither.left.get}")
          }
          leaderOffsetsEither.right.get
        } else {
          val leaderOffsetsEither = kc.getLatestLeaderOffsets(partitions)
          if (leaderOffsetsEither.isLeft) {
            throw new SparkException(s"get latest leader offsets failed: ${leaderOffsetsEither.left.get}")
          }
          leaderOffsetsEither.right.get
        }
      val offsets = leaderOffsets.map { case (tp, offset) => (tp, offset.offset) }
      kc.setConsumerOffsets(groupId, offsets)
      offsets
    } else {
      // Offsets already exist: make sure none of them has fallen behind the earliest leader
      // offset, otherwise the direct stream would fail with OffsetOutOfRangeException.
      val earliestLeaderOffsetsEither = kc.getEarliestLeaderOffsets(partitions)
      if (earliestLeaderOffsetsEither.isLeft) {
        throw new SparkException(s"get earliest leader offsets failed: ${earliestLeaderOffsetsEither.left.get}")
      }
      val earliestLeaderOffsets = earliestLeaderOffsetsEither.right.get
      val consumerOffsets = consumerOffsetsEither.right.get
      val offsets = consumerOffsets.map { case (tp, n) =>
        val earliestLeaderOffset = earliestLeaderOffsets(tp).offset
        if (n < earliestLeaderOffset) tp -> earliestLeaderOffset else tp -> n
      }
      if (offsets.nonEmpty) {
        kc.setConsumerOffsets(groupId, offsets)
      }
      offsets
    }
  }
  /** Commit the offset ranges of a processed batch for this consumer group. */
  def updateOffsets(rdd: RDD[(String, String)]): Unit = {
    val groupId = kafkaParams("group.id")
    val offsetsList = rdd.asInstanceOf[HasOffsetRanges].offsetRanges
    for (offsets <- offsetsList) {
      val topicAndPartition = TopicAndPartition(offsets.topic, offsets.partition)
      val o = kc.setConsumerOffsets(groupId, Map((topicAndPartition, offsets.untilOffset)))
      if (o.isLeft) {
        println(s"Error updating the offset to Kafka cluster: ${o.left.get}")
      }
    }
  }
}
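
// ---------------------------------------------------------------------------
// A minimal usage sketch, not part of the original client: the broker list,
// topic name ("user_events"), group id and batch interval are hypothetical
// placeholders. It wires MyKafkaClient into a StreamingContext and commits
// each batch's offsets only after the batch has been processed.
// ---------------------------------------------------------------------------
import kafka.serializer.StringDecoder
import org.apache.spark.SparkConf
import org.apache.spark.streaming.Seconds

object MyKafkaClientDemo {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("MyKafkaClientDemo")
    val ssc = new StreamingContext(conf, Seconds(5))
    val kafkaParams = Map(
      "metadata.broker.list" -> "localhost:9092",
      "group.id" -> "test-group",
      "auto.offset.reset" -> "smallest")
    val client = new MyKafkaClient(kafkaParams)
    // String keys/values via StringDecoder match the RDD[(String, String)] expected by updateOffsets.
    val stream = client.createDirectStream[String, String, StringDecoder, StringDecoder](ssc, "user_events")
    stream.foreachRDD { rdd =>
      // Process the batch first, then commit its offsets: if the job fails before the
      // commit, the batch is re-read on restart (at-least-once semantics).
      println(s"processed ${rdd.count()} records")
      client.updateOffsets(rdd)
    }
    ssc.start()
    ssc.awaitTermination()
  }
}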