spark 精确一次消费(Exactly-once)kafka数据
精确一次消费(Exactly-once) 是指消息一定会被处理且只会被处理一次。不多不少就一次处理。同时解决了数据丢失和数据重复的问题
解决思路:手动维护偏移量offset(不丢失)+幂等处理(不重复)
实现步骤:
1.设置kafka偏移量提交为手动提交
"enable.auto.commit" -> (false: java.lang.Boolean)
2.将offset存入redis的hash结构中
//从Redis中获取偏移量
/**
 * Read the saved Kafka offsets for one topic / consumer group from Redis.
 *
 * Offsets live in a Redis hash under key "offset:&lt;topic&gt;:&lt;group&gt;",
 * field = partition id (string), value = offset (string).
 *
 * @param topicName Kafka topic name
 * @param groupId   Kafka consumer group id
 * @return map of TopicPartition -> offset; empty map when nothing has been saved yet
 */
def getOffset(topicName:String,groupId:String): Map[TopicPartition,Long] ={
//获取jedis连接 (acquire a Jedis connection from the pool)
val jedis = MyRedisUtil.getJedisClient()
try {
val offsetKey = "offset:"+topicName+":"+groupId
val offsetMap = jedis.hgetAll(offsetKey)
import scala.collection.JavaConverters._
// hgetAll returns an empty java Map (not null) for a missing key,
// so this safely yields an empty Scala Map on first run
offsetMap.asScala.map{
case (partitionId,offset)=>{
(new TopicPartition(topicName,partitionId.toInt),offset.toLong)
}
}.toMap
} finally {
// close in finally so the connection is returned even when
// hgetAll or toInt/toLong parsing throws — the original leaked it
jedis.close()
}
}
//向Redis中保存偏移量
/**
 * Persist the until-offsets of a processed batch to Redis so the next start
 * can resume exactly where this batch ended (manual offset management).
 *
 * Stored as a Redis hash: key "offset:&lt;topic&gt;:&lt;group&gt;",
 * field = partition id, value = untilOffset (the next offset to consume).
 *
 * @param topicName    Kafka topic name
 * @param groupId      Kafka consumer group id
 * @param offsetRanges per-partition offset ranges of the batch just processed
 */
def saveOffset(topicName: String, groupId: String, offsetRanges: Array[OffsetRange]): Unit = {
// Jedis hmset throws on an empty map, so skip empty batches entirely
if (offsetRanges != null && offsetRanges.nonEmpty) {
// HashMap[ partition , offset ]
val offsetMap = new util.HashMap[String,String]()
for(offsetRange <- offsetRanges){
val partitionId = offsetRange.partition
// untilOffset is exclusive: it is the first offset NOT yet consumed,
// which is exactly what the next run should start from
val untilOffset = offsetRange.untilOffset
offsetMap.put(partitionId.toString,untilOffset.toString)
}
val offsetKey = "offset:"+topicName+":"+groupId
//获取jedis连接 (acquire a Jedis connection from the pool)
val jedis = MyRedisUtil.getJedisClient()
try {
//写入redis (write all partition offsets in one round trip)
jedis.hmset(offsetKey,offsetMap)
} finally {
// close in finally so the connection is returned even when hmset throws
jedis.close()
}
}
}
3.将处理结果写入ES(Elasticsearch)。注意:ES的幂等性依赖于写入时显式指定文档id(如使用数据的唯一业务键作为_id),同一id重复写入会覆盖而非新增,从而保证重复处理不会产生重复数据;若使用ES自动生成id则不具备幂等性