Maintaining Kafka offsets with Spark createDirectStream (Scala)

Reposted from: https://www.cnblogs.com/zhangtianyuan/p/8483082.html

With the createDirectStream approach you have to maintain the Kafka offsets yourself, so that the program can resume consuming from where it left off after an interruption.
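The overall pattern is: load the previously saved offsets, start the direct stream from them, and persist the new offsets after every batch. Below is a minimal schematic sketch of that loop against the Spark 1.x kafka-0.8 API; loadOffsets/saveOffsets are hypothetical placeholders for the ZooKeeper reads and writes that the KafkaManager class further down actually implements.

import kafka.common.TopicAndPartition
import kafka.message.MessageAndMetadata
import kafka.serializer.StringDecoder
import org.apache.spark.streaming.StreamingContext
import org.apache.spark.streaming.kafka.{HasOffsetRanges, KafkaUtils, OffsetRange}

object OffsetPattern {
  // Hypothetical placeholders for whatever store holds the offsets (ZooKeeper in this post).
  def loadOffsets(): Map[TopicAndPartition, Long] = Map.empty
  def saveOffsets(ranges: Array[OffsetRange]): Unit = ()

  def consume(ssc: StreamingContext, kafkaParams: Map[String, String]): Unit = {
    // 1. Resume the direct stream from the previously saved offsets.
    val stream = KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder, (String, String)](
      ssc, kafkaParams, loadOffsets(),
      (mmd: MessageAndMetadata[String, String]) => (mmd.key, mmd.message))

    stream.foreachRDD { rdd =>
      // 2. Process the batch first ...
      // 3. ... then persist the offsets this batch reached.
      saveOffsets(rdd.asInstanceOf[HasOffsetRanges].offsetRanges)
    }
  }
}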

KafkaManager.scala

import kafka.common.TopicAndPartition
import kafka.message.MessageAndMetadata
import kafka.serializer.Decoder
import org.apache.spark.SparkException
import org.apache.spark.rdd.RDD
import org.apache.spark.streaming.StreamingContext
import org.apache.spark.streaming.dstream.InputDStream
// Note: KafkaCluster is private[spark] in Spark 1.x, so this class typically lives in the
// org.apache.spark.streaming.kafka package (or uses a copied KafkaCluster).
import org.apache.spark.streaming.kafka.{HasOffsetRanges, KafkaCluster, KafkaUtils}
import org.apache.spark.streaming.kafka.KafkaCluster.LeaderOffset

import scala.reflect.ClassTag
 
/**
  * Created by knowpigxia on 15-8-5.
  */
class KafkaManager(val kafkaParams: Map[String, String]) extends Serializable {

  private val kc = new KafkaCluster(kafkaParams)

  /**
    * Create the direct stream.
    * @param ssc
    * @param kafkaParams
    * @param topics
    * @tparam K
    * @tparam V
    * @tparam KD
    * @tparam VD
    * @return
    */
  def createDirectStream[K: ClassTag, V: ClassTag, KD <: Decoder[K]: ClassTag, VD <: Decoder[V]: ClassTag](
      ssc: StreamingContext,
      kafkaParams: Map[String, String],
      topics: Set[String]): InputDStream[(K, V)] = {
    val groupId = kafkaParams.get("group.id").get
    // Before reading offsets from ZooKeeper, bring them up to date with the actual state of the cluster.
    setOrUpdateOffsets(topics, groupId)

    // Read the offsets from ZooKeeper and start consuming messages from there.
    val messages = {
      val partitionsE = kc.getPartitions(topics)
      if (partitionsE.isLeft)
        throw new SparkException(s"get kafka partition failed: ${partitionsE.left.get}")
      val partitions = partitionsE.right.get
      val consumerOffsetsE = kc.getConsumerOffsets(groupId, partitions)
      if (consumerOffsetsE.isLeft)
        throw new SparkException(s"get kafka consumer offsets failed: ${consumerOffsetsE.left.get}")
      val consumerOffsets = consumerOffsetsE.right.get
      KafkaUtils.createDirectStream[K, V, KD, VD, (K, V)](
        ssc, kafkaParams, consumerOffsets, (mmd: MessageAndMetadata[K, V]) => (mmd.key, mmd.message))
    }
    messages
  }

  /**
    * Before creating the stream, update the consumer offsets according to what has actually been consumed.
    * @param topics
    * @param groupId
    */
  private def setOrUpdateOffsets(topics: Set[String], groupId: String): Unit = {
    topics.foreach(topic => {
      var hasConsumed = true
      val partitionsE = kc.getPartitions(Set(topic))
      if (partitionsE.isLeft)
        throw new SparkException(s"get kafka partition failed: ${partitionsE.left.get}")
      val partitions = partitionsE.right.get
      val consumerOffsetsE = kc.getConsumerOffsets(groupId, partitions)
      if (consumerOffsetsE.isLeft) hasConsumed = false
      if (hasConsumed) { // the group has consumed this topic before
        /**
          * If the streaming job throws kafka.common.OffsetOutOfRangeException at runtime,
          * the offsets stored in ZooKeeper are stale: Kafka's retention policy has already
          * deleted the log segments that contained them. To handle this, compare the consumer
          * offsets in ZooKeeper with earliestLeaderOffsets; if a consumer offset is smaller
          * than the earliest leader offset, it is out of date, so reset it to earliestLeaderOffsets.
          */
        val earliestLeaderOffsetsE = kc.getEarliestLeaderOffsets(partitions)
        if (earliestLeaderOffsetsE.isLeft)
          throw new SparkException(s"get earliest leader offsets failed: ${earliestLeaderOffsetsE.left.get}")
        val earliestLeaderOffsets = earliestLeaderOffsetsE.right.get
        val consumerOffsets = consumerOffsetsE.right.get

        // Possibly only some partitions have stale consumer offsets, so only those partitions are reset to earliestLeaderOffsets.
        var offsets: Map[TopicAndPartition, Long] = Map()
        consumerOffsets.foreach({ case (tp, n) =>
          val earliestLeaderOffset = earliestLeaderOffsets(tp).offset
          if (n < earliestLeaderOffset) {
            println("consumer group:" + groupId + ",topic:" + tp.topic + ",partition:" + tp.partition +
              " offsets are stale, updating to " + earliestLeaderOffset)
            offsets += (tp -> earliestLeaderOffset)
          }
        })
        if (!offsets.isEmpty) {
          kc.setConsumerOffsets(groupId, offsets)
        }
      } else { // the group has never consumed this topic
        val reset = kafkaParams.get("auto.offset.reset").map(_.toLowerCase)
        var leaderOffsets: Map[TopicAndPartition, LeaderOffset] = null
        if (reset == Some("smallest")) {
          val leaderOffsetsE = kc.getEarliestLeaderOffsets(partitions)
          if (leaderOffsetsE.isLeft)
            throw new SparkException(s"get earliest leader offsets failed: ${leaderOffsetsE.left.get}")
          leaderOffsets = leaderOffsetsE.right.get
        } else {
          val leaderOffsetsE = kc.getLatestLeaderOffsets(partitions)
          if (leaderOffsetsE.isLeft)
            throw new SparkException(s"get latest leader offsets failed: ${leaderOffsetsE.left.get}")
          leaderOffsets = leaderOffsetsE.right.get
        }
        val offsets = leaderOffsets.map {
          case (tp, offset) => (tp, offset.offset)
        }
        kc.setConsumerOffsets(groupId, offsets)
      }
    })
  }

  /**
    * Update the consumer offsets stored in ZooKeeper.
    * @param rdd
    */
  def updateZKOffsets(rdd: RDD[(String, String)]): Unit = {
    val groupId = kafkaParams.get("group.id").get
    val offsetsList = rdd.asInstanceOf[HasOffsetRanges].offsetRanges

    for (offsets <- offsetsList) {
      val topicAndPartition = TopicAndPartition(offsets.topic, offsets.partition)
      val o = kc.setConsumerOffsets(groupId, Map((topicAndPartition, offsets.untilOffset)))
      if (o.isLeft) {
        println(s"Error updating the offset to Kafka cluster: ${o.left.get}")
      }
    }
  }
}

In the main program:

def initKafkaParams = {
  Map[String, String](
    "metadata.broker.list" -> Constants.KAFKA_BROKERS,
    "group.id" -> Constants.KAFKA_CONSUMER_GROUP,
    "fetch.message.max.bytes" -> "20971520",
    "auto.offset.reset" -> "smallest"
  )
}

// Kafka parameters
val kafkaParams = initKafkaParams
val manager = new KafkaManager(kafkaParams)
val messageDstream = manager.createDirectStream[String, String, StringDecoder, StringDecoder](ssc, kafkaParams, Set(topic))

// Update the offsets after each batch has been processed (rdd comes from messageDstream.foreachRDD)
manager.updateZKOffsets(rdd)
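
Putting the pieces together, a minimal end-to-end driver using the KafkaManager class above might look like the sketch below. The application name, batch interval, broker list, consumer group and topic are placeholder values, not from the original post; replace them with your own.

import kafka.serializer.StringDecoder
import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}

object DirectStreamDemo {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("direct-stream-demo")
    val ssc = new StreamingContext(conf, Seconds(10))

    // Placeholder connection settings.
    val kafkaParams = Map[String, String](
      "metadata.broker.list" -> "broker1:9092,broker2:9092",
      "group.id" -> "demo-group",
      "auto.offset.reset" -> "smallest"
    )
    val topic = "demo-topic"

    val manager = new KafkaManager(kafkaParams)
    val messages = manager.createDirectStream[String, String, StringDecoder, StringDecoder](
      ssc, kafkaParams, Set(topic))

    messages.foreachRDD { rdd =>
      // Process the batch first, then commit the offsets, so that a failure
      // during processing replays the batch instead of silently skipping it.
      if (!rdd.isEmpty()) {
        println(s"records in batch: ${rdd.count()}")
      }
      manager.updateZKOffsets(rdd)
    }

    ssc.start()
    ssc.awaitTermination()
  }
}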

  
