Connecting Spark to Kafka

10.1    Example Code
package org.apache.spark.examples.streaming

import kafka.serializer.StringDecoder

import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.streaming.kafka.KafkaUtils

object DirectKafkaWordCount {
  def main(args: Array[String]) {
    if (args.length < 2) {
      System.err.println(s"""
        |Usage: DirectKafkaWordCount <brokers> <topics>
        |  <brokers> is a list of one or more Kafka brokers
        |  <topics> is a list of one or more kafka topics to consume from
        |
        """.stripMargin)
      System.exit(1)
    }

    // Helper from Spark's streaming examples that sets a reasonable log level
    StreamingExamples.setStreamingLogLevels()

    val Array(brokers, topics) = args

    // Create context with 2 second batch interval
    val sparkConf = new SparkConf().setAppName("DirectKafkaWordCount")
    val ssc = new StreamingContext(sparkConf, Seconds(2))

    // Create direct kafka stream with brokers and topics
    val topicsSet = topics.split(",").toSet
    val kafkaParams = Map[String, String]("metadata.broker.list" -> brokers)
    val messages = KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder](
      ssc, kafkaParams, topicsSet)

    // Get the lines, split them into words, count the words and print
    val lines = messages.map(_._2)
    val words = lines.flatMap(_.split(" "))
    val wordCounts = words.map(x => (x, 1L)).reduceByKey(_ + _)
    wordCounts.print()

    // Start the computation
    ssc.start()
    ssc.awaitTermination()
  }
}

10.2    Maintaining Offsets in Production
The direct stream in 10.1 tracks its offsets only inside Spark (in checkpoints, if enabled) and never commits them to ZooKeeper, so after a restart without a usable checkpoint the job would either skip or re-read data. In production the offsets are therefore maintained explicitly.
Configuration parameters (Kryo serialization for efficiency):
sparkConf.set("spark.akka.frameSize", "2047")
sparkConf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer") // use Kryo serialization for the default classes
sparkConf.set("spark.kryoserializer.buffer.max.mb", "2040")
sparkConf.set("spark.files.overwrite", "true")
sparkConf.set("spark.hadoop.validateOutputSpecs", "false")
sparkConf.set("spark.eventLog.overwrite", "true")
sparkConf.set("spark.streaming.kafka.maxRatePerPartition", "30") // max records consumed per second per partition; each batch pulls at most this value times the number of the topic's partitions times the batch interval in seconds (see the sketch below)
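A minimal sketch, assuming a hypothetical application name and a 10-partition topic, of how these settings feed into the streaming context and what the rate limit works out to:

import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}

val sparkConf = new SparkConf().setAppName("OffsetManagedStreamingJob") // name is a placeholder
sparkConf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
sparkConf.set("spark.streaming.kafka.maxRatePerPartition", "30")

// 2-second batches: with a 10-partition topic, each batch pulls at most
// 30 records/sec/partition * 10 partitions * 2 s = 600 records
val ssc = new StreamingContext(sparkConf, Seconds(2))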


The offset-managing KafkaClient wrapper:

package com.suning.mep.utils

import kafka.common.TopicAndPartition
import kafka.message.MessageAndMetadata
import kafka.serializer.Decoder
import org.apache.spark.SparkException
import org.apache.spark.rdd.RDD
import org.apache.spark.streaming.StreamingContext
import org.apache.spark.streaming.dstream.InputDStream
import org.apache.spark.streaming.kafka.KafkaCluster.LeaderOffset
import org.apache.spark.streaming.kafka.{HasOffsetRanges, KafkaCluster, KafkaUtils}

import scala.reflect.ClassTag

class KafkaClient(val kafkaParams: Map[String, String]) extends Serializable {
  private val kc = new KafkaCluster(kafkaParams)

  // Create a direct stream that starts from the offsets previously committed for this consumer group
  def createDirectStream[K: ClassTag, V: ClassTag, KD <: Decoder[K]: ClassTag, VD <: Decoder[V]: ClassTag](
      ssc: StreamingContext, topic: String): InputDStream[(K, V)] = {
    val partitionsEither = kc.getPartitions(Set(topic))
    if (partitionsEither.isLeft) throw new SparkException(s"get kafka partition failed: ${partitionsEither.left.get}")

    val partitions = partitionsEither.right.get
    val groupId = kafkaParams.get("group.id").get

    val offsets = setOrUpdateOffsets(partitions, groupId)
    KafkaUtils.createDirectStream[K, V, KD, VD, (K, V)](ssc, kafkaParams, offsets, (mmd: MessageAndMetadata[K, V]) => (mmd.key, mmd.message))
  }

  private def setOrUpdateOffsets(partitions: Set[TopicAndPartition], groupId: String): Map[TopicAndPartition, Long] = {
    val consumerOffsetEither = kc.getConsumerOffsets(groupId, partitions)
    if (consumerOffsetEither.isLeft) {
      val reset = kafkaParams.get("auto.offset.reset").map(_.toLowerCase)

      var leaderOffsets: Map[TopicAndPartition, LeaderOffset] = null
      if (reset == Some("smallest")) {
        val leaderOffsetsEither = kc.getEarliestLeaderOffsets(partitions)
        if (leaderOffsetsEither.isLeft) throw new SparkException(s"get earliest leader offsets failed: ${leaderOffsetsEither.left.get}")
        leaderOffsets = leaderOffsetsEither.right.get
      } else {
        val leaderOffsetsEither = kc.getLatestLeaderOffsets(partitions)
        if (leaderOffsetsEither.isLeft) throw new SparkException(s"get latest leader offsets failed: ${leaderOffsetsEither.left.get}")
        leaderOffsets = leaderOffsetsEither.right.get
      }

      val offsets = leaderOffsets.map {
        case (tp, offset) => (tp, offset.offset)
      }

      kc.setConsumerOffsets(groupId, offsets)
      offsets
    } else {
      /**
        * If the streaming job throws kafka.common.OffsetOutOfRangeException, the offsets saved
        * in ZooKeeper are stale: Kafka's log retention has already deleted the segments that
        * contained them. To handle this, compare the consumer offsets stored in ZooKeeper with
        * earliestLeaderOffsets; if the consumer offsets are smaller, they are out of date, so
        * reset them to earliestLeaderOffsets.
        */
      val earliestLeaderOffsetsEither = kc.getEarliestLeaderOffsets(partitions)
      if (earliestLeaderOffsetsEither.isLeft) throw new SparkException(s"get earliest leader offsets failed: ${earliestLeaderOffsetsEither.left.get}")

      val earliestLeaderOffsets = earliestLeaderOffsetsEither.right.get
      val consumerOffsets = consumerOffsetEither.right.get

      var offsets: Map[TopicAndPartition, Long] = Map()
      consumerOffsets.foreach({ case (tp, n) =>
        val earliestLeaderOffset = earliestLeaderOffsets(tp).offset
        if (n < earliestLeaderOffset) {
          offsets += (tp -> earliestLeaderOffset)
        } else {
          offsets += (tp -> n)
        }
      })

      if (!offsets.isEmpty) {
        kc.setConsumerOffsets(groupId, offsets)
      }
      offsets
    }

  }

  // Commit the offset ranges of a processed RDD so that a restarted job resumes after them
  def updateOffsets(rdd: RDD[(String, String)]): Unit = {
    val groupId = kafkaParams.get("group.id").get
    val offsetsList = rdd.asInstanceOf[HasOffsetRanges].offsetRanges
    for (offsets <- offsetsList) {
      val topicAndPartition = TopicAndPartition(offsets.topic, offsets.partition)
      val o = kc.setConsumerOffsets(groupId, Map((topicAndPartition, offsets.untilOffset)))
      if (o.isLeft) {
        println(s"Error updating the offset to Kafka cluster: ${o.left.get}")
      }
    }
  }
}

Producer side:
import java.util.Properties

import org.apache.kafka.clients.producer.KafkaProducer
import org.apache.kafka.common.serialization.StringSerializer

val kafkaProducerFunc = () => {
  val config = {
    val p = new Properties()
    p.setProperty("bootstrap.servers", ScmConfUtil.getInstance().getString("bootstrap.servers", ""))
    p.setProperty("key.serializer", classOf[StringSerializer].getName)
    p.setProperty("value.serializer", classOf[StringSerializer].getName)
    p
  }
  val producer = new KafkaProducer[String, String](config)
  sys.addShutdownHook {
    // Ensure that, on executor JVM shutdown, the Kafka producer sends
    // any buffered messages to Kafka before shutting down.
    producer.close()
  }
  producer
}

val kafkaProducer = kafkaProducerFunc()
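Because KafkaProducer is not serializable, an instance built on the driver cannot be referenced inside RDD closures; instead the factory function is invoked on the executors, as the shutdown-hook comment above suggests. A minimal sketch of writing results back to Kafka, assuming a hypothetical result stream wordCounts: DStream[(String, Long)] and output topic "ppmep_result_topic"; for brevity a producer is created and closed per partition, whereas in production it would usually be cached once per executor JVM:

import org.apache.kafka.clients.producer.ProducerRecord

wordCounts.foreachRDD { rdd =>
  rdd.foreachPartition { partition =>
    // kafkaProducerFunc() runs on the executor, so no producer instance crosses the network
    val producer = kafkaProducerFunc()
    partition.foreach { case (word, count) =>
      producer.send(new ProducerRecord[String, String]("ppmep_result_topic", word, count.toString))
    }
    producer.close() // flushes any buffered records for this partition
  }
}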

Example usage:
val kafkaParams = Map[String, String](
  "metadata.broker.list" -> ScmConfUtil.getInstance().getString("order.metadata.broker.list", ""),
  "auto.offset.reset" -> ScmConfUtil.getInstance().getString("auto.offset.reset", "smallest"),
  "group.id" -> ScmConfUtil.getInstance().getString("order.group.id", "spark_rtppmep"))
val kafkaClient = new KafkaClient(kafkaParams)
// sparkContext
// val broadcastVar = ssc.sparkContext.broadcast(RedisUtils.getInstance())

val message = kafkaClient.createDirectStream[String, String, StringDecoder, StringDecoder](ssc, "ppmep_topic")
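The stream created above is then processed batch by batch, and the offsets are committed only after each batch has been handled, so that a restarted job resumes from the last successfully processed position. A minimal sketch (the per-batch processing shown is a placeholder):

message.foreachRDD { rdd =>
  if (!rdd.isEmpty()) {
    // placeholder processing: in a real job the business logic runs here
    println(s"processed ${rdd.count()} records in this batch")
  }
  // commit the consumed offset ranges only after the batch has been processed
  kafkaClient.updateOffsets(rdd)
}

ssc.start()
ssc.awaitTermination()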
 

Reposted from: https://my.oschina.net/u/778683/blog/1831143
