Essentials: Spark Kafka producer and consumer

 Spark consumes Kafka messages and writes them back out through a producer that is lazily initialized at partition (foreachPartition) scope (partly reposted, partly original; QQ 438509676)


===============================version===========================================

<scala.version>2.11.8</scala.version>
<spark.version>2.2.0</spark.version>
<kafka.version>0.10.2.1</kafka.version>

===============================pom.xml===========================================

<dependency>
    <groupId>org.apache.spark</groupId>
    <artifactId>spark-streaming_2.11</artifactId>
    <version>2.2.0</version>
</dependency>

<dependency>
    <groupId>org.apache.spark</groupId>
    <artifactId>spark-streaming-kafka-0-10_2.11</artifactId>
    <version>2.2.0</version>
    <exclusions>
        <exclusion> <groupId>org.slf4j</groupId> <artifactId>slf4j-log4j12</artifactId></exclusion>
        <exclusion> <groupId>log4j</groupId> <artifactId>log4j</artifactId> </exclusion>
    </exclusions>
</dependency>

===============================Spark kafka========================================

import java.util.Properties
import com.mobike.dmp.utils.KafkaSink
import org.apache.kafka.clients.producer.{ProducerRecord, KafkaProducer}
import org.apache.kafka.common.serialization.{StringSerializer, StringDeserializer}
import org.apache.spark.broadcast.Broadcast
import org.apache.spark.streaming.{StreamingContext, Seconds}
import org.apache.spark.{SparkContext, SparkConf}
import org.apache.spark.streaming.kafka010._
import org.apache.spark.streaming.kafka010.LocationStrategies.PreferConsistent
import org.apache.spark.streaming.kafka010.ConsumerStrategies.Subscribe

/**
  * Created by JohnVictor on 2018/9/11.
  */
object BrushOrderPushTj {
  def main(args: Array[String]) {
    // For local testing, enable: .setMaster("local[*]").setAppName("T10LOCATION")
    val conf = new SparkConf()
    // 1800-second (30-minute) batch interval
    val ssc = new StreamingContext(conf, Seconds(1800))
    val kafkaParams = Map[String, Object](
      "bootstrap.servers" -> "localhost:9092,localhost:9093,localhost:9094",
      "key.deserializer" -> classOf[StringDeserializer],
      "value.deserializer" -> classOf[StringDeserializer],
      "group.id" -> "group_id_bikeinfo",
      "auto.offset.reset" -> "latest",
      "enable.auto.commit" -> (false: java.lang.Boolean)
    )
    // Broadcast a serializable KafkaSink wrapper; the actual KafkaProducer is created
    // lazily on each executor the first time send() is called.
    val kafkaProducer: Broadcast[KafkaSink[String, String]] = {
      val kafkaProducerConfig = {
        val p = new Properties()
        p.setProperty("bootstrap.servers","localhost1:9092,localhost1:9093,localhost1:9094")
        p.setProperty("key.serializer", classOf[StringSerializer].getName)
        p.setProperty("value.serializer", classOf[StringSerializer].getName)
        p
      }
      println("kafka producer init done!")
      ssc.sparkContext.broadcast(KafkaSink[String, String](kafkaProducerConfig))
    }
    val topics = Array("T10LOCATION")
    val stream = KafkaUtils.createDirectStream[String, String](
      ssc,
      PreferConsistent,
      Subscribe[String, String](topics, kafkaParams)
    )//.repartition(10)  // note: repartitioning here would break the HasOffsetRanges cast below

    stream.foreachRDD { recordBikeInfos =>
      // Capture this batch's offset ranges before doing anything else with the RDD
      val offsetRanges = recordBikeInfos.asInstanceOf[HasOffsetRanges].offsetRanges
      if (!recordBikeInfos.isEmpty) {
        // Runs on the executors; each executor reuses its lazily created producer
        recordBikeInfos.foreach(record => {
          kafkaProducer.value.send("T10LOCATION", record.value())
          println("info: " + record.value())
          println("successfully sent record to Kafka")
          // do something else
        })
      }
      // Commit the consumed offsets back to Kafka once the batch has been processed
      stream.asInstanceOf[CanCommitOffsets].commitAsync(offsetRanges)
    }
    ssc.start()
    ssc.awaitTermination()
  }
}
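
If you prefer the foreachPartition form referred to in the title, the foreachRDD block above can be written roughly as follows. This is a sketch under the same assumptions (the same stream, the broadcast kafkaProducer, and the offset handling shown above), not a tested drop-in:

// Sketch: partition-level send loop reusing the broadcast KafkaSink.
// The lazy producer inside KafkaSink is created at most once per executor JVM,
// so the whole partition reuses a single Kafka connection.
stream.foreachRDD { rdd =>
  val offsetRanges = rdd.asInstanceOf[HasOffsetRanges].offsetRanges
  if (!rdd.isEmpty) {
    rdd.foreachPartition { partition =>
      partition.foreach { record =>
        kafkaProducer.value.send("T10LOCATION", record.value())
      }
    }
  }
  stream.asInstanceOf[CanCommitOffsets].commitAsync(offsetRanges)
}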

===============================KafkaSink lazy initialization========================================

 

/**
  * The naive approach is to create a KafkaProducer for every record of every partition
  * and use that producer to send the record. Note that the producer cannot simply be
  * created once outside foreachPartition on the driver and shipped to the executors,
  * because KafkaProducer is not serializable. Creating a producer per record is clearly
  * inflexible and inefficient, since every record opens a new connection.
  * The KafkaSink below works around this: it wraps a producer factory (which is
  * serializable) and builds the real producer lazily, at most once per executor JVM.
  */
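
For contrast, the naive per-record approach that the note above warns against would look roughly like the sketch below (hypothetical, for illustration only; it assumes the same stream and imports as the job above):

// Anti-pattern sketch: a brand-new KafkaProducer is built and closed for every record,
// because the producer cannot be created once on the driver (it is not serializable).
stream.foreachRDD { rdd =>
  rdd.foreachPartition { partition =>
    partition.foreach { record =>
      val props = new java.util.Properties()
      props.setProperty("bootstrap.servers", "localhost1:9092,localhost1:9093,localhost1:9094")
      props.setProperty("key.serializer", classOf[StringSerializer].getName)
      props.setProperty("value.serializer", classOf[StringSerializer].getName)
      val producer = new KafkaProducer[String, String](props) // expensive connection setup per record
      producer.send(new ProducerRecord[String, String]("T10LOCATION", record.value()))
      producer.close()
    }
  }
}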

import java.util.concurrent.Future
import org.apache.kafka.clients.producer.{ KafkaProducer, ProducerRecord, RecordMetadata }
class KafkaSink[K, V](createProducer: () => KafkaProducer[K, V]) extends Serializable {
  /* This is the key idea that allows us to work around running into
     NotSerializableExceptions. */
  lazy val producer = createProducer()
  def send(topic: String, key: K, value: V): Future[RecordMetadata] =
    producer.send(new ProducerRecord[K, V](topic, key, value))
  def send(topic: String, value: V): Future[RecordMetadata] =
    producer.send(new ProducerRecord[K, V](topic, value))
}

object KafkaSink {

  import scala.collection.JavaConversions._

  def apply[K, V](config: Map[String, Object]): KafkaSink[K, V] = {
    val createProducerFunc = () => {
      val producer = new KafkaProducer[K, V](config)
      sys.addShutdownHook {
        // Ensure that, on executor JVM shutdown, the Kafka producer sends
        // any buffered messages to Kafka before shutting down.
        producer.close()
      }
      producer
    }
    new KafkaSink(createProducerFunc)
  }

  def apply[K, V](config: java.util.Properties): KafkaSink[K, V] = apply(config.toMap)
}
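
For completeness, the Map-based apply overload can be used directly on the driver before broadcasting, roughly as in the sketch below (broker addresses are the same placeholders as above; ssc is the StreamingContext from the job):

// Sketch: building and broadcasting a KafkaSink from a Scala Map config
// (the main job above uses the Properties-based overload instead).
val producerConfig = Map[String, Object](
  "bootstrap.servers" -> "localhost1:9092,localhost1:9093,localhost1:9094",
  "key.serializer"    -> classOf[StringSerializer].getName,
  "value.serializer"  -> classOf[StringSerializer].getName
)
val sink = ssc.sparkContext.broadcast(KafkaSink[String, String](producerConfig))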