使用 Spark Streaming 读取 Kafka 中的数据,并将计算结果写回 Kafka(方案二)

依赖
 <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-core_2.11</artifactId>
            <version>2.4.3</version>
        </dependency>

 <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-client</artifactId>
            <version>2.9.2</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-auth</artifactId>
            <version>2.9.2</version>
        </dependency>
  <!--对接Streaming-->
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-streaming_2.11</artifactId>
            <version>2.4.3</version>
        </dependency>
           <!--对接Kafka-->
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-streaming-kafka-0-10_2.11</artifactId>
            <version>2.4.3</version>
        </dependency>

代码

import java.util.Properties

import org.apache.kafka.clients.producer.{KafkaProducer, ProducerConfig, ProducerRecord}
import org.apache.kafka.clients.producer.{Callback, RecordMetadata}
import org.apache.kafka.common.serialization.StringSerializer

import scala.util.control.NonFatal

/**
 * Sink that publishes (word, count) pairs to the Kafka topic "topic02".
 *
 * A single producer is shared by everything running in the same JVM: it is created
 * lazily on first use and closed by a JVM shutdown hook.
 */
object KafkaSink extends Serializable {

  /**
   * Builds a new String/String Kafka producer pointed at "CentOS:9092".
   *
   * @return a freshly constructed producer; the caller owns it and must close it
   */
  def createKafkaConnection(): KafkaProducer[String, String] = {
    val props = new Properties()
    props.put(ProducerConfig.BOOTSTRAP_SERVERS_CONFIG, "CentOS:9092")
    props.put(ProducerConfig.KEY_SERIALIZER_CLASS_CONFIG, classOf[StringSerializer].getName)
    props.put(ProducerConfig.VALUE_SERIALIZER_CLASS_CONFIG, classOf[StringSerializer].getName)
    props.put(ProducerConfig.ENABLE_IDEMPOTENCE_CONFIG, "true") // enable the idempotent producer
    props.put(ProducerConfig.RETRIES_CONFIG, "2") // retry failed sends
    // batch.size is expressed in bytes, so 100 yields very small batches.
    // NOTE(review): confirm this value is intended.
    props.put(ProducerConfig.BATCH_SIZE_CONFIG, "100")
    props.put(ProducerConfig.LINGER_MS_CONFIG, "1000") // wait at most 1000 ms to fill a batch
    new KafkaProducer[String, String](props)
  }

  /** Shared producer, created on first access. */
  lazy val kafkaProducer: KafkaProducer[String, String] = createKafkaConnection()

  // Close the shared producer when the JVM exits. Because `kafkaProducer` is lazy, the
  // hook will create it first if it was never used.
  Runtime.getRuntime.addShutdownHook(new Thread() {
    override def run(): Unit = {
      kafkaProducer.close()
    }
  })

  /** Reports a delivery failure; the original best-effort message is preserved. */
  private def reportFailure(e: Throwable): Unit = {
    println("发邮件,出错啦~")
    e.printStackTrace()
  }

  // `send` is asynchronous: broker-side failures are delivered to this callback rather
  // than thrown at the call site, so they must be handled here to avoid silent loss.
  // Written as an explicit anonymous class because Scala 2.11 has no SAM conversion.
  private val reportOnFailure: Callback = new Callback {
    override def onCompletion(metadata: RecordMetadata, exception: Exception): Unit = {
      if (exception != null) {
        reportFailure(exception)
      }
    }
  }

  /**
   * Publishes every (word, count) pair of one partition to "topic02", using the word as
   * the record key and the count (as a string) as the record value.
   *
   * Failures are reported but never rethrown, so a Kafka problem does not fail the
   * calling task. An empty iterator does not touch the producer.
   *
   * @param vs the (word, count) pairs to publish
   */
  def save(vs: Iterator[(String, Int)]): Unit = {
    try {
      if (vs.hasNext) {
        vs.foreach { case (word, count) =>
          val record = new ProducerRecord[String, String]("topic02", word, count.toString)
          kafkaProducer.send(record, reportOnFailure)
        }
        // Block until buffered records are actually sent, so they are not still sitting
        // in the producer buffer when this method returns.
        kafkaProducer.flush()
      }
    } catch {
      case NonFatal(e) => reportFailure(e)
    }
  }
}
val checkpointDir = "file:///D:/checkpointdir"

// Obtain the streaming context via getOrCreate: the factory below builds the whole
// pipeline when the context has to be created from scratch.
val ssc = StreamingContext.getOrCreate(checkpointDir, () => {
  println("==========init ssc==========")

  val conf = new SparkConf()
    .setAppName("DirectKafkaWordCount")
    .setMaster("local[6]")
  val streamingContext = new StreamingContext(conf, Seconds(2))
  streamingContext.checkpoint(checkpointDir)

  val consumerSettings = Map[String, Object](
    ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG -> "CentOS:9092",
    ConsumerConfig.GROUP_ID_CONFIG -> "g1",
    ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG -> classOf[StringDeserializer],
    ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG -> classOf[StringDeserializer])

  // Read directly from Kafka topic "topic01".
  // NOTE(review): the original author's note says topic offsets are maintained through
  // the checkpoint rather than by Kafka itself — confirm against the integration guide.
  val source = KafkaUtils.createDirectStream[String, String](
    streamingContext,
    // Location strategy; per the original note, chosen for the case where Spark compute
    // nodes and Kafka brokers do not share the same physical hosts.
    LocationStrategies.PreferConsistent,
    ConsumerStrategies.Subscribe[String, String](List("topic01"), consumerSettings))

  // Adds the incoming increment to the running total kept for a word, stores the new
  // total as the word's state, and emits (word, total).
  val accumulate = (word: String, increment: Option[Int], history: State[Int]) => {
    val total = history.getOption().getOrElse(0) + increment.getOrElse(0)
    history.update(total)
    (word, total)
  }

  val words = source.map(record => record.value).flatMap(line => line.split(" "))
  val runningCounts = words.map(word => (word, 1)).mapWithState(StateSpec.function(accumulate))

  // Hand each partition of every batch to the Kafka sink.
  runningCounts.foreachRDD { batch =>
    batch.foreachPartition(partition => KafkaSink.save(partition))
  }

  streamingContext
})

ssc.sparkContext.setLogLevel("FATAL")

ssc.start()
ssc.awaitTermination()
}
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值