Spark Streaming + Kafka Utility Code (Scala)

This post collects Scala code samples for using Spark Streaming with Kafka, covering several practical scenarios: reading raw byte arrays from Kafka, handling Avro-format data, converting Avro to JSON, decoding with a TweetDecoder, window operations, and offset management with an OffsetsStore.

Some very commonly used utility classes, collected here for reference.

Sample1 (Array[Byte])

Demonstrates how to read raw byte arrays (Array[Byte]) from Kafka and decode them inside each partition.

package spark.test

import data.processing.avro.AvroDecoder
import kafka.serializer.StringDecoder
import kafka.serializer.DefaultDecoder
import org.apache.spark._
import org.apache.spark.streaming._
import org.apache.spark.streaming.kafka.KafkaUtils


object StreamingApp {

  def main(args: Array[String]): Unit = {

    val conf = new SparkConf().setAppName("Simple Streaming Application")
    val ssc = new StreamingContext(conf, Seconds(1))

    // Topic set and broker list for the direct (receiver-less) Kafka stream
    val topicsSet = "test".split(",").toSet
    val kafkaParams = Map[String, String]("metadata.broker.list" -> "localhost:9092")

    // Keys are decoded as String, values are kept as raw byte arrays
    val directKafkaStream = KafkaUtils.createDirectStream[String, Array[Byte], StringDecoder, DefaultDecoder](
      ssc, kafkaParams, topicsSet
    )

    directKafkaStream.foreachRDD(rdd =>
      rdd.foreachPartition(partitionOfRecords => {
        // Create the decoder once per partition rather than once per record
        val avroDecoder = new AvroDecoder("/event-record.json")
        partitionOfRecords.map(m => (m._1, avroDecoder.decode(m._2))).foreach(m => println(m))
      }))

    ssc.start()
    ssc.awaitTermination()
  }
}
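
The AvroDecoder above lives in a project-specific data.processing.avro package that the post does not show. A minimal sketch of what such a decoder might look like, assuming the constructor argument names an Avro schema file on the classpath and that the Kafka values are plain binary-encoded Avro records (both are assumptions, not the original implementation):

package data.processing.avro

import org.apache.avro.Schema
import org.apache.avro.generic.{GenericDatumReader, GenericRecord}
import org.apache.avro.io.DecoderFactory

// Hypothetical stand-in for the AvroDecoder used in Sample1:
// loads a schema from a classpath resource and decodes binary Avro payloads.
class AvroDecoder(schemaPath: String) extends Serializable {

  // Parse the schema once; schemaPath is expected to look like "/event-record.json"
  @transient private lazy val schema: Schema =
    new Schema.Parser().parse(getClass.getResourceAsStream(schemaPath))

  @transient private lazy val reader =
    new GenericDatumReader[GenericRecord](schema)

  // Decode a single binary-encoded Avro record into a GenericRecord
  def decode(bytes: Array[Byte]): GenericRecord = {
    val binaryDecoder = DecoderFactory.get().binaryDecoder(bytes, null)
    reader.read(null, binaryDecoder)
  }
}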

Sample2 (an extension of Sample1)

package tools

import kafka.serializer.DefaultDecoder
import org.apache.spark.streaming.StreamingContext
import org.apache.spark.streaming.dstream.DStream
import org.apache.spark.streaming.kafka.KafkaUtils

case class KafkaPayload(value: Array[Byte])

// Thin wrapper that turns a Kafka topic into a DStream of raw-byte payloads
class KafkaDStreamSource(config: Map[String, String]) {

  def createSource(ssc: StreamingContext, topic: String): DStream[KafkaPayload] = {
    val kafkaParams = config
    val kafkaTopics = Set(topic)

    // Keys and values both stay as byte arrays; only the value is wrapped and kept
    KafkaUtils
      .createDirectStream[Array[Byte], Array[Byte], DefaultDecoder, DefaultDecoder](
        ssc,
        kafkaParams,
        kafkaTopics)
      .map(record => KafkaPayload(record._2))
  }

}

object KafkaDStreamSource {
  def apply(config: Map[String, String]): KafkaDStreamSource = new KafkaDStreamSource(config)
}
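
A sketch of how KafkaDStreamSource might be wired into a job; the broker address, batch interval, and topic name below are placeholders rather than values from the original post:

package tools

import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.streaming.dstream.DStream

object KafkaDStreamSourceExample {

  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("KafkaDStreamSource Example")
    val ssc = new StreamingContext(conf, Seconds(5))

    // Placeholder Kafka configuration; adjust the broker list and topic to your setup
    val source = KafkaDStreamSource(Map("metadata.broker.list" -> "localhost:9092"))
    val payloads: DStream[KafkaPayload] = source.createSource(ssc, "test")

    // Simple sanity check: print the size of each received payload
    payloads.map(_.value.length).print()

    ssc.start()
    ssc.awaitTermination()
  }
}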

Sample3 (receiver-based createStream)

package org.apress.prospark

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD.rddToOrderedRDDFunctions
import org.apache.spark.streaming.Seconds
import org.apache.spark.streaming.StreamingContext
import org.apache.spark.streaming.dstream.DStream.toPairDStreamFunctions
import org.apache.spark.streaming.kafka.KafkaUtils
import kafka.serializer.StringDecoder
import org.apache.spark.storage.StorageLevel

object StationJourneyCountCustomApp {

  def main(args: Array[String]) {

    if (args.length != 7) {
      System.err.println(
        "Usage: StationJourneyCountCustomApp <appname> <brokerUrl> <topic> <consumerGroupId> <zkQuorum> <checkpointDir> <outputPath>")
      System.exit(1)
    }

    val Seq(appName, brokerUrl, topic, consumerGroupId, zkQuorum, checkpointDir, outputPath) = args.toSeq

    val conf = new SparkConf()
      .setAppName(appName)
      .setJars(SparkContext.jarOfClass(this.getClass).toSeq)
      //.set("spark.streaming.receiver.writeAheadLog.enable", "true")

    val ssc = new StreamingContext(conf, Seconds(10))
    ssc.checkpoint(checkpointDir)

    // One receiver thread for the topic
    val topics = Map[String, Int](
      topic -> 1)
    // Receiver-based consumer configuration (offsets are tracked via ZooKeeper)
    val params = Map[String, String](
      "zookeeper.connect" -> zkQuorum,
      "group.id" -> consumerGroupId,
      "bootstrap.servers" -> brokerUrl)
    KafkaUtils.createStream[String, String, StringDecoder, StringDecoder](ssc, params, topics, StorageLevel.MEMORY_ONLY_SER).map(_._2)
      .map(rec => rec.split(","))                          // parse CSV records
      .map(rec => ((rec(3), rec(7)), 1))                   // key by the two station columns (indices 3 and 7)
      .reduceByKey(_ + _)                                  // count journeys per station pair
      .repartition(1)
      .map(rec => (rec._2, rec._1))                        // swap so the count becomes the key
      .transform(rdd => rdd.sortByKey(ascending = false))  // sort by count, descending
      .saveAsTextFiles(outputPath)

    ssc.start()
    ssc.awaitTermination()
  }

} 
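
Sample3 uses the older receiver-based createStream API. For comparison with Sample1 and Sample2, a sketch of the same aggregation on top of the direct (receiver-less) API; the broker list, topic name, checkpoint directory, and output path are placeholders:

import kafka.serializer.StringDecoder
import org.apache.spark.SparkConf
import org.apache.spark.rdd.RDD.rddToOrderedRDDFunctions
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.streaming.dstream.DStream.toPairDStreamFunctions
import org.apache.spark.streaming.kafka.KafkaUtils

object StationJourneyCountDirectSketch {

  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("StationJourneyCountDirect")
    val ssc = new StreamingContext(conf, Seconds(10))
    ssc.checkpoint("/tmp/checkpoint") // placeholder checkpoint directory

    // The direct stream takes the broker list instead of a ZooKeeper quorum
    val params = Map[String, String]("metadata.broker.list" -> "localhost:9092")
    val topics = Set("station-journeys") // placeholder topic name

    KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder](ssc, params, topics)
      .map(_._2)
      .map(rec => rec.split(","))
      .map(rec => ((rec(3), rec(7)), 1))
      .reduceByKey(_ + _)
      .map(rec => (rec._2, rec._1))
      .transform(rdd => rdd.sortByKey(ascending = false))
      .saveAsTextFiles("/tmp/station-journeys-out") // placeholder output path

    ssc.start()
    ssc.awaitTermination()
  }
}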

Sample4 (Avro-format data)

import Schemas.{Sales_v2, Shipments_v1}
import io.confluent.kafka.serializers.KafkaAvroDecoder
import org.apache.spark.streaming.kafka.KafkaUtils
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.SparkConf
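
The summary at the top also mentions converting Avro to JSON. A minimal sketch of that conversion with the standard Avro Java API, assuming a decoded GenericRecord (for example, the output of a KafkaAvroDecoder) is already available; the object and method names are illustrative only:

import java.io.ByteArrayOutputStream

import org.apache.avro.generic.{GenericDatumWriter, GenericRecord}
import org.apache.avro.io.EncoderFactory

object AvroToJson {

  // Serialize a decoded GenericRecord into its JSON representation.
  // The record carries its own schema, so no extra schema lookup is needed.
  def toJson(record: GenericRecord): String = {
    val schema = record.getSchema
    val out = new ByteArrayOutputStream()
    val jsonEncoder = EncoderFactory.get().jsonEncoder(schema, out)
    val writer = new GenericDatumWriter[GenericRecord](schema)
    writer.write(record, jsonEncoder)
    jsonEncoder.flush()
    out.toString("UTF-8")
  }
}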