Spark Streaming + Kafka Code Samples (Scala)
A collection of commonly used utility classes and snippets, gathered here for reference.
Sample1 (Array[Byte])
Demonstrates how to consume raw byte arrays from Kafka and decode them with Avro.
package spark.test

import data.processing.avro.AvroDecoder
import kafka.serializer.{DefaultDecoder, StringDecoder}
import org.apache.spark._
import org.apache.spark.streaming._
import org.apache.spark.streaming.kafka.KafkaUtils

object StreamingApp {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("Simple Streaming Application")
    val ssc = new StreamingContext(conf, Seconds(1))
    val topicsSet = "test".split(",").toSet
    val kafkaParams = Map[String, String]("metadata.broker.list" -> "localhost:9092")

    // Direct (receiver-less) stream: keys are decoded as String, values are kept as raw bytes.
    val directKafkaStream = KafkaUtils.createDirectStream[String, Array[Byte], StringDecoder, DefaultDecoder](
      ssc, kafkaParams, topicsSet)

    directKafkaStream.foreachRDD(rdd =>
      rdd.foreachPartition { partitionOfRecords =>
        // Build one decoder per partition to avoid shipping it inside the closure.
        val avroDecoder = new AvroDecoder("/event-record.json")
        partitionOfRecords.map(m => (m._1, avroDecoder.decode(m._2))).foreach(println)
      })

    ssc.start()
    ssc.awaitTermination()
  }
}
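The AvroDecoder used above comes from an external package (data.processing.avro), not from Spark or Kafka. A minimal sketch of what such a helper might look like, assuming /event-record.json is an Avro schema on the classpath and the payloads are plain binary-encoded GenericRecords; the class name and constructor argument are taken from the sample, everything else is an assumption:

import org.apache.avro.Schema
import org.apache.avro.generic.{GenericDatumReader, GenericRecord}
import org.apache.avro.io.DecoderFactory

class AvroDecoder(schemaResource: String) {
  // Parse the schema once per instance; the resource is loaded from the classpath.
  private val schema: Schema =
    new Schema.Parser().parse(getClass.getResourceAsStream(schemaResource))
  private val reader = new GenericDatumReader[GenericRecord](schema)

  // Decode a single binary-encoded Avro payload into a GenericRecord.
  def decode(bytes: Array[Byte]): GenericRecord = {
    val decoder = DecoderFactory.get().binaryDecoder(bytes, null)
    reader.read(null, decoder)
  }
}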
Sample2 (an extension of Sample1)
package tools

import kafka.serializer.DefaultDecoder
import org.apache.spark.streaming.StreamingContext
import org.apache.spark.streaming.dstream.DStream
import org.apache.spark.streaming.kafka.KafkaUtils

case class KafkaPayload(value: Array[Byte])

class KafkaDStreamSource(config: Map[String, String]) {
  def createSource(ssc: StreamingContext, topic: String): DStream[KafkaPayload] = {
    val kafkaParams = config
    val kafkaTopics = Set(topic)
    KafkaUtils
      .createDirectStream[Array[Byte], Array[Byte], DefaultDecoder, DefaultDecoder](
        ssc, kafkaParams, kafkaTopics)
      .map(record => KafkaPayload(record._2)) // keep only the message value
  }
}

object KafkaDStreamSource {
  def apply(config: Map[String, String]): KafkaDStreamSource = new KafkaDStreamSource(config)
}
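A hypothetical usage sketch of this source; the application name, batch interval, broker address, and topic are placeholders:

import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}
import tools.KafkaDStreamSource

object KafkaSourceDemo {
  def main(args: Array[String]): Unit = {
    val ssc = new StreamingContext(new SparkConf().setAppName("KafkaSourceDemo"), Seconds(5))
    val source = KafkaDStreamSource(Map("metadata.broker.list" -> "localhost:9092"))
    // Each KafkaPayload wraps the raw value bytes of one Kafka message.
    source.createSource(ssc, "test")
      .foreachRDD(rdd => println(s"records in batch: ${rdd.count()}"))
    ssc.start()
    ssc.awaitTermination()
  }
}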
Sample3
package org.apress.prospark

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD.rddToOrderedRDDFunctions
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.Seconds
import org.apache.spark.streaming.StreamingContext
import org.apache.spark.streaming.dstream.DStream.toPairDStreamFunctions
import org.apache.spark.streaming.kafka.KafkaUtils

import kafka.serializer.StringDecoder

object StationJourneyCountCustomApp {

  def main(args: Array[String]): Unit = {
    if (args.length != 7) {
      System.err.println(
        "Usage: StationJourneyCountCustomApp <appname> <brokerUrl> <topic> <consumerGroupId> <zkQuorum> <checkpointDir> <outputPath>")
      System.exit(1)
    }
    val Seq(appName, brokerUrl, topic, consumerGroupId, zkQuorum, checkpointDir, outputPath) = args.toSeq

    val conf = new SparkConf()
      .setAppName(appName)
      .setJars(SparkContext.jarOfClass(this.getClass).toSeq)
      //.set("spark.streaming.receiver.writeAheadLog.enable", "true")

    val ssc = new StreamingContext(conf, Seconds(10))
    ssc.checkpoint(checkpointDir)

    val topics = Map[String, Int](topic -> 1)
    val params = Map[String, String](
      "zookeeper.connect" -> zkQuorum,
      "group.id" -> consumerGroupId,
      "bootstrap.servers" -> brokerUrl)

    // Receiver-based stream: count journeys per station pair (CSV fields 3 and 7),
    // then write the results sorted by count, descending.
    KafkaUtils.createStream[String, String, StringDecoder, StringDecoder](ssc, params, topics, StorageLevel.MEMORY_ONLY_SER)
      .map(_._2)
      .map(rec => rec.split(","))
      .map(rec => ((rec(3), rec(7)), 1))
      .reduceByKey(_ + _)
      .repartition(1)
      .map(rec => (rec._2, rec._1))
      .transform(rdd => rdd.sortByKey(ascending = false))
      .saveAsTextFiles(outputPath)

    ssc.start()
    ssc.awaitTermination()
  }
}
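Sample3 consumes through the receiver-based KafkaUtils.createStream, which relies on ZooKeeper and a consumer group. For comparison, the same lines could be obtained with the direct (receiver-less) API used in Sample1 and Sample2; a sketch only, reusing ssc, brokerUrl, and topic from the code above (the direct API tracks offsets itself, so group.id and zookeeper.connect are not needed):

val directLines = KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder](
  ssc,
  Map("metadata.broker.list" -> brokerUrl), // broker list instead of zookeeper.connect
  Set(topic)
).map(_._2) // keep only the message value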
Sample4 (Avro-format records)
import Schemas.{Sales_v2, Shipments_v1}
import io.confluent.kafka.serializers.KafkaAvroDecoder
import org.apache.spark.streaming.kafka.KafkaUtils
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.SparkConf
import