The transform operator in Spark Streaming
Purpose: when a business requirement calls for changing the data structure, transform can perform the conversion.
Example: read data from Kafka, convert it with the transform operator, and print the result.
Add the dependencies:
<dependency>
<groupId>org.apache.kafka</groupId>
<artifactId>kafka_2.11</artifactId>
<version>2.0.0</version>
</dependency>
<dependency>
<groupId>org.apache.kafka</groupId>
<artifactId>kafka-streams</artifactId>
<version>2.0.0</version>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-core_2.11</artifactId>
<version>2.4.5</version>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-streaming_2.11</artifactId>
<version>2.4.5</version>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-streaming-kafka-0-10_2.11</artifactId>
<version>2.4.5</version>
</dependency>
<dependency>
<groupId>com.fasterxml.jackson.core</groupId>
<artifactId>jackson-databind</artifactId>
<version>2.6.6</version>
</dependency>
Scala implementation:
import java.text.SimpleDateFormat
import org.apache.kafka.clients.consumer.{ConsumerConfig, ConsumerRecord}
import org.apache.spark.SparkConf
import org.apache.spark.rdd.RDD
import org.apache.spark.streaming.dstream.{DStream, InputDStream}
import org.apache.spark.streaming.kafka010.{ConsumerStrategies, KafkaUtils, LocationStrategies}
import org.apache.spark.streaming.{Seconds, StreamingContext}
object SparkTransform {
def main(args: Array[String]): Unit = {
val sparkConf: SparkConf = new SparkConf().setAppName("kafkaDemo").setMaster("local[2]")
val streamingContext = new StreamingContext(sparkConf,Seconds(2))
streamingContext.checkpoint("checkpoint")
// Kafka configuration parameters
val kafkaParams = Map(
(ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG -> "192.168.237.100:9092"),
(ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG -> "org.apache.kafka.common.serialization.StringDeserializer"),
(ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG -> "org.apache.kafka.common.serialization.StringDeserializer"),
(ConsumerConfig.GROUP_ID_CONFIG -> "kafkaGroup1")
)
val kafkaStream: InputDStream[ConsumerRecord[String, String]] = KafkaUtils.createDirectStream[String, String](
streamingContext,
LocationStrategies.PreferConsistent,
ConsumerStrategies.Subscribe[String, String](Set("sparkKafkaDemo"), kafkaParams)
)
// When a business requirement calls for reshaping the data, transform does the conversion (this overload receives the batch RDD and the batch time)
val numStream: DStream[((String, String), Int)] = kafkaStream.transform((rdd, timestamp) => {
val format = new SimpleDateFormat("yyyyMMdd HH:mm:ss")
val time: String = format.format(timestamp.milliseconds)
val value: RDD[((String, String), Int)] = rdd.flatMap(x => x.value().split("\\s+"))
.map(x => ((x, time), 1))
.reduceByKey(_ + _)
.sortBy(_._2, ascending = false)
value
})
numStream.print()
streamingContext.start()
streamingContext.awaitTermination()
}
}
Kafka producer input:
>hello world 0
>hello world 1
>hello world 2
>hello world 3
>hello world 4
>hello world 5
Output:
-------------------------------------------
Time: 1608790906000 ms
-------------------------------------------
((world,20201224 14:21:46),2)
((hello,20201224 14:21:46),2)
((1,20201224 14:21:46),1)
((0,20201224 14:21:46),1)
-------------------------------------------
Time: 1608790908000 ms
-------------------------------------------
((hello,20201224 14:21:48),4)
((world,20201224 14:21:48),4)
((4,20201224 14:21:48),1)
((5,20201224 14:21:48),1)
((3,20201224 14:21:48),1)
((2,20201224 14:21:48),1)
The transform operator works at the RDD level: any functionality that the DStream API itself does not provide can be implemented by hand inside transform. A typical case, joining each batch against a static RDD, is sketched below.
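The following is a minimal sketch rather than part of the original example. It assumes pairStream is a hypothetical DStream[(String, Int)] of (word, count) pairs built from kafkaStream in the same way as numStream above, reuses streamingContext and the imports from that example, and filters out words found in a small static blacklist RDD, an operation the DStream API has no dedicated operator for.
// blacklist is a static RDD of (word, flag) pairs (hypothetical data)
val blacklist: RDD[(String, Boolean)] =
  streamingContext.sparkContext.parallelize(Seq(("spam", true), ("ads", true)))
val filtered: DStream[(String, Int)] = pairStream.transform { rdd =>
  rdd.leftOuterJoin(blacklist)                          // (word, (count, Option[flag]))
    .filter { case (_, (_, flag)) => flag.isEmpty }     // keep only words not on the blacklist
    .map { case (word, (count, _)) => (word, count) }   // back to (word, count)
}
With the usage side covered, let's look at how DStream implements transform under the hood.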
Source code:
/**
* Return a new DStream in which each RDD is generated by applying a function
* on each RDD of 'this' DStream.
*/
def transform[U: ClassTag](transformFunc: RDD[T] => RDD[U]): DStream[U] = ssc.withScope {
// because the DStream is reachable from the outer object here, and because
// DStreams can't be serialized with closures, we can't proactively check
// it for serializability and so we pass the optional false to SparkContext.clean
// Strip out unused references that do not need to be serialized, reducing unnecessary serialization exceptions.
val cleanedF = context.sparkContext.clean(transformFunc, false)
transform((r: RDD[T], _: Time) => cleanedF(r))
}
/**
* Return a new DStream in which each RDD is generated by applying a function
* on each RDD of 'this' DStream.
*/
def transform[U: ClassTag](transformFunc: (RDD[T], Time) => RDD[U]): DStream[U] = ssc.withScope {
// because the DStream is reachable from the outer object here, and because
// DStreams can't be serialized with closures, we can't proactively check
// it for serializability and so we pass the optional false to SparkContext.clean
// Strip out unused references that do not need to be serialized, reducing unnecessary serialization exceptions.
val cleanedF = context.sparkContext.clean(transformFunc, false)
val realTransformFunc = (rdds: Seq[RDD[_]], time: Time) => {
assert(rdds.length == 1)
cleanedF(rdds.head.asInstanceOf[RDD[T]], time)
}
new TransformedDStream[U](Seq(this), realTransformFunc)
}
DStream provides two transform overloads:
def transform[U: ClassTag](transformFunc: RDD[T] => RDD[U]): DStream[U]
def transform[U: ClassTag](transformFunc: (RDD[T], Time) => RDD[U]): DStream[U]
In the first overload, transformFunc receives only the RDD.
In the second, transformFunc receives two parameters: the RDD and the Time, i.e. the batch time.
The first overload ultimately delegates to the second one. Besides access to the RDD itself, the second overload also exposes the batch time, so processing can depend on it, as the sketch below shows.
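A minimal sketch of calling both overloads (hypothetical variable names, with kafkaStream being the input stream from the example above):
// 1) RDD-only overload: the function sees only the batch's RDD
val values: DStream[String] = kafkaStream.transform(rdd => rdd.map(_.value()))
// 2) (RDD, Time) overload: the batch time is passed in as well
val tagged: DStream[(Long, String)] = kafkaStream.transform((rdd, batchTime) =>
  rdd.map(record => (batchTime.milliseconds, record.value()))
)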
What the transform method does:
First, it cleans the closure for serialization.
Next, it wraps the user-provided function with one extra check: the RDD sequence must contain exactly one RDD, otherwise an assertion fails.
Finally, it instantiates a TransformedDStream.
Next, let's look at how TransformedDStream executes.
package org.apache.spark.streaming.dstream
import scala.reflect.ClassTag
import org.apache.spark.SparkException
import org.apache.spark.rdd.RDD
import org.apache.spark.streaming.{Duration, Time}
private[streaming]
class TransformedDStream[U: ClassTag] (
parents: Seq[DStream[_]], // the sequence of parent DStreams; from the caller above we know it holds exactly one element
transformFunc: (Seq[RDD[_]], Time) => RDD[U] // the user-defined function
) extends DStream[U](parents.head.ssc) {
// some sanity checks
require(parents.nonEmpty, "List of DStreams to transform is empty")
require(parents.map(_.ssc).distinct.size == 1, "Some of the DStreams have different contexts")
require(parents.map(_.slideDuration).distinct.size == 1,
"Some of the DStreams have different slide durations")
override def dependencies: List[DStream[_]] = parents.toList
override def slideDuration: Duration = parents.head.slideDuration
// this method holds the execution logic
override def compute(validTime: Time): Option[RDD[U]] = {
// Iterate over the parents; since the sequence holds exactly one DStream, exactly one RDD is taken out.
// Fetch that DStream's RDD for the current batch time; if there is none, throw a SparkException
val parentRDDs = parents.map { parent => parent.getOrCompute(validTime).getOrElse(
// Guard out against parent DStream that return None instead of Some(rdd) to avoid NPE
throw new SparkException(s"Couldn't generate RDD from parent at time $validTime"))
}
// Invoke the user-defined function with the fetched RDD sequence and the batch time.
// Recall the length-1 assertion added in the previous snippet: this is where it runs.
// It guards against the parent sequence containing several DStreams, which would yield several RDDs.
// Looking at the code as a whole, it should never fail unless Spark itself has a bug.
val transformedRDD = transformFunc(parentRDDs, validTime)
// If the returned RDD is null, report an error
if (transformedRDD == null) {
throw new SparkException("Transform function must not return null. " +
"Return SparkContext.emptyRDD() instead to represent no element " +
"as the result of transformation.")
}
// Wrap the RDD in Some and return it
Some(transformedRDD)
}
/**
* Wrap a body of code such that the call site and operation scope
* information are passed to the RDDs created in this body properly.
* This has been overridden to make sure that `displayInnerRDDOps` is always `true`, that is,
* the inner scopes and callsites of RDDs generated in `DStream.transform` are always
* displayed in the UI.
*/
override protected[streaming] def createRDDWithLocalProperties[U](
time: Time,
displayInnerRDDOps: Boolean)(body: => U): U = {
super.createRDDWithLocalProperties(time, displayInnerRDDOps = true)(body)
}
}
The core logic is simply invoking the user-provided function:
val transformedRDD = transformFunc(parentRDDs, validTime)
In short, transform is an enhancement at the RDD level: it conveniently gives developers access to both the RDD and the batch time, so they can process the data based on that information.
Example 2: combining Spark Streaming with Spark SQL
Add the dependency:
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-sql_2.11</artifactId>
<version>2.4.5</version>
</dependency>
Scala implementation:
import org.apache.kafka.clients.consumer.{ConsumerConfig, ConsumerRecord}
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{DataFrame, Row, SQLContext}
import org.apache.spark.streaming.dstream.{DStream, InputDStream}
import org.apache.spark.streaming.kafka010.{ConsumerStrategies, KafkaUtils, LocationStrategies}
import org.apache.spark.streaming.{Seconds, StreamingContext}
object SparkTransform2 {
def main(args: Array[String]): Unit = {
val sparkConf: SparkConf = new SparkConf().setAppName("kafkaDemo2").setMaster("local[1]")
val streamingContext = new StreamingContext(sparkConf, Seconds(2))
streamingContext.checkpoint("checkpoint")
// Kafka configuration parameters
val kafkaParams = Map(
(ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG -> "192.168.237.100:9092"),
(ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG -> "org.apache.kafka.common.serialization.StringDeserializer"),
(ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG -> "org.apache.kafka.common.serialization.StringDeserializer"),
(ConsumerConfig.GROUP_ID_CONFIG -> "kafkaGroup2")
)
val kafkaStream: InputDStream[ConsumerRecord[String, String]] = KafkaUtils.createDirectStream[String, String](
streamingContext,
LocationStrategies.PreferConsistent,
ConsumerStrategies.Subscribe[String, String](Set("sparkKafkaDemo"), kafkaParams)
)
// When the business logic requires reshaping the data, transform does the conversion
val numStream: DStream[Row] = kafkaStream.transform(rdd => {
val sqlContext: SQLContext = SQLContextSingleton.getInstance(rdd.sparkContext)
import sqlContext.implicits._
val words: RDD[String] = rdd.flatMap(_.value().toString.split("\\s+"))
val tuple2RDD: RDD[(String, Int)] = words.map((_, 1))
tuple2RDD.toDF("name", "cn")
.createOrReplaceTempView("tbwordcount") // must not be createTempView(): it throws on later batches because the view already exists
val frame: DataFrame = sqlContext.sql("select name, count(cn) from tbwordcount group by name")
frame.rdd
})
numStream.print()
streamingContext.start()
streamingContext.awaitTermination()
}
}
// Lazily created, JVM-wide singleton SQLContext reused by every batch
object SQLContextSingleton {
@transient private var instance: SQLContext = _
def getInstance(sparkContext: SparkContext): SQLContext = {
synchronized(
if (instance == null) {
instance = new SQLContext(sparkContext)
}
)
instance
}
}
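The SQLContext-based singleton above works, but since Spark 2.0 SparkSession is the recommended entry point. Below is a sketch of the equivalent singleton; the name SparkSessionSingleton and its exact shape are illustrative rather than part of the original code.
import org.apache.spark.SparkConf
import org.apache.spark.sql.SparkSession
object SparkSessionSingleton {
  // lazily created, JVM-wide SparkSession reused by every batch
  @transient private var instance: SparkSession = _
  def getInstance(sparkConf: SparkConf): SparkSession = {
    if (instance == null) {
      instance = SparkSession.builder.config(sparkConf).getOrCreate()
    }
    instance
  }
}
Inside transform it would be obtained with SparkSessionSingleton.getInstance(rdd.sparkContext.getConf) instead of SQLContextSingleton.getInstance(rdd.sparkContext).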
Kafka producer input:
>hello world 0
>hello world 1
>hello world 2
>hello world 3
>hello world 4
>hello world 5
Output:
-------------------------------------------
Time: 1608794042000 ms
-------------------------------------------
[0,1]
[hello,2]
[1,1]
[world,2]
-------------------------------------------
Time: 1608794044000 ms
-------------------------------------------
[3,1]
[hello,4]
[5,1]
[world,4]
[4,1]
[2,1]