package streaming
import java.text.SimpleDateFormat
import java.util.Date
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.{SparkConf, SparkContext}
/**
 * Demonstrates `DStream.window` on the input stream supplied by `QueueMaker`.
 *
 * The question this example explores: a DStream's `slideDuration` is the period at which
 * it produces RDDs, and `window` takes both a `windowDuration` and a `slideDuration` --
 * so which of the two decides how often the windowed stream emits an RDD? It is the
 * `slideDuration` argument; `windowDuration` only decides how much history each emitted
 * RDD covers.
 */
object Windowing {

  /**
   * Runs the demo: starts a one-second-batch streaming context, prints statistics for a
   * five-second window that slides every two seconds, and shuts the context down from a
   * helper thread once the queue has been populated and 20 seconds have passed.
   *
   * @param args command-line arguments (unused)
   */
  def main(args: Array[String]): Unit = {
    // Local import keeps this block self-contained.
    import scala.util.control.NonFatal

    val conf = new SparkConf().setAppName("Windowing").setMaster("local[4]")
    val sc = new SparkContext(conf)
    // Batch interval: the input stream produces one RDD every second.
    val ssc = new StreamingContext(sc, Seconds(1))
    val qm = new QueueMaker(sc, ssc)
    // Input stream provided by QueueMaker (defined elsewhere in the project).
    val stream = qm.inputStream

    /*
     * window(windowDuration, slideDuration) returns a new DStream in which each RDD
     * contains all elements seen in a sliding time window over the parent DStream:
     *
     *   - windowDuration: width of the window
     *   - slideDuration:  sliding interval, i.e. how often a windowed RDD is emitted
     *
     * Both values must be integer multiples of the batch interval passed to
     * `new StreamingContext(sc, Seconds(1))`. When only windowDuration is supplied, the
     * sliding interval defaults to the parent stream's slide duration (here the
     * one-second batch interval), so the windowed stream emits at the parent's rate.
     *
     * How one windowed RDD is built (WindowedDStream.compute):
     *
     *   val currentWindow =
     *     new Interval(validTime - windowDuration + parent.slideDuration, validTime)
     *   val rddsInWindow = parent.slice(currentWindow)
     *   Some(ssc.sc.union(rddsInWindow))
     *
     * For window(Seconds(5), Seconds(2)): validTime is the end timestamp of the data,
     * windowDuration is 5s and parent.slideDuration is 1s, so the interval runs from
     * validTime - 4s to validTime. Because the batch at validTime itself is included,
     * that interval covers five one-second batches; parent.slice selects them and they
     * are unioned into a single RDD.
     *
     * Resulting flow:
     *   1. every second, one batch is read from QueueMaker's stream;
     *   2. every 2 seconds, the records of the most recent 5 seconds are emitted.
     */
    stream.window(Seconds(5), Seconds(2)).foreachRDD { rdd =>
      // Wall-clock time at which this windowed batch is processed.
      println(new SimpleDateFormat("yyyyMMddHHmmss").format(new Date()))
      // count() launches a Spark job, so evaluate it once and reuse the result.
      val count = rdd.count()
      if (count == 0L) println("Empty")
      else println(s"Count = $count min = ${rdd.min()} max = ${rdd.max()}")
    }

    // start streaming
    ssc.start()

    new Thread("Delayed Termination") {
      override def run(): Unit = {
        try {
          // Presumably feeds the input queue -- QueueMaker is defined elsewhere.
          qm.populateQueue()
          Thread.sleep(20000)
        } finally {
          // Always stop the context, even if populating the queue fails; otherwise
          // awaitTermination() on the main thread would block forever.
          println("*** stopping streaming")
          ssc.stop()
        }
      }
    }.start()

    try {
      ssc.awaitTermination()
      println("*** streaming terminated")
    } catch {
      // NonFatal lets fatal JVM errors and interrupts propagate; everything else is
      // reported together with its cause instead of being silently discarded.
      case NonFatal(e) =>
        println("*** streaming exception caught in monitor thread")
        e.printStackTrace()
    }
  }
}
// GitHub project LearningSpark code walkthrough (part 9)
// Latest recommended article published on 2024-07-20 01:01:27