Using a window function to accumulate counts over a time period
import kafka.serializer.StringDecoder
import org.apache.spark.SparkConf
import org.apache.spark.streaming.dstream.DStream
import org.apache.spark.streaming.kafka.KafkaUtils
import org.apache.spark.streaming.{Seconds, StreamingContext}

object J_WindowOrderTotalStreaming {
// Batch interval
val STREAMING_BATCH_INTERVAL = Seconds(5)
// Window interval (15s, a multiple of the batch interval)
val STREAMING_WINDOW_INTERVAL = STREAMING_BATCH_INTERVAL * 3
def main(args: Array[String]): Unit = {
val conf = new SparkConf()
.setMaster("local[3]") //为什么启动3个,有一个Thread运行Receiver
.setAppName("J_WindowOrderTotalStreaming")
val ssc: StreamingContext = new StreamingContext(conf, STREAMING_BATCH_INTERVAL)
// Log level
ssc.sparkContext.setLogLevel("WARN")
val kafkaParams: Map[String, String] = Map(
"metadata.broker.list"->"bigdata-hpsk01.huadian.com:9092,bigdata-hpsk01.huadian.com:9093,bigdata-hpsk01.huadian.com:9094",
"auto.offset.reset"->"largest" //读取最新数据
)
val topics: Set[String] = Set("orderTopic")
val kafkaDStream: DStream[String] = KafkaUtils
.createDirectStream[String, String, StringDecoder,StringDecoder](
ssc,
kafkaParams,
topics
).map(_._2) // keep only the value of each Kafka message
// Set the window; with no slide interval given, the slide defaults to the batch interval
val inputDStream = kafkaDStream.window(STREAMING_WINDOW_INTERVAL)
val orderDStream: DStream[(Int, Int)] = inputDStream.transform(rdd => {
rdd
// drop malformed records
.filter(line => line.trim.nonEmpty && line.trim.split(",").length == 3)
// extract (provinceId, 1) pairs: field 1 is the province id
.map(line => {
val splits = line.split(",")
(splits(1).toInt, 1)
})
})
// count orders per province; capture the reduced stream rather than discarding it
val orderCountDStream = orderDStream.reduceByKey(_ + _)
orderCountDStream.print()
// start the streaming application
ssc.start() // begin consuming data from the source
// Once started, a streaming app normally never stops on its own; it ends only on failure or forced termination
ssc.awaitTermination() // wait for the computation to terminate
}
}
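For reference, DStream.window also has a two-argument overload that sets the slide interval explicitly; a minimal sketch against the same kafkaDStream (the 10-second slide is an illustrative value):

// One-argument form: the slide defaults to the batch interval (5s here),
// so every batch recomputes over the most recent 15 seconds.
val windowedDefault = kafkaDStream.window(STREAMING_WINDOW_INTERVAL)
// Two-argument form: compute every 10 seconds over the most recent 15 seconds.
// Both durations must be multiples of the batch interval.
val windowedExplicit = kafkaDStream.window(STREAMING_WINDOW_INTERVAL, Seconds(10))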
Windowed aggregation ➡ reduceByKeyAndWindow, without a checkpoint
object K_WindowOrderTotalStreaming {
// Batch interval
val STREAMING_BATCH_INTERVAL = Seconds(5)
// Window interval (15s)
val STREAMING_WINDOW_INTERVAL = STREAMING_BATCH_INTERVAL * 3
// Slide interval (10s)
val STREAMING_SLIDER_INTERVAL = STREAMING_BATCH_INTERVAL * 2
def main(args: Array[String]): Unit = {
val conf = new SparkConf()
.setMaster("local[3]") //为什么启动3个,有一个Thread运行Receiver
.setAppName("J_WindowOrderTotalStreaming")
val ssc: StreamingContext = new StreamingContext(conf, STREAMING_BATCH_INTERVAL)
// Log level
ssc.sparkContext.setLogLevel("WARN")
val kafkaParams: Map[String, String] = Map(
"metadata.broker.list"->"bigdata-hpsk01.huadian.com:9092,bigdata-hpsk01.huadian.com:9093,bigdata-hpsk01.huadian.com:9094",
"auto.offset.reset"->"largest" //读取最新数据
)
val topics: Set[String] = Set("orderTopic")
val kafkaDStream: DStream[String] = KafkaUtils
.createDirectStream[String, String, StringDecoder,StringDecoder](
ssc,
kafkaParams,
topics
).map(_._2) // keep only the value of each Kafka message
// parse the stream into (provinceId, 1) pairs; the window itself is applied below by reduceByKeyAndWindow
val orderDStream: DStream[(Int, Int)] = kafkaDStream.transform(rdd => {
rdd
// drop malformed records
.filter(line => line.trim.nonEmpty && line.trim.split(",").length == 3)
// extract (provinceId, 1) pairs: field 1 is the province id
.map(line => {
val splits = line.split(",")
(splits(1).toInt, 1)
})
})
/**
* reduceByKeyAndWindow = window + reduceByKey
* def reduceByKeyAndWindow(
* reduceFunc: (V, V) => V,
* windowDuration: Duration,
* slideDuration: Duration
* ): DStream[(K, V)]
*/
// count orders per province over the window
val orderCountDStream = orderDStream.reduceByKeyAndWindow(
(v1: Int, v2: Int) => v1 + v2,
STREAMING_WINDOW_INTERVAL,
STREAMING_SLIDER_INTERVAL
)
orderCountDStream.print()
// start the streaming application
ssc.start() // begin consuming data from the source
// Once started, a streaming app normally never stops on its own; it ends only on failure or forced termination
ssc.awaitTermination() // wait for the computation to terminate
}
}
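As the doc comment above notes, reduceByKeyAndWindow is just window followed by reduceByKey; a minimal sketch of the same count written the long way:

// Equivalent to the reduceByKeyAndWindow call above: window first, then reduce.
// Every 10 seconds this recomputes the per-province sum over the full 15-second window,
// which is exactly the recomputation the inverse-function variant in the next section avoids.
val equivalentDStream = orderDStream
.window(STREAMING_WINDOW_INTERVAL, STREAMING_SLIDER_INTERVAL)
.reduceByKey(_ + _)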
Performance optimization: reduceByKeyAndWindow with a checkpoint, accumulating incrementally over the window
Optimization points:
(1) The KafkaUtils direct approach increases read parallelism (one RDD partition per Kafka topic partition).
(2) With the batch and slide intervals set, the part of the window that overlaps the previous one is reused rather than recomputed: batches entering the window are added with the reduce function, and batches leaving it are removed with the inverse function (see the worked example after this list).
(v1: Int, v2: Int) => v1 + v2, // add entering batches
(v1: Int, v2: Int) => v1 - v2, // subtract leaving batches
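A concrete trace of the incremental update, assuming a single province and illustrative per-batch counts:

// Batch interval 5s, window 15s (3 batches), slide 10s (2 batches).
// Per-batch counts for one province: b1 = 4, b2 = 7, b3 = 2, b4 = 5, b5 = 3
// Window W1 covers (b1, b2, b3) and is computed directly.
val w1 = 4 + 7 + 2 // = 13
// Window W2 covers (b3, b4, b5); only the non-overlapping batches are touched:
// W2 = W1 + (entering: b4 + b5) - (leaving: b1 + b2)
val w2 = w1 + (5 + 3) - (4 + 7) // = 10, same as 2 + 5 + 3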
object L_TrendOrderTotalStreaming {
// Batch interval
val STREAMING_BATCH_INTERVAL = Seconds(5)
// Window interval (15s)
val STREAMING_WINDOW_INTERVAL = STREAMING_BATCH_INTERVAL * 3
// Slide interval (10s)
val STREAMING_SLIDER_INTERVAL = STREAMING_BATCH_INTERVAL * 2
def main(args: Array[String]): Unit = {
val conf = new SparkConf()
.setMaster("local[3]") //为什么启动3个,有一个Thread运行Receiver
.setAppName("J_WindowOrderTotalStreaming")
val ssc: StreamingContext = new StreamingContext(conf, STREAMING_BATCH_INTERVAL)
// Log level
ssc.sparkContext.setLogLevel("WARN")
ssc.checkpoint("chkpt-trend-1000") // checkpointing is required by the inverse-function overload used below
val kafkaParams: Map[String, String] = Map(
"metadata.broker.list"->"bigdata-hpsk01.huadian.com:9092,bigdata-hpsk01.huadian.com:9093,bigdata-hpsk01.huadian.com:9094",
"auto.offset.reset"->"largest" //读取最新数据
)
val topics: Set[String] = Set("orderTopic")
val kafkaDStream: DStream[String] = KafkaUtils
.createDirectStream[String, String, StringDecoder,StringDecoder](
ssc,
kafkaParams,
topics
).map(_._2) // keep only the value of each Kafka message
// parse the stream into (provinceId, 1) pairs; the window itself is applied below by reduceByKeyAndWindow
val orderDStream: DStream[(Int, Int)] = kafkaDStream.transform(rdd => {
rdd
// drop malformed records
.filter(line => line.trim.nonEmpty && line.trim.split(",").length == 3)
// extract (provinceId, 1) pairs: field 1 is the province id
.map(line => {
val splits = line.split(",")
(splits(1).toInt, 1)
})
})
/**
* def reduceByKeyAndWindow(
*   reduceFunc: (V, V) => V,
*   invReduceFunc: (V, V) => V,
*   windowDuration: Duration,
*   slideDuration: Duration,
*   partitioner: Partitioner,
*   filterFunc: ((K, V)) => Boolean
* ): DStream[(K, V)]
*
* The call below uses the shorter overload without a partitioner or filter function.
*/
// count orders per province, updating each window incrementally
val orderCountDStream = orderDStream.reduceByKeyAndWindow(
(v1: Int, v2: Int) => v1 + v2, // add batches entering the window
(v1: Int, v2: Int) => v1 - v2, // subtract batches leaving the window
STREAMING_WINDOW_INTERVAL,
STREAMING_SLIDER_INTERVAL
)
orderCountDStream.print()
// start the streaming application
ssc.start() // begin consuming data from the source
// Once started, a streaming app normally never stops on its own; it ends only on failure or forced termination
ssc.awaitTermination() // wait for the computation to terminate
}
}
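One caveat with the inverse-function variant: a key whose windowed count drops to 0 lingers in the checkpointed state unless it is filtered out. A minimal sketch using the overload that also takes a partition count and a filter function (the numPartitions value of 3 is illustrative):

// Same counting logic, but provinces whose count falls to 0 are dropped from state.
val orderCountFiltered = orderDStream.reduceByKeyAndWindow(
(v1: Int, v2: Int) => v1 + v2, // add entering batches
(v1: Int, v2: Int) => v1 - v2, // subtract leaving batches
STREAMING_WINDOW_INTERVAL,
STREAMING_SLIDER_INTERVAL,
numPartitions = 3, // illustrative parallelism
filterFunc = (kv: (Int, Int)) => kv._2 > 0 // keep only provinces with a positive count
)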