The reduceByWindow window operation in Spark Streaming:
Sum the numbers in the current 10-second window, sliding every 5 seconds. The data received every 5 seconds looks like the following (the sections below use the same data):
1
1
2
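Assuming each 5-second batch delivers exactly the three values above, a 10-second window sliding every 5 seconds covers two batches once it has filled, so the summing job below would print 8 (two batches of 1 + 1 + 2 = 4 each); the very first, partially filled window would print 4. This is only an illustration of the window arithmetic, not output captured from the running job.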
package spark.streaming.sparkStreaming.havaState

import kafka.serializer.StringDecoder
import org.apache.spark.SparkConf
import org.apache.spark.streaming.dstream.InputDStream
import org.apache.spark.streaming.kafka.KafkaUtils
import org.apache.spark.streaming.{Seconds, StreamingContext}

/**
  * Created by MK on 2017/8/21.
  */
object reduceByWindow {
  def main(args: Array[String]): Unit = {
    val appName = "SparkStreaming"
    val conf = new SparkConf().setAppName(appName).setMaster("local[2]")
    val ssc: StreamingContext = new StreamingContext(conf, Seconds(5))
    ssc.checkpoint("C:\\Users\\MK\\Desktop\\tmp\\check")

    // Kafka parameters (low-level / direct API configuration)
    val kafkaParam = Map[String, String](
      "zookeeper.connect" -> "sxjdb03:2181,sxjdb01:2181", // ZooKeeper quorum (could also be taken from args(0))
      "metadata.broker.list" -> "sxjdb01:9092",
      "group.id" -> "gtTest1", // consumer group id
      "auto.offset.reset" -> kafka.api.OffsetRequest.LargestTimeString, // start reading from the latest offset of the topic
      "client.id" -> "gtTest1",
      "zookeeper.connection.timeout.ms" -> "10000"
    )
    val topicSet = Set("t_mk")
    val directKafka: InputDStream[(String, String)] = KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder](ssc, kafkaParam, topicSet)
    val windowed = directKafka.map(x => x._2.toInt)

    // Every 5 seconds, sum the data of the last 10 seconds
    val result = windowed.reduceByWindow({ (x, y) => x + y }, Seconds(10), Seconds(5))

    /**
      * When there is no data this does not fail; it simply keeps waiting, so no empty-batch check is needed.
      **/
    result.print()

    ssc.start()
    ssc.awaitTermination()
  }
}
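Because the 10-second window and the 5-second slide overlap, each batch is re-summed on every slide. reduceByWindow also has an overload that takes an inverse function, letting Spark incrementally add the batch that enters the window and subtract the one that leaves it. A minimal sketch of that variant, reusing the windowed stream above (it relies on the checkpoint directory that is already set):

// Incremental window sum: add entering batches, subtract leaving ones (needs checkpointing).
val incremental = windowed.reduceByWindow(
  (x, y) => x + y, // reduce: combine values inside the window
  (x, y) => x - y, // inverse reduce: remove values that slid out of the window
  Seconds(10),     // window length
  Seconds(5)       // slide interval
)
incremental.print()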
The reduceByKeyAndWindow window operation in Spark Streaming:
Aggregate the data of the current 10-second window by key; here each number is mapped to (number, 1), so the output is the per-number count within the window.
package spark.streaming.sparkStreaming.havaState

import kafka.serializer.StringDecoder
import org.apache.spark.SparkConf
import org.apache.spark.streaming.dstream.InputDStream
import org.apache.spark.streaming.kafka.KafkaUtils
import org.apache.spark.streaming.{Seconds, StreamingContext}

/**
  * Created by MK on 2017/8/21.
  */
object reduceByKeyAndWindow {
  def main(args: Array[String]): Unit = {
    val appName = "SparkStreaming"
    val conf = new SparkConf().setAppName(appName).setMaster("local[2]")
    val ssc: StreamingContext = new StreamingContext(conf, Seconds(5))
    ssc.checkpoint("C:\\Users\\MK\\Desktop\\tmp\\check")

    // Kafka parameters (low-level / direct API configuration)
    val kafkaParam = Map[String, String](
      "zookeeper.connect" -> "sxjdb03:2181,sxjdb01:2181", // ZooKeeper quorum (could also be taken from args(0))
      "metadata.broker.list" -> "sxjdb01:9092",
      "group.id" -> "gtTest1", // consumer group id
      "auto.offset.reset" -> kafka.api.OffsetRequest.LargestTimeString, // start reading from the latest offset of the topic
      "client.id" -> "gtTest1",
      "zookeeper.connection.timeout.ms" -> "10000"
    )
    val topicSet = Set("t_mk")
    val directKafka: InputDStream[(String, String)] = KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder](ssc, kafkaParam, topicSet)
    val windowed = directKafka.map(x => x._2.toInt)

    // Every 5 seconds, aggregate the data of the last 10 seconds per key
    val result = windowed.map((_, 1)).reduceByKeyAndWindow((a: Int, b: Int) => a + b, Seconds(10), Seconds(5))
    // .reduceByKeyAndWindow(_+_,Seconds(10),Seconds(5))

    /**
      * When there is no data this does not fail; it simply keeps waiting, so no empty-batch check is needed.
      **/
    result.print()

    ssc.start()
    ssc.awaitTermination()
  }
}
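reduceByKeyAndWindow has the same kind of incremental overload: with an inverse function it only adds the new batch and subtracts the expired one per key, instead of re-reducing the whole window on every slide. A minimal sketch on the same stream (checkpointing must be enabled, as it already is above):

// Incremental per-key window count; the inverse function removes counts of batches that left the window.
val incrementalByKey = windowed.map((_, 1))
  .reduceByKeyAndWindow(
    (a: Int, b: Int) => a + b, // add counts entering the window
    (a: Int, b: Int) => a - b, // subtract counts leaving the window
    Seconds(10),
    Seconds(5)
  )
incrementalByKey.print()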
The countByWindow window operation in Spark Streaming:
Count the number of elements in the current 10-second window.
package spark.streaming.sparkStreaming.havaState

import kafka.serializer.StringDecoder
import org.apache.spark.SparkConf
import org.apache.spark.streaming.dstream.InputDStream
import org.apache.spark.streaming.kafka.KafkaUtils
import org.apache.spark.streaming.{Seconds, StreamingContext}

/**
  * Created by MK on 2017/8/21.
  * Stateful operation countByWindow: count the number of elements in the current 10-second window.
  */
object countByWindowSparkStreaming {
  def main(args: Array[String]): Unit = {
    val appName = "SparkStreaming"
    val conf = new SparkConf().setAppName(appName).setMaster("local[2]")
    val ssc: StreamingContext = new StreamingContext(conf, Seconds(5))
    ssc.checkpoint("C:\\Users\\MK\\Desktop\\tmp\\check")

    // Kafka parameters (low-level / direct API configuration)
    val kafkaParam = Map[String, String](
      "zookeeper.connect" -> "sxjdb03:2181,sxjdb01:2181", // ZooKeeper quorum (could also be taken from args(0))
      "metadata.broker.list" -> "sxjdb01:9092",
      "group.id" -> "gtTest1", // consumer group id
      "auto.offset.reset" -> kafka.api.OffsetRequest.LargestTimeString, // start reading from the latest offset of the topic
      "client.id" -> "gtTest1",
      "zookeeper.connection.timeout.ms" -> "10000"
    )
    val topicSet = Set("test.kakouAlarm01")
    val directKafka: InputDStream[(String, String)] = KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder](ssc, kafkaParam, topicSet)
    val windowed = directKafka.countByWindow(Seconds(10), Seconds(5))

    /**
      * When there is no data this does not fail; it simply keeps waiting, so no empty-batch check is needed.
      **/
    windowed.print()

    ssc.start()
    ssc.awaitTermination()
  }
}
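countByWindow is just a convenience for counting every record in the window; writing it out by hand makes the window mechanics visible. A rough equivalent sketch (not the library's exact internal code):

// Roughly what countByWindow(Seconds(10), Seconds(5)) does:
// map every record to 1L, then keep an incremental sum over the window.
val explicitCount = directKafka
  .map(_ => 1L)
  .reduceByWindow(_ + _, _ - _, Seconds(10), Seconds(5))
explicitCount.print()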
The countByValueAndWindow window operation in Spark Streaming:
Count, for each distinct value, how many times it appears in the current 10-second window (the batch interval is the 5 seconds set by new StreamingContext(conf, Seconds(5))).
package spark.streaming.sparkStreaming.havaState

import kafka.serializer.StringDecoder
import org.apache.spark.SparkConf
import org.apache.spark.streaming.dstream.InputDStream
import org.apache.spark.streaming.kafka.KafkaUtils
import org.apache.spark.streaming.{Seconds, StreamingContext}

/**
  * Created by MK on 2017/8/21.
  * Stateful operation countByValueAndWindow: count, for each distinct value, its occurrences in the current
  * 10-second window (the batch interval is the 5 seconds set by new StreamingContext(conf, Seconds(5))).
  */
object countByValueAndWindowSparkStreaming {
  def main(args: Array[String]): Unit = {
    val appName = "SparkStreaming"
    val conf = new SparkConf().setAppName(appName).setMaster("local[2]")
    val ssc: StreamingContext = new StreamingContext(conf, Seconds(5))
    ssc.checkpoint("C:\\Users\\MK\\Desktop\\tmp\\checks")

    // Kafka parameters (low-level / direct API configuration)
    val kafkaParam = Map[String, String](
      "zookeeper.connect" -> "sxjdb03:2181,sxjdb01:2181", // ZooKeeper quorum (could also be taken from args(0))
      "metadata.broker.list" -> "sxjdb01:9092",
      "group.id" -> "gtTest1", // consumer group id
      "auto.offset.reset" -> kafka.api.OffsetRequest.LargestTimeString, // start reading from the latest offset of the topic
      "client.id" -> "gtTest1",
      "zookeeper.connection.timeout.ms" -> "10000"
    )
    val topicSet = Set("t_mk")
    val directKafka: InputDStream[(String, String)] = KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder](ssc, kafkaParam, topicSet)
    val windowed = directKafka.countByValueAndWindow(Seconds(10), Seconds(5))

    /**
      * When there is no data this does not fail; it simply keeps waiting, so no empty-batch check is needed.
      **/
    windowed.print()

    ssc.start()
    ssc.awaitTermination()
  }
}
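countByValueAndWindow emits one (value, count) pair per distinct element in the window, so with the sample data it reports how many times each number arrived in the last 10 seconds. A rough hand-rolled equivalent on the parsed numbers, useful for seeing what it does (only a sketch; the built-in operates on the raw (key, message) pairs of directKafka and also accepts an optional numPartitions argument):

// Roughly what countByValueAndWindow does: count occurrences of each distinct value in the window.
val explicitCountByValue = directKafka
  .map(x => (x._2.toInt, 1L))
  .reduceByKeyAndWindow((a: Long, b: Long) => a + b, (a: Long, b: Long) => a - b, Seconds(10), Seconds(5))
explicitCountByValue.print()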
UpdateStateByKey:
Reduce the data in the DStream by key, then accumulate the results across batches (this requires supplying an update function).
package spark.streaming.sparkStreaming.havaState

import kafka.serializer.StringDecoder
import org.apache.spark.SparkConf
import org.apache.spark.rdd.RDD
import org.apache.spark.streaming.dstream.{DStream, InputDStream}
import org.apache.spark.streaming.kafka.KafkaUtils
import org.apache.spark.streaming.{Seconds, StreamingContext}

/**
  * Created by MK on 2017/8/21.
  */
object updateStateByKey {
  def main(args: Array[String]): Unit = {
    val appName = "SparkStreaming"
    val conf = new SparkConf().setAppName(appName).setMaster("local[2]")
    val ssc: StreamingContext = new StreamingContext(conf, Seconds(5))
    ssc.checkpoint("C:\\Users\\MK\\Desktop\\tmp\\check")

    // Kafka parameters (low-level / direct API configuration)
    val kafkaParam = Map[String, String](
      "zookeeper.connect" -> "sxjdb03:2181,sxjdb01:2181", // ZooKeeper quorum (could also be taken from args(0))
      "metadata.broker.list" -> "sxjdb01:9092",
      "group.id" -> "gtTest1", // consumer group id
      "auto.offset.reset" -> kafka.api.OffsetRequest.LargestTimeString, // start reading from the latest offset of the topic
      "client.id" -> "gtTest1",
      "zookeeper.connection.timeout.ms" -> "10000"
    )
    val topicSet = Set("t_mk")
    val directKafka: InputDStream[(String, String)] = KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder](ssc, kafkaParam, topicSet)
    val windowed = directKafka.map(x => x._2.toInt)

    // Map each number to a (number, 1) pair; updateStateByKey below accumulates these counts across all batches
    val result: DStream[(Int, Int)] = windowed.map((_, 1))

    /**
      * updateStateByKey: reduce the data in the DStream by key, then accumulate across batches
      * using the update function below.
      **/
    val addFunc = (currValues: Seq[Int], prevValueState: Option[Int]) => {
      // Spark has already grouped this batch by key; currValues holds this key's values for the
      // current batch, so summing them gives the batch total.
      val currentCount = currValues.sum
      // The value accumulated so far (0 if the key has not been seen before).
      val previousCount = prevValueState.getOrElse(0)
      // Return the accumulated result as an Option[Int].
      Some(currentCount + previousCount)
    }
    val s: DStream[(Int, Int)] = result.updateStateByKey(addFunc)
    s.print()

    ssc.start()
    ssc.awaitTermination()
  }
}
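Since Spark 1.6, mapWithState is the usual replacement for updateStateByKey: it only invokes the update function for keys that actually received data in the batch, rather than for every key held in state. A minimal sketch of the same running count on the result stream above (assumes a Spark version that ships mapWithState; the state spec and output shape here are illustrative):

import org.apache.spark.streaming.{State, StateSpec}

// Running count per key; only keys present in the current batch are touched.
val spec = StateSpec.function((key: Int, value: Option[Int], state: State[Int]) => {
  val newCount = state.getOption.getOrElse(0) + value.getOrElse(0)
  state.update(newCount) // persist the new running count for this key
  (key, newCount)        // record emitted for this batch
})
val counts = result.mapWithState(spec)
counts.print()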