Big Data - Spark Streaming (Part 3)
Advanced Transformation Operators
- updateStateByKey
Requirement: Spark Streaming receives socket data and accumulates the word counts across all batches.
package com.kaikeba.streaming
import org.apache.spark.SparkConf
import org.apache.spark.streaming.dstream.{DStream, ReceiverInputDStream}
import org.apache.spark.streaming.{Seconds, StreamingContext}
/**
 * Accumulate the word counts across all batches
 */
object UpdateStateBykeyWordCount {

  def main(args: Array[String]): Unit = {
    // todo: 1. Create the SparkConf object
    val sparkConf: SparkConf = new SparkConf()
      .setAppName("TcpWordCount")
      .setMaster("local[2]")

    // todo: 2. Create the StreamingContext object
    val ssc = new StreamingContext(sparkConf, Seconds(2))

    // A checkpoint directory is required to save the results of previous batches; it usually points to an HDFS path
    ssc.checkpoint("hdfs://node01:8020/ck")

    // todo: 3. Receive the socket data
    val socketTextStream: ReceiverInputDStream[String] = ssc.socketTextStream("node01", 9999)

    // todo: 4. Process the data
    val wordAndOneDstream: DStream[(String, Int)] = socketTextStream.flatMap(_.split(" ")).map((_, 1))
    val result: DStream[(String, Int)] = wordAndOneDstream.updateStateByKey(updateFunc)

    // todo: 5. Print the results
    result.print()

    // todo: 6. Start the streaming computation
    ssc.start()
    ssc.awaitTermination()
  }

  // currentValue: all the 1s for each word that appears in the current batch
  // historyValues: the accumulated count of each word from previous batches; Option indicates whether a value exists -- Some means there is a value, None means there is not
  def updateFunc(currentValue: Seq[Int], historyValues: Option[Int]): Option[Int] = {
    val newValue: Int = currentValue.sum + historyValues.getOrElse(0)
    Some(newValue)
  }
}
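To test the job, run nc -lk 9999 on node01 and type words into the console; each 2-second batch will then print the running totals. Because updateStateByKey depends on the checkpoint directory, the same directory can also be used to rebuild the driver after a restart. Below is a minimal, hypothetical recovery sketch (RecoverableWordCount is not part of the original example; it reuses the node01 host, port 9999 and HDFS checkpoint path from above):
package com.kaikeba.streaming

import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}

/**
 * Hypothetical sketch: rebuild the StreamingContext from the checkpoint directory
 * if one exists, otherwise create a fresh one.
 */
object RecoverableWordCount {

  def createContext(): StreamingContext = {
    val sparkConf = new SparkConf().setAppName("RecoverableWordCount").setMaster("local[2]")
    val ssc = new StreamingContext(sparkConf, Seconds(2))
    ssc.checkpoint("hdfs://node01:8020/ck")

    // Same pipeline as above: accumulate word counts across batches
    def updateFunc(current: Seq[Int], history: Option[Int]): Option[Int] =
      Some(current.sum + history.getOrElse(0))

    ssc.socketTextStream("node01", 9999)
      .flatMap(_.split(" "))
      .map((_, 1))
      .updateStateByKey(updateFunc)
      .print()

    ssc
  }

  def main(args: Array[String]): Unit = {
    // getOrCreate restores the context (and its state) from the checkpoint if present
    val ssc = StreamingContext.getOrCreate("hdfs://node01:8020/ck", createContext _)
    ssc.start()
    ssc.awaitTermination()
  }
}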
- mapWithState
Requirement: Spark Streaming receives socket data and accumulates the word counts across all batches.
package com.kaikeba.streaming
import org.apache.log4j.{Level, Logger}
import org.apache.spark.SparkConf
import org.apache.spark.rdd.RDD
import org.apache.spark.streaming.dstream.{DStream, MapWithStateDStream, ReceiverInputDStream}
import org.apache.spark.streaming._
/**
 * Use mapWithState to accumulate the word counts across all batches
 * -- better performance than updateStateByKey
 */
object MapWithStateWordCount {

  def main(args: Array[String]): Unit = {
    Logger.getLogger("org").setLevel(Level.ERROR)

    // todo: 1. Create the SparkConf object
    val sparkConf: SparkConf = new SparkConf().setAppName("MapWithStateWordCount").setMaster("local[2]")

    // todo: 2. Create the StreamingContext object
    val ssc = new StreamingContext(sparkConf, Seconds(2))

    // Initial state: seed counts for some keys
    val initRDD: RDD[(String, Int)] = ssc.sparkContext.parallelize(List(("hadoop", 10), ("spark", 20)))

    // A checkpoint directory is required to save the results of previous batches; it usually points to an HDFS path
    ssc.checkpoint("hdfs://node01:8020/ck")

    // todo: 3. Receive the socket data
    val socketTextStream: ReceiverInputDStream[String] = ssc.socketTextStream("node01", 9999)

    // todo: 4. Process the data
    val wordAndOneDstream: DStream[(String, Int)] = socketTextStream.flatMap(_.split(" ")).map((_, 1))

    val stateSpec = StateSpec.function((time: Time, key: String, currentValue: Option[Int], historyState: State[Int]) => {
      // Add the current batch result to the accumulated result of previous batches
      val sumValue: Int = currentValue.getOrElse(0) + historyState.getOption().getOrElse(0)
      val output = (key, sumValue)
      if (!historyState.isTimingOut()) {
        historyState.update(sumValue)
      }
      Some(output)
      // initialState: seed the state with the results in initRDD
      // timeout: if a key receives no data within this duration, the key and its state are removed
    }).initialState(initRDD).timeout(Durations.seconds(5))

    // Use the mapWithState method to do the accumulation
    val result: MapWithStateDStream[String, Int, Int, (String, Int)] = wordAndOneDstream.mapWithState(stateSpec)

    // todo: 5. Print the accumulated results across all batches
    result.stateSnapshots().print()

    // todo: 6. Start the streaming computation
    ssc.start()
    ssc.awaitTermination()
  }
}
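One detail worth noting: printing result directly would only show the records produced by the mapping function, i.e. the keys that actually received data in the current batch, whereas stateSnapshots() emits the accumulated state of every key (including the initRDD seeds) on each batch interval. Inside the job above the two outputs could be compared like this (a fragment of the program above, not a standalone example):
// Only the keys updated in the current batch (output of the mapping function)
result.print()
// Snapshot of the accumulated state of every key, emitted every batch interval
result.stateSnapshots().print()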
- transform
Requirement: get the top 3 words by occurrence count within each batch.
package com.kaikeba.streaming
import org.apache.log4j.{Level, Logger}
import org.apache.spark.SparkConf
import org.apache.spark.rdd.RDD
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.streaming.dstream.{DStream, ReceiverInputDStream}
/**
 * Get the top 3 words by occurrence count within each batch
 */
object TransformWordCount {

  def main(args: Array[String]): Unit = {
    Logger.getLogger("org").setLevel(Level.ERROR)

    // todo: 1. Create the SparkConf object
    val sparkConf: SparkConf = new SparkConf().setAppName("TransformWordCount").setMaster("local[2]")

    // todo: 2. Create the StreamingContext object
    val ssc = new StreamingContext(sparkConf, Seconds(2))

    // todo: 3. Receive the socket data
    val socketTextStream: ReceiverInputDStream[String] = ssc.socketTextStream("node01", 9999)

    // todo: 4. Process the data
    val result: DStream[(String, Int)] = socketTextStream.flatMap(_.split(" ")).map((_, 1)).reduceByKey(_ + _)

    // todo: 5. Apply the transform method to the DStream
    val sortedDstream: DStream[(String, Int)] = result.transform(rdd => {
      // Sort the words by occurrence count in descending order
      val sortedRDD: RDD[(String, Int)] = rdd.sortBy(_._2, false)
      val top3: Array[(String, Int)] = sortedRDD.take(3)
      println("------------top3----------start")
      top3.foreach(println)
      println("------------top3------------end")
      sortedRDD
    })

    // todo: 6. Print all words of this batch in descending order of count
    sortedDstream.print()

    // todo: 7. Start the streaming computation
    ssc.start()
    ssc.awaitTermination()
  }
}
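Besides sorting, transform hands you the raw RDD of every batch, so any RDD-to-RDD operation can be applied, for example joining against a static dataset. The sketch below is only an illustration and not part of the original example: the blacklist contents are made up, and the node01 host and port 9999 are reused from above. It filters blacklisted words out of each batch before counting:
package com.kaikeba.streaming

import org.apache.spark.SparkConf
import org.apache.spark.rdd.RDD
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.streaming.dstream.DStream

/**
 * Hypothetical sketch: use transform to filter each batch against a static blacklist RDD.
 */
object TransformBlacklistFilter {

  def main(args: Array[String]): Unit = {
    val sparkConf = new SparkConf().setAppName("TransformBlacklistFilter").setMaster("local[2]")
    val ssc = new StreamingContext(sparkConf, Seconds(2))

    // Static blacklist: (word, true); the contents are made up for illustration
    val blacklistRDD: RDD[(String, Boolean)] = ssc.sparkContext.parallelize(List(("foo", true), ("bar", true)))

    val words: DStream[(String, Int)] = ssc.socketTextStream("node01", 9999)
      .flatMap(_.split(" "))
      .map((_, 1))

    // Left-outer-join every batch RDD with the blacklist and keep only the words that are not blacklisted
    val filtered: DStream[(String, Int)] = words.transform { rdd =>
      rdd.leftOuterJoin(blacklistRDD)
        .filter { case (_, (_, isBlacklisted)) => !isBlacklisted.getOrElse(false) }
        .map { case (word, (count, _)) => (word, count) }
    }

    filtered.reduceByKey(_ + _).print()

    ssc.start()
    ssc.awaitTermination()
  }
}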
- Window operations
Requirement: every 4 seconds, compute statistics over the most recent 6 seconds of data.
package com.kaikeba.streaming
import org.apache.log4j.{Level, Logger}
import org.apache.spark.SparkConf
import org.apache.spark.rdd.RDD
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.streaming.dstream.{DStream, ReceiverInputDStream}
/**
 * todo: every 4 seconds, compute statistics over the most recent 6 seconds of data
 */
object ReduceByKeyAndWindowWordCount {

  def main(args: Array[String]): Unit = {
    Logger.getLogger("org").setLevel(Level.ERROR)

    // todo: 1. Create the SparkConf object
    val sparkConf: SparkConf = new SparkConf().setAppName("ReduceByKeyAndWindowWordCount").setMaster("local[2]")

    // todo: 2. Create the StreamingContext object
    val ssc = new StreamingContext(sparkConf, Seconds(2))

    // todo: 3. Receive the socket data
    val socketTextStream: ReceiverInputDStream[String] = ssc.socketTextStream("node01", 9999)

    // todo: 4. Process the data
    val result: DStream[(String, Int)] = socketTextStream.flatMap(_.split(" ")).map((_, 1))

    // todo: 5. Every 4 seconds, compute statistics over the most recent 6 seconds of data
    /**
     * The method takes three parameters:
     * reduceFunc: (V, V) => V      ---> the aggregation function
     * windowDuration: Duration     ---> the window length (in time); the window covers N batches of data
     * slideDuration: Duration      ---> the slide interval, i.e. how often the window computation is triggered
     */
    val windowDStream: DStream[(String, Int)] = result.reduceByKeyAndWindow((x: Int, y: Int) => x + y, Seconds(6), Seconds(4))

    // todo: 6. Print the word counts within the current window
    windowDStream.print()

    // todo: 7. Start the streaming computation
    ssc.start()
    ssc.awaitTermination()
  }
}
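When the window is much longer than the slide interval, reduceByKeyAndWindow also offers an overload that takes an inverse reduce function: instead of re-aggregating the whole window every time, it adds the batches that slide into the window and subtracts the ones that slide out. This variant requires a checkpoint directory. A minimal sketch under the same node01/9999 setup (not part of the original example):
package com.kaikeba.streaming

import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}

/**
 * Hypothetical sketch: incremental window aggregation with an inverse reduce function.
 */
object IncrementalWindowWordCount {

  def main(args: Array[String]): Unit = {
    val sparkConf = new SparkConf().setAppName("IncrementalWindowWordCount").setMaster("local[2]")
    val ssc = new StreamingContext(sparkConf, Seconds(2))

    // The inverse-function overload needs a checkpoint directory
    ssc.checkpoint("hdfs://node01:8020/ck")

    val wordAndOne = ssc.socketTextStream("node01", 9999)
      .flatMap(_.split(" "))
      .map((_, 1))

    // Add the counts of batches entering the window, subtract the counts of batches leaving it
    val windowed = wordAndOne.reduceByKeyAndWindow(
      (x: Int, y: Int) => x + y,   // reduce: new batch slides in
      (x: Int, y: Int) => x - y,   // inverse reduce: old batch slides out
      Seconds(6),                  // window length
      Seconds(4)                   // slide interval
    )

    windowed.print()

    ssc.start()
    ssc.awaitTermination()
  }
}
A further overload accepts a filter function for dropping keys whose count has fallen to zero; the four-argument form above is the simplest incremental version.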