Spark Streaming
http://spark.apache.org/docs/2.2.1/streaming-programming-guide.html
Spark Streaming checkpointing:
Provides fault tolerance.
To summarize, metadata checkpointing is primarily needed for recovery from driver failures, whereas data or RDD checkpointing is necessary even for basic functioning if stateful transformations are used.
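When stateful operators are used, the checkpoint directory also lets a restarted driver rebuild its StreamingContext. A minimal sketch of that recovery pattern, assuming an illustrative checkpoint path (the host mini2 and port 9999 come from the examples below; the object name and path are hypothetical):

import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Durations, StreamingContext}

object CheckpointRecoveryDemo {
  // Build a fresh context and register the checkpoint directory;
  // only invoked when no checkpoint data exists yet
  def createContext(): StreamingContext = {
    val conf = new SparkConf().setAppName("CheckpointRecoveryDemo").setMaster("local[2]")
    val ssc = new StreamingContext(conf, Durations.seconds(2))
    ssc.checkpoint("d://zyg-checkpoint-recovery")  // illustrative path
    ssc.socketTextStream("mini2", 9999)
      .flatMap(_.split(" "))
      .map((_, 1))
      .reduceByKey(_ + _)
      .print()
    ssc
  }

  def main(args: Array[String]): Unit = {
    // Recreate the context from checkpoint data if it exists, otherwise build a new one
    val ssc = StreamingContext.getOrCreate("d://zyg-checkpoint-recovery", createContext _)
    ssc.start()
    ssc.awaitTermination()
  }
}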
Understanding DStreams: a DStream is a continuous sequence of RDDs, one per batch interval.
Three important primitives for operating on DStreams (covered below): window operations, updateStateByKey, and transform.
Preparation before coding: cluster setup.
Install netcat:
yum -y install nc
window operations
Specify the window length and the slide interval; for example, display the last 10 seconds of data each time the window slides.
package zygDemo1
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.streaming.{Durations, StreamingContext}
import org.apache.spark.streaming.dstream.{DStream, ReceiverInputDStream}
/**
* Created by HP on 2019/7/16 15:51
*/
object WindowOperationsDemo {
  def main(args: Array[String]): Unit = {
    // Initialize the environment
    val conf = new SparkConf().setAppName("WindowOperationsDemo").setMaster("local[*]")
    val sc = new SparkContext(conf)
    // Instantiate the StreamingContext with a 1-second batch interval
    val strCon: StreamingContext = new StreamingContext(sc, Durations.milliseconds(1000))
    // Receive the data that netcat sends over the socket
    val msg: ReceiverInputDStream[String] = strCon.socketTextStream("mini2", 9999)
    // Word-count preprocessing: "word word hello" => (word,1)(word,1)(hello,1)
    val tup: DStream[(String, Int)] = msg.flatMap(_.split(" ")).map((_, 1))
    // Windowed aggregation: (reduce function, window length, slide interval)
    val sumd: DStream[(String, Int)] = tup.reduceByKeyAndWindow((x: Int, y: Int) => x + y, Durations.seconds(10), Durations.seconds(10))
    sumd.print()
    // Start the streaming computation
    strCon.start()
    // Block the main thread and wait for the job to finish
    strCon.awaitTermination()
  }
}
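When the window is much longer than the slide interval, reduceByKeyAndWindow also has an overload that takes an inverse-reduce function, so each slide only adds the batches entering the window and subtracts those leaving it instead of recomputing the whole window. A minimal sketch, assuming illustrative window/slide values and checkpoint path (this variant requires checkpointing):

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.streaming.{Durations, StreamingContext}

object IncrementalWindowDemo {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("IncrementalWindowDemo").setMaster("local[*]")
    val sc = new SparkContext(conf)
    val ssc = new StreamingContext(sc, Durations.seconds(1))
    ssc.checkpoint("d://zyg-window-checkpoint")  // required by the inverse-function variant
    val counts = ssc.socketTextStream("mini2", 9999)
      .flatMap(_.split(" "))
      .map((_, 1))
      // (reduce function, inverse reduce function, window length, slide interval)
      .reduceByKeyAndWindow(_ + _, _ - _, Durations.seconds(30), Durations.seconds(10))
    counts.print()
    ssc.start()
    ssc.awaitTermination()
  }
}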
updateStateByKey
Accumulates counts across batches by keeping running state per key.
package zygDemo1
import org.apache.spark.streaming.dstream.DStream
import org.apache.spark.streaming.{Durations, StreamingContext}
import org.apache.spark.{HashPartitioner, SparkConf}
/**
* Created by HP on 2019/7/16 11:24
*/
object UpdateStateByKeyDemo {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("UpdateStateByKeyDemo").setMaster("local[2]")
    // StreamingContext takes the conf and the batch interval
    val strCon: StreamingContext = new StreamingContext(conf, Durations.milliseconds(2000))
    // Set the checkpoint directory (required for stateful transformations)
    strCon.checkpoint("d://zyg-20190716")
    // Input stream
    val msg = strCon.socketTextStream("mini2", 9999)
    // Processing logic
    val tup = msg.flatMap(_.split(" ")).map((_, 1))
    // Merge history with the current batch, specifying the partitioner and whether to remember the partitioning
    val sumd: DStream[(String, Int)] = tup.updateStateByKey(func, new HashPartitioner(strCon.sparkContext.defaultParallelism), true)
    sumd.print()
    strCon.start()
    strCon.awaitTermination()
  }

  // Each element looks like (word,(1,1,1)) or (hello,(1,1))
  // Update logic merging history with the current batch: iterator of (key, counts in this batch, previous state)
  val func = (it: Iterator[(String, Seq[Int], Option[Int])]) => {
    it.map(x => {
      (x._1, x._2.sum + x._3.getOrElse(0))
    })
  }
}
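For a plain running count, the simpler per-key overload of updateStateByKey, which takes a (Seq[V], Option[S]) => Option[S] function, is often enough. A minimal sketch mirroring the host, port, and checkpoint path above (the object name is illustrative):

import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Durations, StreamingContext}

object UpdateStateSimpleDemo {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("UpdateStateSimpleDemo").setMaster("local[2]")
    val ssc = new StreamingContext(conf, Durations.seconds(2))
    ssc.checkpoint("d://zyg-20190716")  // still required for stateful transformations
    val counts = ssc.socketTextStream("mini2", 9999)
      .flatMap(_.split(" "))
      .map((_, 1))
      // newValues: this key's counts in the current batch; state: the previous total
      .updateStateByKey((newValues: Seq[Int], state: Option[Int]) => Some(newValues.sum + state.getOrElse(0)))
    counts.print()
    ssc.start()
    ssc.awaitTermination()
  }
}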
transform
Allows arbitrary RDD-to-RDD functions to be applied to a DStream.
package zygDemo1
import org.apache.spark.streaming.dstream.{DStream, ReceiverInputDStream}
import org.apache.spark.streaming.{Durations, StreamingContext}
import org.apache.spark.{SparkConf, SparkContext}
/**
* Created by HP on 2019/7/16 10:26
* nc -lk 9999 starts netcat (the network data source)
*/
object StreamingWC {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("StreamingWC").setMaster("local[*]")
    val sc = new SparkContext(conf)
    val strCon: StreamingContext = new StreamingContext(sc, Durations.milliseconds(1000))
    // Receive the data from netcat
    val msg: ReceiverInputDStream[String] = strCon.socketTextStream("mini2", 9999)
    // Processing logic: apply ordinary RDD operations to each batch via transform
    val sumd: DStream[(String, Int)] = msg.transform(rdd => {
      rdd.flatMap(_.split(" ")).map((_, 1)).reduceByKey(_ + _)
    })
    sumd.print()
    // Start the streaming computation
    strCon.start()
    // Block the main thread and wait for the job to finish
    strCon.awaitTermination()
  }
}
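transform is most useful when a batch needs RDD operations that DStream does not expose directly, such as joining against a static RDD. A minimal sketch, assuming a hypothetical in-memory blacklist of words to drop (the sample data and object name are illustrative):

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.streaming.{Durations, StreamingContext}

object TransformJoinDemo {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("TransformJoinDemo").setMaster("local[*]")
    val sc = new SparkContext(conf)
    val ssc = new StreamingContext(sc, Durations.seconds(1))
    // Static RDD of blacklisted words (illustrative data)
    val blacklist = sc.parallelize(Seq(("spam", true)))
    val words = ssc.socketTextStream("mini2", 9999)
      .flatMap(_.split(" "))
      .map((_, 1))
    // Join each batch's RDD against the static RDD and drop blacklisted words
    val filtered = words.transform { rdd =>
      rdd.leftOuterJoin(blacklist)
        .filter { case (_, (_, flag)) => flag.isEmpty }
        .map { case (word, (count, _)) => (word, count) }
    }
    filtered.print()
    ssc.start()
    ssc.awaitTermination()
  }
}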