Spark Streaming
1. Environment Setup
Import the dependencies (pom.xml):
```xml
<dependency>
    <groupId>org.apache.spark</groupId>
    <artifactId>spark-core_2.12</artifactId>
    <version>3.0.0</version>
</dependency>
<dependency>
    <groupId>org.apache.spark</groupId>
    <artifactId>spark-yarn_2.12</artifactId>
    <version>3.0.0</version>
</dependency>
<dependency>
    <groupId>org.apache.spark</groupId>
    <artifactId>spark-sql_2.12</artifactId>
    <version>3.0.0</version>
</dependency>
<dependency>
    <groupId>mysql</groupId>
    <artifactId>mysql-connector-java</artifactId>
    <version>5.1.27</version>
</dependency>
<dependency>
    <groupId>org.apache.spark</groupId>
    <artifactId>spark-hive_2.12</artifactId>
    <version>3.0.0</version>
</dependency>
<dependency>
    <groupId>org.apache.hive</groupId>
    <artifactId>hive-exec</artifactId>
    <version>1.2.1</version>
</dependency>
<dependency>
    <groupId>org.apache.spark</groupId>
    <artifactId>spark-streaming_2.12</artifactId>
    <version>3.0.0</version>
</dependency>
<dependency>
    <groupId>org.apache.spark</groupId>
    <artifactId>spark-streaming-kafka-0-10_2.12</artifactId>
    <version>3.0.0</version>
</dependency>
<dependency>
    <groupId>com.fasterxml.jackson.core</groupId>
    <artifactId>jackson-core</artifactId>
    <version>2.10.1</version>
</dependency>
```
2. Creating a DStream
Listening on a Port
- `ssc.socketTextStream(hostname: String, port: Int)`
  - hostname: host name to connect to
  - port: port number to connect to
```scala
import org.apache.spark.SparkConf
import org.apache.spark.streaming.dstream.{DStream, ReceiverInputDStream}
import org.apache.spark.streaming.{Duration, StreamingContext}

object Spark01_Streaming_WordCount {
  def main(args: Array[String]): Unit = {
    //TODO Create the StreamingContext
    //Configuration
    val sparkConf: SparkConf = new SparkConf().setMaster("local[*]").setAppName("SparkStreaming")
    //Create the StreamingContext: first argument is the configuration, second is the batch interval (3s)
    val ssc = new StreamingContext(sparkConf, Duration(3 * 1000))

    //TODO Business logic
    //Create a DStream by monitoring a port
    val wordLine: ReceiverInputDStream[String] = ssc.socketTextStream("hadoop151", 9999)
    //WordCount logic
    val words: DStream[String] = wordLine.flatMap(_.split(" "))
    val wordToMap: DStream[(String, Int)] = words.map((_, 1))
    val wordToCount: DStream[(String, Int)] = wordToMap.reduceByKey(_ + _)
    //Print the result
    wordToCount.print()

    //TODO Start the receiver
    ssc.start()
    //TODO Wait for termination
    ssc.awaitTermination()
  }
}
```
RDD Queue
- `ssc.queueStream(queue: Queue[RDD[T]], oneAtATime: Boolean)`
  - queue: the queue of RDDs. Modifications to this data structure must be synchronized.
  - oneAtATime: whether only one RDD should be consumed from the queue per interval
```scala
import org.apache.spark.SparkConf
import org.apache.spark.rdd.RDD
import org.apache.spark.streaming.dstream.{DStream, InputDStream}
import org.apache.spark.streaming.{Duration, StreamingContext}

import scala.collection.mutable

object Spark02_Streaming_Queue {
  def main(args: Array[String]): Unit = {
    //TODO Create the StreamingContext
    val sparkConf: SparkConf = new SparkConf().setMaster("local[*]").setAppName("SparkStreaming")
    //First argument: configuration; second argument: batch interval (3s)
    val ssc = new StreamingContext(sparkConf, Duration(3 * 1000))

    //TODO Business logic
    val queue: mutable.Queue[RDD[Int]] = new mutable.Queue[RDD[Int]]()
    //Create a DStream from the queue
    val words: InputDStream[Int] = ssc.queueStream(queue, oneAtATime = false)
    val wordToMap: DStream[(Int, Int)] = words.map((_, 1))
    val wordToCount: DStream[(Int, Int)] = wordToMap.reduceByKey(_ + _)
    //Print the result
    wordToCount.print()

    //TODO Start the receiver
    ssc.start()

    //Periodically push data into the queue
    for (i <- 1 to 5) {
      queue += ssc.sparkContext.makeRDD(1 to 100, 10)
      Thread.sleep(2000)
    }

    //TODO Wait for termination
    ssc.awaitTermination()
  }
}
```
Kafka
```scala
KafkaUtils.createDirectStream(
  ssc: StreamingContext,                    //the StreamingContext
  locationStrategy: LocationStrategy,       //location strategy (let the system decide)
  consumerStrategy: ConsumerStrategy[K, V]  //consumer strategy: Set("fzk") is the topic, kafkaPara the Kafka parameters
)
```
- Program:
```scala
import org.apache.kafka.clients.consumer.{ConsumerConfig, ConsumerRecord}
import org.apache.spark.SparkConf
import org.apache.spark.streaming.dstream.{DStream, InputDStream}
import org.apache.spark.streaming.kafka010.{ConsumerStrategies, KafkaUtils, LocationStrategies}
import org.apache.spark.streaming.{Duration, StreamingContext}

object Spark04_Streaming_Kafka {
  def main(args: Array[String]): Unit = {
    //TODO Create the StreamingContext
    val sparkConf: SparkConf = new SparkConf().setMaster("local[*]").setAppName("SparkStreaming")
    val ssc = new StreamingContext(sparkConf, Duration(3 * 1000))

    //TODO Business logic
    //Define the Kafka parameters
    val kafkaPara: Map[String, Object] = Map[String, Object](
      //broker addresses (host:port)
      ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG -> "hadoop151:9092,hadoop152:9092,hadoop153:9092",
      //Kafka consumer group
      ConsumerConfig.GROUP_ID_CONFIG -> "fzk",
      //key deserializer
      "key.deserializer" -> "org.apache.kafka.common.serialization.StringDeserializer",
      //value deserializer
      "value.deserializer" -> "org.apache.kafka.common.serialization.StringDeserializer"
    )
    //Create a DStream by reading from Kafka
    val kafkaDStream: InputDStream[ConsumerRecord[String, String]] = KafkaUtils.createDirectStream(
      ssc,
      //location strategy (let the system decide)
      LocationStrategies.PreferConsistent,
      //consumer strategy: Set("fzk") is the topic, kafkaPara the Kafka parameters
      ConsumerStrategies.Subscribe[String, String](Set("fzk"), kafkaPara)
    )
    val valueDStream: DStream[String] = kafkaDStream.map(_.value())
    valueDStream.print()

    //TODO Start the receiver
    ssc.start()
    //TODO Wait for termination
    ssc.awaitTermination()
  }
}
```
Custom Data Source
- `ssc.receiverStream(receiver: Receiver[T])`
  - receiver: the custom data source
- Step 1: define the custom data source (extend Receiver and implement the onStart and onStop methods)
```scala
class MyReceiver extends Receiver[String](StorageLevel.MEMORY_ONLY) {
  private var flag: Boolean = true

  override def onStart(): Unit = {
    //Collection logic; as an illustrative completion, generate random strings
    //on a separate thread and hand them to Spark via store()
    new Thread(new Runnable {
      override def run(): Unit = {
        while (flag) {
          store(new Random().nextInt(10).toString)
          Thread.sleep(500)
        }
      }
    }).start()
  }

  override def onStop(): Unit = {
    //Stop the collection loop
    flag = false
  }
}
```
- Step 2: create a DStream from the custom receiver
```scala
import org.apache.spark.SparkConf
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.dstream.ReceiverInputDStream
import org.apache.spark.streaming.receiver.Receiver
import org.apache.spark.streaming.{Duration, StreamingContext}

import scala.util.Random

object Spark03_Streaming_Receiver {
  def main(args: Array[String]): Unit = {
    //TODO Create the StreamingContext
    val sparkConf: SparkConf = new SparkConf().setMaster("local[*]").setAppName("SparkStreaming")
    val ssc = new StreamingContext(sparkConf, Duration(3 * 1000))

    //TODO Business logic
    //Create a DStream from the custom receiver
    val data: ReceiverInputDStream[String] = ssc.receiverStream(new MyReceiver)
    data.print()

    //TODO Start the receiver
    ssc.start()
    //TODO Wait for termination
    ssc.awaitTermination()
  }
}
```
3. DStream Transformations
- Operations on a DStream are similar to those on an RDD and fall into Transformations and Output Operations. The transformations also include some special primitives, such as updateStateByKey(), transform(), and the various window-related primitives.
Stateless Transformations
- Stateless transformations apply simple RDD transformations to every batch, i.e., they transform each RDD inside the DStream. Note that the key-value DStream transformations (such as reduceByKey()) require `import StreamingContext._` to be usable from Scala.
Transform
- transform allows an arbitrary RDD-to-RDD function to run on a DStream, even functions that are not exposed in the DStream API, which makes it a convenient way to extend the Spark API. The function is invoked once per batch; in effect it applies a transformation to each RDD inside the DStream.
- `def transform[U: ClassTag](transformFunc: RDD[T] => RDD[U]): DStream[U]`

```scala
//Convert the DStream to its underlying RDDs and operate on them as RDDs
dstream.transform(rdd => rdd)
```
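For example, a minimal sketch (assuming the `ssc` from the examples above; host and port are placeholders): transform lets each batch use an RDD operation such as sortBy, which the DStream API does not expose directly.

```scala
//Assumption: ssc is the StreamingContext created earlier
val lines: DStream[String] = ssc.socketTextStream("hadoop151", 9999)
val sortedCounts: DStream[(String, Int)] = lines.transform { rdd =>
  rdd.flatMap(_.split(" "))
    .map((_, 1))
    .reduceByKey(_ + _)
    .sortBy(_._2, ascending = false) //sortBy exists on RDDs, not on DStreams
}
sortedCounts.print()
```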
join
- Joining two streams requires that both streams use the same batch interval, so that their computations are triggered at the same time. Each batch joins the current RDD of one stream with the current RDD of the other, with the same effect as joining two RDDs.
- `def join[W: ClassTag](other: DStream[(K, W)]): DStream[(K, (V, W))]`

```scala
dstream1.join(dstream2)
```
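A minimal sketch (hosts and ports are placeholders; both streams share the 3s batch interval): key both streams, then join them batch by batch.

```scala
//Assumption: ssc is the StreamingContext created earlier
val stream1: DStream[(String, Int)] = ssc.socketTextStream("hadoop151", 9999).map((_, 1))
val stream2: DStream[(String, Int)] = ssc.socketTextStream("hadoop151", 8888).map((_, 2))
//Each batch joins the current RDD of stream1 with the current RDD of stream2
val joined: DStream[(String, (Int, Int))] = stream1.join(stream2)
joined.print()
```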
Stateful Transformations
UpdateStateByKey
- The updateStateByKey primitive is used to keep history across batches. Sometimes we need to maintain state across the batches of a DStream (for example, a running word count in a streaming job). updateStateByKey() provides access to a state variable for key-value DStreams: given a DStream of (key, event) pairs and a function that specifies how to update each key's state from new events, it builds a new DStream whose internal data are (key, state) pairs.
- The result of updateStateByKey() is a new DStream whose internal sequence of RDDs consists of the (key, state) pairs of each time interval.
- Usage (updateStateByKey)
  - Step 1: set the checkpoint directory
    `ssc.checkpoint("ps")`
  - Step 2: call updateStateByKey
    `def updateStateByKey[S: ClassTag](updateFunc: (Seq[V], Option[S]) => Option[S]): DStream[(K, S)]`
```scala
import org.apache.spark.SparkConf
import org.apache.spark.streaming.dstream.{DStream, ReceiverInputDStream}
import org.apache.spark.streaming.{Duration, StreamingContext}

object Spark05_Streaming_State {
  def main(args: Array[String]): Unit = {
    //TODO Create the StreamingContext
    val sparkConf: SparkConf = new SparkConf().setMaster("local[*]").setAppName("SparkStreaming")
    val ssc = new StreamingContext(sparkConf, Duration(3 * 1000))
    //Checkpoint directory
    ssc.checkpoint("ps")

    //TODO Business logic
    val datas: ReceiverInputDStream[String] = ssc.socketTextStream("hadoop151", 9999)
    val map: DStream[(String, Int)] = datas.map((_, 1))
    //Maintains history across batches; a checkpoint directory is required
    val value: DStream[(String, Int)] = map.updateStateByKey[Int](
      (values: Seq[Int], buff: Option[Int]) => {
        Some(buff.getOrElse(0) + values.sum)
      })
    value.print()

    //TODO Start the receiver
    ssc.start()
    //TODO Wait for termination
    ssc.awaitTermination()
  }
}
```
Window Operations
- Window operations let you set a window length and a sliding interval to dynamically obtain the current state of the stream. All window-based operations take two parameters: the window duration and the slide duration.
- Window duration (windowDuration): the time range over which results are computed (must be an integer multiple of the batch interval)
- Slide duration (slideDuration): how often the computation is triggered (must be an integer multiple of the batch interval)
- Methods
  - window
    - Computes over windowed batches of the source DStream and returns a new DStream
    `def window(windowDuration: Duration, slideDuration: Duration): DStream[T]`
  - countByWindow
    - Returns the count of elements in a sliding window over the stream
    `def countByWindow(windowDuration: Duration, slideDuration: Duration): DStream[Long]`
  - reduceByWindow
    - Creates a new single-element stream by aggregating the elements of the sliding window with a custom function
    `def reduceByWindow(reduceFunc: (T, T) => T, windowDuration: Duration, slideDuration: Duration): DStream[T]`
- Usage (window):
```scala
import org.apache.spark.SparkConf
import org.apache.spark.streaming.dstream.{DStream, ReceiverInputDStream}
import org.apache.spark.streaming.{Duration, StreamingContext}

object Spark06_Streaming_Window {
  def main(args: Array[String]): Unit = {
    //TODO Create the StreamingContext
    val sparkConf: SparkConf = new SparkConf().setMaster("local[*]").setAppName("SparkStreaming")
    val ssc = new StreamingContext(sparkConf, Duration(3 * 1000))

    //TODO Business logic
    val datas: ReceiverInputDStream[String] = ssc.socketTextStream("hadoop151", 9999)
    //Sliding window: first argument is the window duration, second is the slide duration;
    //both must be integer multiples of the batch interval
    val windowDStream: DStream[String] = datas.window(Duration(6 * 1000), Duration(3 * 1000))
    windowDStream.print()

    //TODO Start the receiver
    ssc.start()
    //TODO Wait for termination
    ssc.awaitTermination()
  }
}
```
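The other window methods follow the same pattern. A minimal sketch on the same socket stream (assuming the `ssc` from the example above; note that countByWindow maintains a running count via an inverse reduce internally, so it needs a checkpoint directory):

```scala
//Assumption: ssc is the StreamingContext created earlier
ssc.checkpoint("ps") //required by countByWindow
val datas: ReceiverInputDStream[String] = ssc.socketTextStream("hadoop151", 9999)

//Number of elements in each 6s window, recomputed every 3s
val counts: DStream[Long] = datas.countByWindow(Duration(6 * 1000), Duration(3 * 1000))

//Concatenate the lines of each 6s window into a single element
val reduced: DStream[String] = datas.reduceByWindow(_ + " " + _, Duration(6 * 1000), Duration(3 * 1000))

counts.print()
reduced.print()
```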
4. DStream Output
- print()
  - Prints the first 10 elements of every batch of the DStream on the driver node running the streaming program
    `DStream.print()`
- saveAsTextFiles(prefix: String, suffix: String = "")
  - Saves the contents of the DStream as text files
    `DStream.saveAsTextFiles()`
- saveAsObjectFiles(prefix: String, suffix: String = "")
  - Saves the data in the stream as SequenceFiles of serialized Java objects
    `DStream.saveAsObjectFiles()`
- saveAsHadoopFiles[F <: OutputFormat[K, V]](prefix: String, suffix: String)
  - Saves the data in the stream as Hadoop files
    `DStream.saveAsHadoopFiles()`
- foreachRDD(foreachFunc: RDD[T] => Unit)
  - The most generic output operation: applies the function func to every RDD produced from the stream (see the sketch after this list)
    `DStream.foreachRDD()`
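A minimal sketch of foreachRDD (assuming wordToCount is a DStream[(String, Int)] as in the WordCount example): any connection to an external system should be opened inside foreachPartition on the executors rather than on the driver, so it is not serialized per record.

```scala
wordToCount.foreachRDD { rdd =>
  rdd.foreachPartition { iter =>
    //e.g., open one database connection per partition here (println as a placeholder sink)
    iter.foreach { case (word, count) =>
      println(s"$word -> $count")
    }
    //close the connection here
  }
}
```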
5. Graceful Shutdown
- Direct stop
  `ssc.stop()`
- Graceful stop
  - Step 1: enable graceful shutdown
    `sparkConf.set("spark.streaming.stopGracefullyOnShutdown", "true")`
  - Step 2: stop gracefully
    `ssc.stop(stopSparkContext = true, stopGracefully = true)`
```scala
import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Duration, StreamingContext, StreamingContextState}

object Spark07_Streaming_Close {
  def main(args: Array[String]): Unit = {
    //TODO Create the StreamingContext
    val sparkConf: SparkConf = new SparkConf().setMaster("local[*]").setAppName("SparkStreaming")
    //Enable graceful shutdown
    sparkConf.set("spark.streaming.stopGracefullyOnShutdown", "true")
    val ssc = new StreamingContext(sparkConf, Duration(3 * 1000))

    //TODO Business logic

    //TODO Start the receiver
    ssc.start()

    //Stop gracefully from a separate thread
    new Thread(new Runnable {
      override def run(): Unit = {
        while (true) {
          if (true) { //replace with a third-party check that decides whether to stop
            //Get the current state of the StreamingContext
            val state: StreamingContextState = ssc.getState
            //Stop only if it is active
            if (state == StreamingContextState.ACTIVE) {
              //Graceful stop
              ssc.stop(stopSparkContext = true, stopGracefully = true)
              //Exit the process
              System.exit(0)
            }
          }
        }
      }
    }).start()

    //TODO Wait for termination
    ssc.awaitTermination()
  }
}
```