【Spark Streaming】

小雪x

已于 2023-04-25 19:47:47 修改

阅读量129

点赞数

文章标签： spark hadoop 大数据

于 2023-04-24 22:23:51 首次发布

本文链接：https://blog.csdn.net/m0_67239108/article/details/130353081

版权

一、spark-streaming概述

二、有状态计算：每一次计算都基于之前的计算结果进行累加计算

三、窗口计算

四、DStream与RDD相互转换

例题：卡口车辆

一、spark-streaming概述

Spark Streaming 是一个基于 Spark Core 之上的实时计算框架，可以从很多数据源消费数据并对数据进行实时的处理，具有高吞吐量和容错能力强等特点。实时计算是一种持续、低延时、事件触发的计算任务。

每隔五秒就产生一个RDD。
如果下一次执行之前，上一次的数据没有处理完毕，就会造成雪崩。
Spark Streaming 处理数据的方式是微批处理：每隔一段时间处理一次，每次处理的数据不一样（只处理该时间段内新接收到的数据）。

过程：

1、创建spark环境

// Spark configuration. "local[2]" asks for two local threads: receiving the
// data needs one resource and processing it needs another.
val conf = new SparkConf()
  .setMaster("local[2]")
  .setAppName("wc")

val sc = new SparkContext(conf)

2、创建spark streaming环境

// Batch interval: the streaming job runs once every five seconds.
val batchInterval = Durations.seconds(5)
val ssc = new StreamingContext(sc, batchInterval)

3、读取数据

// Read text lines from the TCP socket at host "master", port 8888.
val linesDS: ReceiverInputDStream[String] =
  ssc.socketTextStream(hostname = "master", port = 8888)

4、计算

5、启动spark streaming 任务

// Start receiving and processing, then block the calling thread until the
// streaming job is stopped or fails.
ssc.start()
ssc.awaitTermination()
// Release the streaming context once awaitTermination has returned.
ssc.stop()

代码：统计单词的数量

package com.shujia.spark.stream

import org.apache.spark.streaming.dstream.{DStream, ReceiverInputDStream}
import org.apache.spark.streaming.{Durations, StreamingContext}
import org.apache.spark.{SparkConf, SparkContext}

/**
 * Streaming word count: reads comma-separated words from a socket and prints,
 * for every batch, how many times each word occurred in that batch.
 *
 * To feed it data, run `nc -lk 8888` on the host named "master"
 * (install the tool with `yum install nc` if the command is not found).
 */
object Demo1WordCount {
  def main(args: Array[String]): Unit = {
    // Step 1: Spark environment. Two local threads are requested because
    // receiving data needs one resource and processing it needs another.
    val conf = new SparkConf()
      .setMaster("local[2]")
      .setAppName("wc")
    val sc = new SparkContext(conf)

    // Step 2: streaming environment with a five-second batch interval.
    val ssc = new StreamingContext(sc, Durations.seconds(5))

    // Step 3: socket source. A DStream is backed by RDDs: one RDD is
    // produced per batch interval.
    val linesDS: ReceiverInputDStream[String] = ssc.socketTextStream("master", 8888)

    // Step 4: split each line into words, pair every word with 1 and add
    // the ones up per word.
    val wordsDS: DStream[String] = linesDS.flatMap(_.split(","))
    val pairsDS: DStream[(String, Int)] = wordsDS.map(word => (word, 1))
    val countDS: DStream[(String, Int)] = pairsDS.reduceByKey(_ + _)

    // Print the result of each batch.
    countDS.print()

    // Step 5: start the job and block until it terminates.
    ssc.start()
    ssc.awaitTermination()
    ssc.stop()
  }
}

二、有状态计算：每一次计算都基于之前的计算结果进行累加计算

有状态算子 updateStateByKey：每一次计算基于前一次的状态进行累加计算。
Checkpoint：使用checkpoint保存之前的结果到HDFS中。
状态：之前的统计结果。

示意图：

代码：

package com.shujia.spark.stream

import org.apache.spark.streaming.dstream.{DStream, ReceiverInputDStream}
import org.apache.spark.streaming.{Durations, StreamingContext}
import org.apache.spark.{SparkConf, SparkContext}

/**
 * Stateful word count: every batch adds its word counts on top of the totals
 * accumulated by all previous batches, using `updateStateByKey`.
 */
object Demo2UpdateStateByKey {
  def main(args: Array[String]): Unit = {
    // Spark environment.
    val conf = new SparkConf()
      .setAppName("state")
      .setMaster("local[2]")
    val sc = new SparkContext(conf)

    val ssc = new StreamingContext(sc, Durations.seconds(5))

    // updateStateByKey needs a checkpoint directory in which the running
    // state is saved between batches.
    ssc.checkpoint("data/checkpoint")

    // Source: text lines from the socket.
    val linesDS: ReceiverInputDStream[String] = ssc.socketTextStream("master", 8888)

    // Turn every comma-separated word into a (word, 1) pair.
    val kvDS: DStream[(String, Int)] = linesDS
      .flatMap(_.split(","))
      .map((_, 1))

    /**
     * State update function, invoked once per key for every batch.
     *
     * @param seq   all values of one key in the current batch
     * @param state result of the previous batches (the state); empty for a
     *              key that has not been seen before
     * @return the new running total for the key
     */
    def updateFun(seq: Seq[Int], state: Option[Int]): Option[Int] = {
      println(s"当前批次的数据：$seq")
      println(s"之前批次的状态：$state")

      // Previous total (0 when there is no state yet) plus the number of
      // occurrences in the current batch.
      Some(state.getOrElse(0) + seq.sum)
    }

    // Stateful operator: each batch is computed on top of the previous state.
    val countDS: DStream[(String, Int)] = kvDS.updateStateByKey[Int](updateFun _)

    countDS.print()

    ssc.start()
    ssc.awaitTermination()
    ssc.stop()
  }
}

三、窗口计算

reduceByKeyAndWindow ：每隔一段时间计算最近一段时间的数据。
滑动窗口：每隔5秒计算最近15秒的数据，一般用于热门榜单。
优化：

示意图：

窗口优化：

代码：

package com.shujia.spark.stream

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.streaming.{Durations, StreamingContext}
import org.apache.spark.streaming.dstream.{DStream, ReceiverInputDStream}

/**
 * Sliding-window word count: every 5 seconds, prints the word counts of the
 * most recent 15 seconds of data.
 */
object Demo3Window {
  def main(args: Array[String]): Unit = {
    // Spark environment.
    val conf = new SparkConf()
    conf.setAppName("state")
    conf.setMaster("local[2]")
    val sc = new SparkContext(conf)

    val ssc = new StreamingContext(sc, Durations.seconds(5))

    // The incremental window computation below builds on the result of the
    // previous window, so a checkpoint directory is configured.
    ssc.checkpoint("data/checkpoint")

    // Source: text lines from the socket.
    val linesDS: ReceiverInputDStream[String] = ssc.socketTextStream("master", 8888)

    // Turn every comma-separated word into a (word, 1) pair.
    val kvDS: DStream[(String, Int)] = linesDS
      .flatMap(line => line.split(","))
      .map(word => (word, 1))

    // reduceByKeyAndWindow: at every slide interval, aggregate the data of
    // the most recent window.
    //
    // Plain variant, which re-aggregates the whole window on every slide:
    //
    //   kvDS.reduceByKeyAndWindow(
    //     reduceFunc = (x, y) => x + y,            // aggregation function
    //     windowDuration = Durations.seconds(15),  // window size
    //     slideDuration = Durations.seconds(5)     // slide interval
    //   )
    //
    // Optimised variant used here: the previous window's result is reused by
    // adding the batch that enters the window and subtracting the batch that
    // leaves it, which avoids recomputing the overlapping batches.
    //
    // With the inverse function alone, a word that has dropped out of the
    // window stays in the result with a count of 0 and would be printed (and
    // kept in the state) indefinitely. The filter removes those entries.
    val countDS: DStream[(String, Int)] = kvDS.reduceByKeyAndWindow(
      (x: Int, y: Int) => x + y, // add the counts entering the window
      (i: Int, j: Int) => i - j, // subtract the counts leaving the window
      windowDuration = Durations.seconds(15), // window size
      slideDuration = Durations.seconds(5), // slide interval
      filterFunc = (kv: (String, Int)) => kv._2 > 0 // drop expired words
    )

    countDS.print()

    ssc.start()
    ssc.awaitTermination()
    ssc.stop()

  }

}

四、DStream与RDD相互转换

1、DStreamtoRDD

foreachRDD：将DStream转换成RDD, 每隔一段时间一个RDD

package com.shujia.spark.stream

import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{DataFrame, SparkSession}
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.streaming.{Durations, StreamingContext}
import org.apache.spark.streaming.dstream.ReceiverInputDStream

/**
 * Demonstrates `foreachRDD`: the DStream hands over one RDD per batch
 * interval, which can then be processed with the RDD API or converted to a
 * DataFrame and processed with SQL/DSL.
 */
object Demo4DStreamToRDD {
  def main(args: Array[String]): Unit = {
    // Spark environment, created through a SparkSession so that the batch
    // RDDs can be converted to DataFrames.
    val spark: SparkSession = SparkSession
      .builder()
      .appName("rdd")
      .master("local[2]")
      .config("spark.sql.shuffle.partitions", 1)
      .getOrCreate()

    import org.apache.spark.sql.functions._
    import spark.implicits._

    val sc: SparkContext = spark.sparkContext
    val ssc = new StreamingContext(sc, Durations.seconds(5))
    ssc.checkpoint("data/checkpoint")

    // Source: text lines from the socket.
    val linesDS: ReceiverInputDStream[String] = ssc.socketTextStream("master", 8888)

    // The body below runs once per batch interval; each time `batchRDD`
    // holds the data of exactly one batch.
    linesDS.foreachRDD { batchRDD =>
      println("执行foreachRDD")

      // Variant 1: word count with the RDD API. The action that would print
      // it is left commented out, so this pipeline is only defined here.
      val countRDD: RDD[(String, Int)] = batchRDD
        .flatMap(_.split(","))
        .map((_, 1))
        .reduceByKey(_ + _)
      // countRDD.foreach(println)

      // Variant 2: convert the RDD to a DataFrame and count with the DSL.
      val linesDF: DataFrame = batchRDD.toDF()
      val wordColumn = explode(split(col("value"), ",")).as("word")

      val countDF: DataFrame = linesDF
        .select(wordColumn)
        .groupBy(col("word"))
        .agg(count(col("word")).as("num"))

      countDF.show()
    }

    ssc.start()
    ssc.awaitTermination()
    ssc.stop()
  }
}

2、RDDToDStream

transform: 将DS转换成RDD,再将RDD转换成DS

package com.shujia.spark.stream

import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.streaming.{Durations, StreamingContext}
import org.apache.spark.streaming.dstream.{DStream, ReceiverInputDStream}

/**
 * Demonstrates `transform`: each batch is exposed as an RDD, processed with
 * the RDD API, and the RDD that is returned becomes a batch of a new DStream.
 */
object Demo5RDDToDStream {
  def main(args: Array[String]): Unit = {
    // Spark environment.
    val conf = new SparkConf()
      .setAppName("state")
      .setMaster("local[2]")
    val sc = new SparkContext(conf)

    val ssc = new StreamingContext(sc, Durations.seconds(5))
    ssc.checkpoint("data/checkpoint")

    // Source: text lines from the socket.
    val linesDS: ReceiverInputDStream[String] = ssc.socketTextStream("master", 8888)

    // DStream -> RDD -> DStream: count the words of every batch with the
    // RDD API and hand the resulting RDD back to the stream.
    val countDS: DStream[(String, Int)] = linesDS.transform { batchRDD =>
      val countRDD: RDD[(String, Int)] = batchRDD
        .flatMap(line => line.split(","))
        .map(word => (word, 1))
        .reduceByKey((a, b) => a + b)

      // The returned RDD is the content of the new DStream for this batch.
      countRDD
    }

    countDS.print()

    ssc.start()
    ssc.awaitTermination()
    ssc.stop()
  }
}

例题：卡口车辆

对于JSON格式的数据

将json字符串转换成json对象 val jsonObj: JSONObject = JSON.parseObject(line)
通过key获取value val card: Long = jsonObj.getLong("card")

package com.shujia.spark.stream

import com.alibaba.fastjson.{JSON, JSONObject}
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.streaming.{Durations, StreamingContext}
import org.apache.spark.streaming.dstream.{DStream, ReceiverInputDStream}

import java.lang
import java.sql.{Connection, DriverManager, PreparedStatement}
import java.text.SimpleDateFormat
import java.util.Date

/**
 * Checkpoint (road gate) congestion statistics.
 *
 * Reads JSON records from a socket, and every 10 seconds computes, per gate,
 * the traffic flow and the average speed over the last 20 seconds. The result
 * rows are inserted into the MySQL table `card_avg_speed_flow`.
 */
object Demo7Car {
  def main(args: Array[String]): Unit = {
    // Spark environment.
    val conf = new SparkConf()
    conf.setAppName("state")
    conf.setMaster("local[2]")
    val sc = new SparkContext(conf)

    val ssc = new StreamingContext(sc, Durations.seconds(5))

    ssc.checkpoint("data/checkpoint")

    // Source: one JSON document per text line.
    val linesDS: ReceiverInputDStream[String] = ssc.socketTextStream("master", 8888)

    // Parse each record and keep the gate id and the speed. The trailing 1
    // is the record's contribution to the traffic flow.
    // NOTE(review): presumably getLong/getDouble return boxed values that are
    // null when the key is absent, which would throw on unboxing here —
    // confirm, and decide how malformed records should be handled.
    val cardAndSpeedDS: DStream[(Long, (Double, Int))] = linesDS.map(line => {
      // Turn the JSON string into a JSON object.
      val jsonObj: JSONObject = JSON.parseObject(line)
      // Look the values up by key.
      val card: Long = jsonObj.getLong("card")
      val speed: Double = jsonObj.getDouble("speed")
      (card, (speed, 1))
    })

    // Per gate: sum of speeds and number of vehicles over a 20-second window
    // that slides every 10 seconds.
    val sumSpeedAndFlow: DStream[(Long, (Double, Int))] = cardAndSpeedDS
      .reduceByKeyAndWindow(
        (x, y) => (x._1 + y._1, x._2 + y._2),
        windowDuration = Durations.seconds(20),
        slideDuration = Durations.seconds(10)
      )

    // Average speed per gate, stamped with the time of the computation.
    val resultDS: DStream[(Long, String, Int, Double)] = sumSpeedAndFlow
      .map {
        case (card: Long, (speed: Double, flow: Int)) =>
          val avgSpeed: Double = speed / flow

          // Current time, formatted as a string.
          val date = new Date()
          val format = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss")
          val comDate: String = format.format(date)

          (card, comDate, flow, avgSpeed)
      }

    // Save the result of every window to the database.
    resultDS.foreachRDD(rdd => {
      // foreachPartition hands over all rows of one partition at once, so a
      // single connection serves the whole partition instead of one
      // connection per row.
      rdd.foreachPartition(iter => {
        // Nothing to write: do not open a connection for an empty partition.
        if (iter.nonEmpty) {
          println("创建数据库连接")
          val start: Long = System.currentTimeMillis()
          // 1. Load the driver.
          Class.forName("com.mysql.jdbc.Driver")
          // 2. Open the connection.
          val con: Connection = DriverManager.getConnection("jdbc:mysql://master:3306/bigdata", "root", "123456")
          try {
            val end: Long = System.currentTimeMillis()
            println(end - start)

            // 3. Prepare the statement once and reuse it for every row of
            //    the partition.
            val stat: PreparedStatement = con.prepareStatement("insert into card_avg_speed_flow values(?,?,?,?)")
            try {
              // Iterator.foreach runs entirely inside this task.
              iter.foreach {
                case (card: Long, comDate: String, flow: Int, avgSpeed: Double) =>
                  // 4. Bind the values.
                  stat.setLong(1, card)
                  stat.setString(2, comDate)
                  stat.setInt(3, flow)
                  stat.setDouble(4, avgSpeed)
                  // 5. Insert the row.
                  stat.execute()
              }
            } finally {
              stat.close()
            }
          } finally {
            // 6. Close the connection even when an insert failed.
            con.close()
          }
        }
      })
    })
    ssc.start()
    ssc.awaitTermination()
    ssc.stop()

  }

}