Structured Streaming下Multiple Streaming Aggregations的支持

最新推荐文章于 2021-11-18 22:53:47 发布

隔壁寝室老吴

最新推荐文章于 2021-11-18 22:53:47 发布

阅读量752

点赞数 1

本文链接：https://blog.csdn.net/qq_32323239/article/details/114988109

版权

在 Structured Streaming 中，如果对同一个 Dataset 连续执行多次聚合操作，可能会遇到以下错误之一：

Multiple streaming aggregations are not supported with streaming DataFrames/Datasets

Multiple mapGroupsWithStates are not supported on a streaming DataFrames/Datasets

Mixing mapGroupsWithStates and flatMapGroupsWithStates are not supported on a streaming DataFrames/Datasets

当出现以上错误的时候，在很长一段时间内，我都认为spark压根就不支持多个聚合链，直到昨天我看源码终于发现了这么一句话：

Multiple flatMapGroupsWithStates are not supported when they are not all in append mode or the output mode is not append on a streaming DataFrames/Datasets

看到这里我才发现，多个 flatMapGroupsWithState 连续聚合只有在 append 模式下才被允许。也就是说，多个连续的聚合操作是可以实现的，前提是：这些聚合全部使用 flatMapGroupsWithState，它们各自的输出模式都是 append，并且整个查询的输出模式也是 append。

直接看代码吧：

/**
 * Demo job showing that two stateful aggregations can be chained in one Structured Streaming
 * query, provided both are `flatMapGroupsWithState` in append mode and the query output mode
 * is append as well.
 *
 * Stage 1 groups the input by `(app_id, aid)`; stage 2 regroups stage 1's output by `app_id`.
 */
object PaasPvUv {

  val dateFormat: FastDateFormat = FastDateFormat.getInstance("yyyy-MM-dd HH:mm:ss")

  // Defaults used when no command-line overrides are given (the values the job was written with).
  private val DefaultHost = "192.168.1.171"
  private val DefaultPort = "9999"

  /**
   * Starts the streaming query and blocks until it terminates.
   *
   * @param args optional overrides: `args(0)` is the socket host, `args(1)` the socket port
   */
  def main(args: Array[String]): Unit = {
    val host = args.headOption.getOrElse(DefaultHost)
    val port = args.drop(1).headOption.getOrElse(DefaultPort)

    val conf = new SparkConf().set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
      .registerKryoClasses(Array(classOf[AidUvState], classOf[Roaring64Bitmap]))

    val spark = SparkSession.builder
      .appName("StructuredStreamingTest")
      .master("local[*]")
      .config(conf)
      .config("spark.sql.shuffle.partitions", "1")
      .getOrCreate()

    spark.sparkContext.setLogLevel("WARN")

    // Create a DataFrame representing the stream of input lines read from the socket at host:port.
    val lines = spark.readStream
      .format("socket")
      .option("host", host)
      .option("port", port)
      .load()

    import spark.implicits._
//    implicit val stateEncoder = org.apache.spark.sql.Encoders.bean(classOf[AidUvState])
//    implicit val resultEncoder = org.apache.spark.sql.Encoders.bean(classOf[AidUvResult])

    val query = lines.as[String]
      .flatMap { line =>
        println(s"line===> $line")
        // Malformed lines yield no record instead of failing the whole streaming query.
        parseLine(line).toList
      }
      // A watermark is mandatory before an event-time-timeout stateful operator.
      .withWatermark("inputtime", "1 minutes")
      .groupByKey(line => (line.app_id, line.aid))
      .flatMapGroupsWithState(OutputMode.Append(), GroupStateTimeout.EventTimeTimeout())(updateAcrossEvents2)
      // The watermark has to be declared again on the output of the first aggregation.
      .withWatermark("inputtime", "1 minutes")
      .groupByKey(_.app_id)
      .flatMapGroupsWithState(OutputMode.Append(), GroupStateTimeout.EventTimeTimeout())(updateAppIdAcrossEvent2)
      .writeStream
      .outputMode(OutputMode.Append())
      .queryName("Aid_Uv_Count")
      .format("console")
      .start()

    query.awaitTermination()

  }

  /**
   * Parses one `;`-separated input line into a [[PaasInput]].
   *
   * The first five fields are used (any further fields are ignored); the fifth must be a
   * timestamp in `yyyy-MM-dd HH:mm:ss` format.
   *
   * @param line raw line received from the socket
   * @return the parsed record, or `None` when the line has fewer than five fields or an
   *         unparsable timestamp
   */
  private def parseLine(line: String): Option[PaasInput] = {
    val parsed = line.split(";") match {
      case Array(f0, f1, f2, f3, time, _*) =>
        scala.util.Try(formatTime2TimeStamp(time)).toOption.map(ts => PaasInput(f0, f1, f2, f3, ts))
      case _ => None
    }
    if (parsed.isEmpty) System.err.println(s"Skipping malformed input line: $line")
    parsed
  }

  /**
   * Second-stage state function: folds the per-`(app_id, aid)` results into a per-`app_id` total.
   *
   * On timeout the state is removed and its last totals are emitted with the final flag set to
   * `true`. Otherwise every incoming event's `uv`/`pv` is added to the running totals, the state
   * is stored, the event-time timeout is set to 5 seconds after the latest event time seen, and
   * the running totals are emitted with the flag set to `false`.
   *
   * NOTE(review): `updateAcrossEvents2` emits cumulative sizes on every invocation and once more
   * on timeout, while this function adds each incoming value to its totals, so the same
   * contribution appears to be counted more than once. Confirm whether that is intended.
   *
   * @param appId    grouping key (the application id)
   * @param events   first-stage results for this key in the current trigger
   * @param oldState state handle for this key
   * @return a single-element iterator holding the result for this key
   */
  def updateAppIdAcrossEvent2(appId: String, events: Iterator[AidUvResult], oldState: GroupState[AppIdState]): Iterator[AppIdResult] = {
    if (oldState.hasTimedOut) {
      val expired = oldState.get
      oldState.remove()
      Iterator(AppIdResult(appId, expired.uv, expired.pv, expired.inputtime, true))
    } else {
      val initial = oldState.getOption.getOrElse(AppIdState(appId, new Timestamp(0L), 0L, 0L))
      val updated = events.foldLeft(initial) { (acc, event) =>
        val latest = math.max(acc.inputtime.getTime, event.inputtime.getTime)
        acc.copy(app_id = event.app_id, inputtime = new Timestamp(latest), uv = acc.uv + event.uv, pv = acc.pv + event.pv)
      }
      val timemax = updated.inputtime.getTime
      oldState.update(updated)
      oldState.setTimeoutTimestamp(timemax, "5 seconds")
      val ret = AppIdResult(app_id = appId, updated.uv, updated.pv, new Timestamp(timemax), false)
      println(s"ret: $ret")
      Iterator(ret)
    }
  }

  /**
   * First-stage state function: tracks the distinct uids and sessions seen per `(app_id, aid)`.
   *
   * On timeout the state is removed and the final sizes are emitted with the final flag set to
   * `true`. Otherwise each event's `uid` and `s` are appended to the state if not already present,
   * the state is stored, the event-time timeout is set to 5 seconds after the latest event time
   * seen, and the current sizes are emitted with the flag set to `false`.
   *
   * @param groupKey grouping key `(app_id, aid)`
   * @param events   input records for this key in the current trigger
   * @param oldState state handle for this key
   * @return a single-element iterator holding the result for this key
   */
  def updateAcrossEvents2(groupKey: (String, String), events: Iterator[PaasInput], oldState: GroupState[AidUvState]): Iterator[AidUvResult] = {
    val (appId, aid) = groupKey

    if (oldState.hasTimedOut) {
      val expired = oldState.get
      oldState.remove()
      Iterator(AidUvResult(appId, aid, expired.uids.size, expired.sessions.size, expired.inputtime, true))
    } else {
      val initial = oldState.getOption.getOrElse(
        AidUvState(appId, aid, new Timestamp(0L), new mutable.MutableList[String], new mutable.MutableList[String]))
      // One copy per event: the latest timestamp is always refreshed, the lists only grow on new values.
      val updated = events.foldLeft(initial) { (acc, event) =>
        val latest = math.max(acc.inputtime.getTime, event.inputtime.getTime)
        val uids = if (acc.uids.contains(event.uid)) acc.uids else acc.uids :+ event.uid
        val sessions = if (acc.sessions.contains(event.s)) acc.sessions else acc.sessions :+ event.s
        acc.copy(app_id = event.app_id, aid = event.aid, inputtime = new Timestamp(latest), uids = uids, sessions = sessions)
      }
      val timemax = updated.inputtime.getTime
      oldState.update(updated)
      oldState.setTimeoutTimestamp(timemax, "5 seconds")
      val ret = AidUvResult(app_id = appId, aid = aid, updated.uids.size, updated.sessions.size, new Timestamp(timemax), false)
      println(s"ret: $ret")
      Iterator(ret)
    }
  }

  /**
   * Converts a `yyyy-MM-dd HH:mm:ss` string into a [[java.sql.Timestamp]].
   *
   * @param timeStr timestamp text in the format of [[dateFormat]]
   * @return the corresponding timestamp
   * @throws java.text.ParseException if `timeStr` does not match the format
   */
  def formatTime2TimeStamp(timeStr: String): Timestamp =
    new Timestamp(dateFormat.parse(timeStr).getTime)

}

然后输入以下数据：

9320432d;lss_0bc36d14;9320432d_4444444441520999402417;9320432d_444444444;2021-03-15 16:19:00
9320432d;lss_0bc36d14;9320432d_4444444441520999402098;9320432d_444444444;2021-03-15 16:20:00
9320432d;lss_0bc36d14;9320432d_4444444441520999402098;9320432d_444444444;2021-03-15 16:21:00
9320432d;lss_0bc36d14;9320432d_4444444441520999402417;9320432d_444444444;2021-03-15 16:22:00
9320432d;lss_0bc36d14;9320432d_4444444441520999402098;9320432d_444444444;2021-03-15 16:23:00
9320432d;lss_0bc36987;9320432d_4444444441520991234556;9320432d_444444444;2021-03-15 16:24:00
9320432d;lss_0bc36987;9320432d_4444444441520991234556;9320432d_444444444;2021-03-15 16:25:00
9320432d;lss_0bc36987;9320432d_4444444441520991234556;9320432d_444444444;2021-03-15 16:26:00
93204234;lss_0bc36987;9320432d_4444444441520991234556;9320432d_444444444;2021-03-15 16:26:00
93204234;lss_0bc36987;9320432d_4444444441520991234556;9320432d_444444444;2021-03-15 16:27:00
93204234;lss_0bc36987;9320432d_4444444441520991234556;9320432d_444444444;2021-03-15 16:28:00

输出结果如下：