Preface
I have been looking into Spark's Structured Streaming recently, and a task at work happened to need something like grouped sorting in a streaming computation, so here is an implementation with Structured Streaming.
Code
import java.sql.Timestamp

import com.xxx.source.Source
import org.apache.spark.sql.streaming.{GroupState, GroupStateTimeout, OutputMode, Trigger}
import org.apache.spark.sql.{Row, SparkSession}

import scala.collection.mutable
import scala.collection.mutable.ListBuffer

// State carried per (platform, f_sid, ds) group: the latest record_time seen for each user.
case class State(key: (String, String, String), latest: mutable.Map[String, Long])

// One output row: the newest guide_sid record for a user within the group.
case class Result(platform: String, fSid: String, ds: String, userId: String, guide_sid: String, recordTime: Timestamp)

object Top {

  def main(args: Array[String]): Unit = {
    val spark = session()
    spark.sparkContext.setLogLevel("WARN")
    import spark.implicits._

    val dataFrame = Source.kafkaSource(spark)
    // DDL schema for the JSON payload carried in the kafka value column
    val scheme = "user_id String,lv String,guide_sid String,my_colum_ String,vip String,record_time Long,f_sid String"
    val guide = dataFrame
      .selectExpr(s"from_json(cast (value as string),'$scheme') as bi", "topic")
      .selectExpr("bi.user_id as user_id", "bi.lv as lv", "bi.guide_sid as guide_sid", "bi.vip as vip",
        "bi.record_time as record_time", "bi.f_sid as f_sid", "bi.my_colum_ as my_colum_",
        "from_unixtime(bi.record_time,'yyyyMMdd') as ds", "cast (topic as string) as platform")
      .filter(x => rowFilter(x))
      .groupByKey(r => (r.getAs[String]("platform"), r.getAs[String]("f_sid"), r.getAs[String]("ds")))
      .flatMapGroupsWithState(OutputMode.Update(), GroupStateTimeout.NoTimeout())(top)

    val query = guide.writeStream
      .outputMode(OutputMode.Update())
      .format("console")
      .option("checkpointLocation", "/ck")
      .option("truncate", "false")
      .trigger(Trigger.ProcessingTime("10 seconds"))
      .start()
    query.awaitTermination()
  }

  def session(): SparkSession = {
    val os = System.getProperty("os.name")
    // "yarn-cluster" is not a valid master URL in Spark 2.x; use "yarn" here and
    // choose the deploy mode via spark-submit --deploy-mode cluster.
    val master = if (os.startsWith("Windows")) "local[*]" else "yarn"
    val appName = this.getClass.getSimpleName
    SparkSession.builder()
      .appName(appName)
      .master(master)
      .getOrCreate()
  }

  // Drop rows whose grouping columns are missing.
  def rowFilter(row: Row): Boolean =
    row.getAs[String]("f_sid") != null && row.getAs[String]("ds") != null

  // For each (platform, f_sid, ds) group: per user, find the newest record in this
  // batch and emit it only if it is newer than the timestamp already held in state.
  def top(key: (String, String, String), value: Iterator[Row], state: GroupState[State]): Iterator[Result] = {
    val oldState = if (state.exists) state.get else State(key, mutable.Map[String, Long]())
    val latest = oldState.latest
    val results = ListBuffer[Result]()
    value.toList
      .groupBy(_.getAs[String]("user_id"))
      .foreach { case (userId, rows) =>
        // newest record for this user in the current batch
        val top = rows.maxBy(_.getAs[Long]("record_time"))
        val guideSid = top.getAs[String]("guide_sid")
        val cur = top.getAs[Long]("record_time")
        val pre = latest.getOrElse[Long](userId, 0L)
        if (pre < cur) {
          latest.update(userId, cur)
          println(s"pre: $pre, cur: $cur")
          results += Result(key._1, key._2, key._3, userId, guideSid, new Timestamp(cur * 1000))
        }
      }
    state.update(State(key, latest))
    results.toIterator
  }
}
This is the part that reads from Kafka:
import java.util.Properties

import scala.collection.JavaConverters._

import com.youkia.utils.ServiceConf
import org.apache.spark.sql.{DataFrame, SparkSession}

object Source {

  private val KAFKA_PROPERTIES: Properties = ServiceConf.getProperties("/conf/kafka.properties")

  /**
   * Kafka data source: turns the properties file into options for the
   * Structured Streaming kafka reader.
   *
   * @param sparkSession the active SparkSession
   * @return a streaming DataFrame with the standard kafka source columns
   *         (key, value, topic, partition, offset, timestamp, ...)
   */
  def kafkaSource(sparkSession: SparkSession): DataFrame = {
    // Copy java.util.Properties into a Scala Map so it can be passed to options()
    val map = KAFKA_PROPERTIES.asScala.toMap
    sparkSession.readStream
      .format("kafka")
      .options(map)
      .load()
  }
}
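If a ServiceConf-style properties loader is not at hand, the same source can be declared inline. A minimal sketch, assuming a local broker and a topic name (both placeholders, not from the original setup):

// Inline equivalent of Source.kafkaSource. The option keys are the standard ones
// of the Structured Streaming kafka source; broker and topic are assumptions.
val df = spark.readStream
  .format("kafka")
  .option("kafka.bootstrap.servers", "localhost:9092") // assumed broker address
  .option("subscribe", "bi_events")                    // assumed topic name
  .option("startingOffsets", "latest")
  .load()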
Notes
The overall function: consume data from Kafka in real time (the payload is a JSON string), parse it, group by platform, f_sid, ds and user_id, and keep only each user's record with the latest record_time for the day. Because real-time data cannot guarantee event-time ordering, we cannot simply take the last record to arrive; we have to order everything seen so far and take the top 1, which is what the per-user timestamp kept in state achieves.
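Stripped of Spark, the rule amounts to a small pure function. A sketch for illustration only (the function name and tuple shape are made up here):

// For each user, take the newest record in the batch, then emit it only if it
// is newer than the high-water mark the state already holds for that user.
def latestPerUser(batch: Seq[(String, Long)],
                  seen: Map[String, Long]): (Seq[(String, Long)], Map[String, Long]) = {
  val tops = batch.groupBy(_._1).values.map(_.maxBy(_._2)).toSeq
  val emitted = tops.filter { case (user, t) => t > seen.getOrElse(user, 0L) }
  (emitted, seen ++ emitted)
}

// latestPerUser(Seq(("u1", 5L), ("u1", 9L), ("u2", 3L)), Map("u2" -> 7L))
// => (Seq(("u1", 9L)), Map("u2" -> 7L, "u1" -> 9L)) -- u2's record is stale and dropped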
flatMapGroupsWithState and mapGroupsWithState are the full-state operators in Structured Streaming, similar to updateStateByKey and mapWithState in Spark Streaming: previous aggregation results are all kept in state, and the state is updated each time new data arrives.
The main difference between mapGroupsWithState and flatMapGroupsWithState is that the former requires the function to return one and only one record, while the latter allows the function to return any number of records (including none). In addition, flatMapGroupsWithState has two output modes, Append and Update. Both operators can use GroupStateTimeout to configure how state times out.
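For contrast, here is a minimal mapGroupsWithState sketch that keeps a running count per key and returns exactly one record per group invocation (names and types are made up for illustration; note that in streaming queries mapGroupsWithState only supports the Update output mode):

import org.apache.spark.sql.streaming.{GroupState, GroupStateTimeout}

// Must return exactly one value per invoked group: the updated running count.
def countPerKey(key: String, rows: Iterator[String],
                state: GroupState[Long]): (String, Long) = {
  val total = state.getOption.getOrElse(0L) + rows.size
  state.update(total)
  (key, total)
}

// wiring, given some ds: Dataset[String]:
// ds.groupByKey(identity)
//   .mapGroupsWithState(GroupStateTimeout.NoTimeout())(countPerKey)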
Notes on using GroupStateTimeout:
- The timeout type is a global parameter across all groups: it is set once in [map|flatMap]GroupsWithState, while the exact timeout duration/timestamp can be configured per group by calling state.setTimeout...() (see the sketch after this list).
- Timeouts can be based on processing time (GroupStateTimeout.ProcessingTimeTimeout) or on event time (GroupStateTimeout.EventTimeTimeout).
- With EventTimeTimeout, you must also set an event-time watermark via Dataset.withWatermark().
- When a group times out, the function is invoked for that group with no values and with GroupState.hasTimedOut() returning true.
- The timeout is reset every time the function is called on a group, that is, whenever the group receives new data or the group times out.
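To make the list concrete, here is how a processing-time timeout could be bolted onto the top function above. A sketch under assumptions: the one-hour duration and the drop-idle-state policy are illustrative choices, not part of the original job:

import org.apache.spark.sql.Row
import org.apache.spark.sql.streaming.{GroupState, GroupStateTimeout, OutputMode}

// Same shape as top(), but timeout-aware: every call renews the per-group timer,
// and a group with no new data eventually comes back with hasTimedOut == true.
def topWithTimeout(key: (String, String, String), rows: Iterator[Row],
                   state: GroupState[State]): Iterator[Result] = {
  if (state.hasTimedOut) {
    state.remove() // drop state for idle groups (assumed cleanup policy)
    Iterator.empty
  } else {
    val out = top(key, rows, state)    // reuse the original per-user logic
    state.setTimeoutDuration("1 hour") // per-group timeout duration (assumed)
    out
  }
}

// wired in like before, with the timeout type declared globally:
// .flatMapGroupsWithState(OutputMode.Update(),
//   GroupStateTimeout.ProcessingTimeTimeout())(topWithTimeout)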