Preface
I have been looking into Spark's Structured Streaming recently, and a task at work happened to need something like grouped sorting in a streaming computation, so here is an implementation with Structured Streaming.
Code
import java.sql.Timestamp

import com.xxx.source.Source
import org.apache.spark.sql.streaming.{GroupState, GroupStateTimeout, OutputMode, Trigger}
import org.apache.spark.sql.{Row, SparkSession}

import scala.collection.mutable
import scala.collection.mutable.ListBuffer

// State carried per (platform, f_sid, ds) group: the latest record_time seen for each user.
case class State(key: (String, String, String), latest: mutable.Map[String, Long])

// One output row: the newest guide_sid record for a user within the group.
case class Result(platform: String, fSid: String, ds: String, userId: String, guide_sid: String, recordTime: Timestamp)

object Top {

  def main(args: Array[String]): Unit = {
    val spark = session()
    spark.sparkContext.setLogLevel("WARN")
    import spark.implicits._

    val dataFrame = Source.kafkaSource(spark)
    // DDL schema for the JSON payload carried in the kafka value column
    val scheme = "user_id String,lv String,guide_sid String,my_colum_ String,vip String,record_time Long,f_sid String"
    val guide = dataFrame
      .selectExpr(s"from_json(cast (value as string),'$scheme') as bi", "topic")
      .selectExpr("bi.user_id as user_id", "bi.lv as lv", "bi.guide_sid as guide_sid", "bi.vip as vip",
        "bi.record_time as record_time", "bi.f_sid as f_sid", "bi.my_colum_ as my_colum_",
        "from_unixtime(bi.record_time,'yyyyMMdd') as ds", "cast (topic as string) as platform")
      .filter(x => rowFilter(x))
      .groupByKey(r => (r.getAs[String]("platform"), r.getAs[String]("f_sid"), r.getAs[String]("ds")))
      .flatMapGroupsWithState(OutputMode.Update(), GroupStateTimeout.NoTimeout())(top)

    val query = guide.writeStream
      .outputMode(OutputMode.Update())
      .format("console")
      .option("checkpointLocation", "/ck")
      .option("truncate", "false")
      .trigger(Trigger.ProcessingTime("10 seconds"))
      .start()
    query.awaitTermination()
  }

  def session(): SparkSession = {
    val os = System.getProperty("os.name")
    // "yarn-cluster" is not a valid master URL in Spark 2.x; use "yarn" here and
    // choose the deploy mode via spark-submit --deploy-mode cluster.
    val master = if (os.startsWith("Windows")) "local[*]" else "yarn"
    val appName = this.getClass.getSimpleName
    SparkSession.builder()
      .appName(appName)
      .master(master)
      .getOrCreate()
  }

  // Drop rows whose grouping columns are missing.
  def rowFilter(row: Row): Boolean =
    row.getAs[String]("f_sid") != null && row.getAs[String]("ds") != null

  // For each (platform, f_sid, ds) group: per user, find the newest record in this
  // batch and emit it only if it is newer than the timestamp already held in state.
  def top(key: (String, String, String), value: Iterator[Row], state: GroupState[State]): Iterator[Result] = {
    val oldState = if (state.exists) state.get else State(key, mutable.Map[String, Long]())
    val latest = oldState.latest
    val results = ListBuffer[Result]()
    value.toList
      .groupBy(_.getAs[String]("user_id"))
      .foreach { case (userId, rows) =>
        // newest record for this user in the current batch
        val top = rows.maxBy(_.getAs[Long]("record_time"))
        val guideSid = top.getAs[String]("guide_sid")
        val cur = top.getAs[Long]("record_time")
        val pre = latest.getOrElse[Long](userId, 0L)
        if (pre < cur) {
          latest.update(userId, cur)
          println(s"pre: $pre, cur: $cur")
          results += Result(key._1, key._2, key._3, userId, guideSid, new Timestamp(cur * 1000))
        }
      }
    state.update(State(key, latest))
    results.toIterator
  }
}
This is the part that reads from Kafka:
import java.util.Properties

import scala.collection.JavaConverters._

import com.youkia.utils.ServiceConf
import org.apache.spark.sql.{DataFrame, SparkSession}

object Source {

  private val KAFKA_PROPERTIES: Properties = ServiceConf.getProperties("/conf/kafka.properties")

  /**
   * Kafka data source: turns the properties file into options for the
   * Structured Streaming kafka reader.
   *
   * @param sparkSession the active SparkSession
   * @return a streaming DataFrame with the standard kafka source columns
   *         (key, value, topic, partition, offset, timestamp, ...)
   */
  def kafkaSource(sparkSession: SparkSession): DataFrame = {
    // Copy java.util.Properties into a Scala Map so it can be passed to options()
    val map = KAFKA_PROPERTIES.asScala.toMap
    sparkSession.readStream
      .format("kafka")
      .options(map)
      .load()
  }
}
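If a ServiceConf-style properties loader is not at hand, the same source can be declared inline. A minimal sketch, assuming a local broker and a topic name (both placeholders, not from the original setup):

// Inline equivalent of Source.kafkaSource. The option keys are the standard ones
// of the Structured Streaming kafka source; broker and topic are assumptions.
val df = spark.readStream
  .format("kafka")
  .option("kafka.bootstrap.servers", "localhost:9092") // assumed broker address
  .option("subscribe", "bi_events")                    // assumed topic name
  .option("startingOffsets", "latest")
  .load()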
Notes
The overall function: consume data from Kafka in real time (the payload is a JSON string), parse it, group by platform, f_sid, ds and user_id, and keep only each user's record with the latest record_time for the day. Because real-time data cannot guarantee event-time ordering, we cannot simply take the last record to arrive; we have to order everything seen so far and take the top 1, which is what the per-user timestamp kept in state achieves.
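Stripped of Spark, the rule amounts to a small pure function. A sketch for illustration only (the function name and tuple shape are made up here):

// For each user, take the newest record in the batch, then emit it only if it
// is newer than the high-water mark the state already holds for that user.
def latestPerUser(batch: Seq[(String, Long)],
                  seen: Map[String, Long]): (Seq[(String, Long)], Map[String, Long]) = {
  val tops = batch.groupBy(_._1).values.map(_.maxBy(_._2)).toSeq
  val emitted = tops.filter { case (user, t) => t > seen.getOrElse(user, 0L) }
  (emitted, seen ++ emitted)
}

// latestPerUser(Seq(("u1", 5L), ("u1", 9L), ("u2", 3L)), Map("u2" -> 7L))
// => (Seq(("u1", 9L)), Map("u2" -> 7L, "u1" -> 9L)) -- u2's record is stale and dropped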
flatMapGroupsWithState and mapGroupsWithState are the full-state operators in Structured Streaming, similar to updateStateByKey and mapWithState in Spark Streaming: previous aggregation results are all kept in state, and the state is updated each time new data arrives.
The main difference between mapGroupsWithState and flatMapGroupsWithState is that the former requires the function to return one and only one record, while the latter allows the function to return any number of records (including none). In addition, flatMapGroupsWithState has two output modes, Append and Update. Both operators can use GroupStateTimeout to configure how state times out.
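For contrast, here is a minimal mapGroupsWithState sketch that keeps a running count per key and returns exactly one record per group invocation (names and types are made up for illustration; note that in streaming queries mapGroupsWithState only supports the Update output mode):

import org.apache.spark.sql.streaming.{GroupState, GroupStateTimeout}

// Must return exactly one value per invoked group: the updated running count.
def countPerKey(key: String, rows: Iterator[String],
                state: GroupState[Long]): (String, Long) = {
  val total = state.getOption.getOrElse(0L) + rows.size
  state.update(total)
  (key, total)
}

// wiring, given some ds: Dataset[String]:
// ds.groupByKey(identity)
//   .mapGroupsWithState(GroupStateTimeout.NoTimeout())(countPerKey)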
Notes on using GroupStateTimeout:
- The timeout type is a global parameter across all groups: it is set once in [map|flatMap]GroupsWithState, while the exact timeout duration/timestamp can be configured per group by calling state.setTimeout...() (see the sketch after this list).
- Timeouts can be based on processing time (GroupStateTimeout.ProcessingTimeTimeout) or on event time (GroupStateTimeout.EventTimeTimeout).
- With EventTimeTimeout, you must also set an event-time watermark via Dataset.withWatermark().
- When a group times out, the function is invoked for that group with no values and with GroupState.hasTimedOut() returning true.
- The timeout is reset every time the function is called on a group, that is, whenever the group receives new data or the group times out.
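To make the list concrete, here is how a processing-time timeout could be bolted onto the top function above. A sketch under assumptions: the one-hour duration and the drop-idle-state policy are illustrative choices, not part of the original job:

import org.apache.spark.sql.Row
import org.apache.spark.sql.streaming.{GroupState, GroupStateTimeout, OutputMode}

// Same shape as top(), but timeout-aware: every call renews the per-group timer,
// and a group with no new data eventually comes back with hasTimedOut == true.
def topWithTimeout(key: (String, String, String), rows: Iterator[Row],
                   state: GroupState[State]): Iterator[Result] = {
  if (state.hasTimedOut) {
    state.remove() // drop state for idle groups (assumed cleanup policy)
    Iterator.empty
  } else {
    val out = top(key, rows, state)    // reuse the original per-user logic
    state.setTimeoutDuration("1 hour") // per-group timeout duration (assumed)
    out
  }
}

// wired in like before, with the timeout type declared globally:
// .flatMapGroupsWithState(OutputMode.Update(),
//   GroupStateTimeout.ProcessingTimeTimeout())(topWithTimeout)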