在Structured Streaming中,如果对同一个Dataset连续执行多次聚合操作,可能会报以下错误:
Multiple streaming aggregations are not supported with streaming DataFrames/Datasets
Multiple mapGroupsWithStates are not supported on a streaming DataFrames/Datasets
Mixing mapGroupsWithStates and flatMapGroupsWithStates are not supported on a streaming DataFrames/Datasets
当出现以上错误的时候,在很长一段时间内,我都认为spark压根就不支持多个聚合链,直到昨天我看源码终于发现了这么一句话:
Multiple flatMapGroupsWithStates are not supported when they are not all in append mode or the output mode is not append on a streaming DataFrames/Datasets
看到这里我才发现:多个flatMapGroupsWithState是可以串联起来做连续聚合的,前提是每一个flatMapGroupsWithState都使用Append模式,并且整个查询的输出模式也是Append。也就是说,Structured Streaming并非完全不支持连续聚合,只是必须满足这两个条件。
直接看代码吧:
/**
 * Demo of a chained (two-level) stateful aggregation in Structured Streaming.
 *
 * Socket lines of the form `f0;f1;f2;f3;yyyy-MM-dd HH:mm:ss` are parsed into
 * `PaasInput`, aggregated per `(app_id, aid)` by [[updateAcrossEvents2]], and the
 * results are aggregated again per `app_id` by [[updateAppIdAcrossEvent2]].
 * Both `flatMapGroupsWithState` steps and the sink use Append output mode.
 */
object PaasPvUv {
  /** Format of the event-time field (fifth field of every input line). */
  val dateFormat: FastDateFormat = FastDateFormat.getInstance("yyyy-MM-dd HH:mm:ss")

  /**
   * Builds the local Spark session, wires the two chained stateful aggregations
   * and blocks until the streaming query terminates.
   *
   * @param args command-line arguments (unused)
   */
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf()
      .set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
      .registerKryoClasses(Array(classOf[AidUvState], classOf[Roaring64Bitmap]))
    val spark = SparkSession.builder
      .appName("StructuredStreamingTest")
      .master("local[*]")
      .config(conf)
      .config("spark.sql.shuffle.partitions", "1")
      .getOrCreate()
    spark.sparkContext.setLogLevel("WARN")

    // Streaming source: text lines read from a socket.
    val lines = spark.readStream
      .format("socket")
      .option("host", "192.168.1.171")
      .option("port", 9999)
      .load()

    import spark.implicits._

    val query = lines.as[String]
      // Malformed lines are skipped instead of failing the whole query.
      .flatMap(line => parseLine(line).toList)
      // A watermark is required here (event-time timeout is used below).
      .withWatermark("inputtime", "1 minutes")
      .groupByKey(event => (event.app_id, event.aid))
      .flatMapGroupsWithState(OutputMode.Append(), GroupStateTimeout.EventTimeTimeout())(updateAcrossEvents2)
      // A watermark is required here as well, for the second stateful step.
      .withWatermark("inputtime", "1 minutes")
      .groupByKey(_.app_id)
      .flatMapGroupsWithState(OutputMode.Append(), GroupStateTimeout.EventTimeTimeout())(updateAppIdAcrossEvent2)
      .writeStream
      .outputMode(OutputMode.Append())
      .queryName("Aid_Uv_Count")
      .format("console")
      .start()

    query.awaitTermination()
  }

  /**
   * Parses one `;`-separated input line into a `PaasInput`.
   *
   * @param line raw line read from the socket
   * @return the parsed record, or `None` when the line has fewer than five
   *         fields or its fifth field does not match [[dateFormat]]
   */
  private def parseLine(line: String): Option[PaasInput] = {
    println(s"line===> $line")
    val fields = line.split(";")
    val parsed =
      if (fields.length < 5) None
      else scala.util.Try(
        PaasInput(fields(0), fields(1), fields(2), fields(3), formatTime2TimeStamp(fields(4)))
      ).toOption
    if (parsed.isEmpty) println(s"skipping malformed line: $line")
    parsed
  }

  /**
   * Second-level state function: accumulates the per-(app_id, aid) results of one
   * `app_id` into a running uv/pv total.
   *
   * NOTE(review): [[updateAcrossEvents2]] emits the cumulative counts of a group on
   * every invocation and once more on timeout, and this function adds every
   * incoming `uv`/`pv` to the running total, so app-level totals are over-counted.
   * Fixing it needs per-aid bookkeeping in `AppIdState`, which is declared outside
   * this object.
   *
   * @param appId    grouping key
   * @param events   first-level results for this key in the current trigger
   * @param oldState state handle for this key
   * @return a single result; its last field is `true` when emitted on timeout
   */
  def updateAppIdAcrossEvent2(appId: String, events: Iterator[AidUvResult], oldState: GroupState[AppIdState]): Iterator[AppIdResult] = {
    if (oldState.hasTimedOut) {
      // Timed out: emit the last known totals and drop the state.
      val expired = oldState.get
      oldState.remove()
      Iterator(AppIdResult(appId, expired.uv, expired.pv, expired.inputtime, true))
    } else {
      val initial = oldState.getOption.getOrElse(AppIdState(appId, new Timestamp(0L), 0L, 0L))
      val merged = events.foldLeft(initial) { (acc, event) =>
        val latest = math.max(acc.inputtime.getTime, event.inputtime.getTime)
        acc.copy(
          app_id = event.app_id,
          inputtime = new Timestamp(latest),
          uv = acc.uv + event.uv,
          pv = acc.pv + event.pv)
      }
      val latestMs = merged.inputtime.getTime
      oldState.update(merged)
      // Timeout is scheduled 5 seconds after the latest event time seen so far.
      oldState.setTimeoutTimestamp(latestMs, "5 seconds")
      val ret = AppIdResult(appId, merged.uv, merged.pv, new Timestamp(latestMs), false)
      println(s"ret: $ret")
      Iterator(ret)
    }
  }

  /**
   * First-level state function: tracks the distinct `uid` and `s` values seen for
   * one `(app_id, aid)` key and reports their counts.
   *
   * @param groupKey `(app_id, aid)` grouping key
   * @param events   input records for this key in the current trigger
   * @param oldState state handle for this key
   * @return a single result carrying the sizes of the two distinct-value lists;
   *         its last field is `true` when emitted on timeout
   */
  def updateAcrossEvents2(groupKey: (String, String), events: Iterator[PaasInput], oldState: GroupState[AidUvState]): Iterator[AidUvResult] = {
    val (appId, aid) = groupKey
    if (oldState.hasTimedOut) {
      // Timed out: emit the final counts and drop the state.
      val expired = oldState.get
      oldState.remove()
      Iterator(AidUvResult(appId, aid, expired.uids.size, expired.sessions.size, expired.inputtime, true))
    } else {
      val initial = oldState.getOption.getOrElse(
        AidUvState(appId, aid, new Timestamp(0L), new mutable.MutableList[String], new mutable.MutableList[String]))
      val merged = events.foldLeft(initial) { (acc, event) =>
        val latest = math.max(acc.inputtime.getTime, event.inputtime.getTime)
        // Append a value only when it has not been seen before for this key.
        acc.copy(
          app_id = event.app_id,
          aid = event.aid,
          inputtime = new Timestamp(latest),
          uids = if (acc.uids.contains(event.uid)) acc.uids else acc.uids :+ event.uid,
          sessions = if (acc.sessions.contains(event.s)) acc.sessions else acc.sessions :+ event.s)
      }
      val latestMs = merged.inputtime.getTime
      oldState.update(merged)
      // Timeout is scheduled 5 seconds after the latest event time seen so far.
      oldState.setTimeoutTimestamp(latestMs, "5 seconds")
      val ret = AidUvResult(appId, aid, merged.uids.size, merged.sessions.size, new Timestamp(latestMs), false)
      println(s"ret: $ret")
      Iterator(ret)
    }
  }

  /**
   * Converts a `yyyy-MM-dd HH:mm:ss` string into a SQL timestamp.
   *
   * @param timeStr time string in the [[dateFormat]] pattern
   * @return the corresponding `Timestamp`
   */
  def formatTime2TimeStamp(timeStr: String): Timestamp =
    new Timestamp(dateFormat.parse(timeStr).getTime)
}
然后输入以下数据:
9320432d;lss_0bc36d14;9320432d_4444444441520999402417;9320432d_444444444;2021-03-15 16:19:00
9320432d;lss_0bc36d14;9320432d_4444444441520999402098;9320432d_444444444;2021-03-15 16:20:00
9320432d;lss_0bc36d14;9320432d_4444444441520999402098;9320432d_444444444;2021-03-15 16:21:00
9320432d;lss_0bc36d14;9320432d_4444444441520999402417;9320432d_444444444;2021-03-15 16:22:00
9320432d;lss_0bc36d14;9320432d_4444444441520999402098;9320432d_444444444;2021-03-15 16:23:00
9320432d;lss_0bc36987;9320432d_4444444441520991234556;9320432d_444444444;2021-03-15 16:24:00
9320432d;lss_0bc36987;9320432d_4444444441520991234556;9320432d_444444444;2021-03-15 16:25:00
9320432d;lss_0bc36987;9320432d_4444444441520991234556;9320432d_444444444;2021-03-15 16:26:00
93204234;lss_0bc36987;9320432d_4444444441520991234556;9320432d_444444444;2021-03-15 16:26:00
93204234;lss_0bc36987;9320432d_4444444441520991234556;9320432d_444444444;2021-03-15 16:27:00
93204234;lss_0bc36987;9320432d_4444444441520991234556;9320432d_444444444;2021-03-15 16:28:00
输出结果如下:
以上就是Structured Streaming下一个完整的连续聚合链的实现,不过只适用于Append模式。以后再遇到异常信息,还是得多翻翻源码,说不定就能发现新的解决方案。