大部分DataStream API算子的输出是单一输出,也就是某一种数据类型的流。split算子是个例外,它可以将一条流分成多条流,但这些流的数据类型仍然必须相同。而process function的side outputs功能不仅可以产生多条流,并且这些流的数据类型可以互不相同。一个side output可以定义为OutputTag[X]对象,X是该输出流的数据类型。process function可以通过Context对象将一个事件发射到一个或者多个side outputs。
一 数据源
/**
 * Entry point of the side-output demo job.
 *
 * Reads text lines from a socket on `localhost:7777`, parses each line into a
 * `SensorReading`, routes the readings through `TempStageProcess`, and prints
 * the main stream plus the "low-tmp" and "high-tmp" side outputs.
 *
 * Each input line must contain three fields separated by commas and/or
 * whitespace, e.g. `sensor_1,1547718191,35.8`: a string id, a `Long`
 * (presumably an epoch timestamp — confirm against `SensorReading`), and a
 * `Double` reading. A line with fewer than three fields or with non-numeric
 * values makes the parsing lambda throw.
 *
 * @param args command-line arguments (unused)
 */
def main(args: Array[String]): Unit = {
  val env: StreamExecutionEnvironment = StreamExecutionEnvironment.getExecutionEnvironment
  // Use event-time semantics (the time at which the event occurred).
  env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime)
  env.setParallelism(1)

  val socketSource: DataStream[String] = env.socketTextStream("localhost", 7777)
  val mapStream: DataStream[SensorReading] = socketSource
    .map(data => {
      // Split on commas/whitespace only. Splitting on "\\W+" would also treat the
      // decimal point and the minus sign as separators, so "35.8" would be read
      // as 35.0 and "-5.0" as 5.0.
      val split: Array[String] = data.split("[,\\s]+")
      SensorReading(split(0).trim, split(1).trim.toLong, split(2).trim.toDouble)
    })

  // Route every reading to the main output or to one of the side outputs.
  val tmpStageStream: DataStream[SensorReading] = mapStream.process(new TempStageProcess())
  tmpStageStream.print("main")

  // The tag ids must match the ones TempStageProcess emits to.
  val lowStream: DataStream[(String, Double)] =
    tmpStageStream.getSideOutput(new OutputTag[(String, Double)]("low-tmp"))
  val highStream: DataStream[(String, Double)] =
    tmpStageStream.getSideOutput(new OutputTag[(String, Double)]("high-tmp"))
  lowStream.print("low")
  highStream.print("high")

  env.execute()
}
二 分流
/**
 * Splits a stream of `SensorReading`s into three outputs based on `tm`:
 *
 *  - `tm < 10`  -> side output "low-tmp"  as `(id, tm)`
 *  - `tm > 70`  -> side output "high-tmp" as `(id, tm)`
 *  - otherwise  -> the main output, forwarding the reading unchanged
 *
 * Both bounds are strict, so readings of exactly 10 or 70 stay on the main output.
 */
class TempStageProcess() extends ProcessFunction[SensorReading, SensorReading] {

  // Side-output tags; the ids are what consumers pass to getSideOutput.
  lazy val lowTmp: OutputTag[(String, Double)] = new OutputTag[(String, Double)]("low-tmp")
  lazy val HighTmp: OutputTag[(String, Double)] = new OutputTag[(String, Double)]("high-tmp")

  /**
   * Routes a single reading to exactly one of the three outputs.
   *
   * @param value     the incoming reading
   * @param context   process-function context used to emit to side outputs
   * @param collector collector for the main output
   */
  override def processElement(
      value: SensorReading,
      context: ProcessFunction[SensorReading, SensorReading]#Context,
      collector: Collector[SensorReading]): Unit =
    value.tm match {
      case reading if reading < 10 =>
        context.output(lowTmp, (value.id, reading))
      case reading if reading > 70 =>
        context.output(HighTmp, (value.id, reading))
      case _ =>
        collector.collect(value)
    }
}
三 输出结果
main> SensorReading(sensor_1,1547718191,20.0)
main> SensorReading(sensor_1,1547718192,10.0)
low> (sensor_1,1.0)
high> (sensor_1,80.0)