聚合函数(AggregateFunctionTest)
package com.liao.chapter06
import com.liao.chapter05.{ClickSource, Event}
import org.apache.flink.api.common.functions.AggregateFunction
import org.apache.flink.streaming.api.TimeCharacteristic
import org.apache.flink.streaming.api.scala._
import org.apache.flink.streaming.api.windowing.assigners.SlidingEventTimeWindows
import org.apache.flink.streaming.api.windowing.time.Time
object AggregateFunctionTest {
  def main(args: Array[String]): Unit = {
    val env = StreamExecutionEnvironment.getExecutionEnvironment
    env.setParallelism(1)
    env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime)

    // Source emits Event records; timestamps are assumed monotonically ascending.
    val stream = env.addSource(new ClickSource)
      .assignAscendingTimestamps(_.timestamp)

    // Key everything to a single constant key so one sliding window sees all events.
    stream.keyBy(data => true)
      .window(SlidingEventTimeWindows.of(Time.seconds(10), Time.seconds(2)))
      .aggregate(new PvUv)
      .print()

    env.execute()
  }

  /**
   * Incremental aggregate computing PV/UV per window:
   * accumulator = (event count, set of distinct users); result = pv / uv.
   */
  class PvUv extends AggregateFunction[Event, (Long, Set[String]), Double] {
    override def createAccumulator(): (Long, Set[String]) = (0L, Set[String]())

    // Count every event, collect the user into the distinct-user set.
    override def add(in: Event, acc: (Long, Set[String])): (Long, Set[String]) =
      (acc._1 + 1, acc._2 + in.user)

    // Average visits per distinct user. The window only fires with >= 1 element,
    // so the user set is non-empty here.
    override def getResult(acc: (Long, Set[String])): Double = acc._1.toDouble / acc._2.size

    // Fix: was `???`, which throws NotImplementedError if Flink merges window
    // state (e.g. session windows). Sum the counts and union the user sets.
    override def merge(acc: (Long, Set[String]), acc1: (Long, Set[String])): (Long, Set[String]) =
      (acc._1 + acc1._1, acc._2 ++ acc1._2)
  }
}
全窗口函数(FullWindowFunctionTest)
package com.liao.chapter06
import com.liao.chapter05.{ClickSource, Event}
import org.apache.flink.streaming.api.TimeCharacteristic
import org.apache.flink.streaming.api.scala._
import org.apache.flink.streaming.api.scala.function.ProcessWindowFunction
import org.apache.flink.streaming.api.windowing.assigners.TumblingEventTimeWindows
import org.apache.flink.streaming.api.windowing.time.Time
import org.apache.flink.streaming.api.windowing.windows.TimeWindow
import org.apache.flink.util.Collector
object FullWindowFunctionTest {
  def main(args: Array[String]): Unit = {
    val env = StreamExecutionEnvironment.getExecutionEnvironment
    env.setParallelism(1)
    env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime)

    // Click events with ascending event-time timestamps.
    val eventStream = env
      .addSource(new ClickSource)
      .assignAscendingTimestamps(_.timestamp)

    eventStream
      .keyBy(data => "key") // constant key: route all events into one keyed stream
      .window(TumblingEventTimeWindows.of(Time.seconds(10)))
      .process(new UvCountByWindow)
      .print()

    env.execute()
  }

  /**
   * Full-window function: buffers all events of a 10s tumbling window and
   * emits the number of distinct users (UV) together with the window bounds.
   */
  class UvCountByWindow extends ProcessWindowFunction[Event, String, String, TimeWindow] {
    override def process(key: String, context: Context, elements: Iterable[Event], out: Collector[String]): Unit = {
      // Distinct users seen in this window.
      val uv = elements.map(_.user).toSet.size
      val windowStart = context.window.getStart
      val windowEnd = context.window.getEnd
      out.collect(s"窗口 $windowStart ~ $windowEnd 的uv值为: $uv")
    }
  }
}
处理迟到数据(ProcessLateDataExample)
package com.liao.chapter06
import com.liao.chapter05.Event
import com.liao.chapter06.UrlViewCountExample.{UrlViewCountAgg, UrlViewCountResult}
import org.apache.flink.streaming.api.TimeCharacteristic
import org.apache.flink.streaming.api.functions.timestamps.BoundedOutOfOrdernessTimestampExtractor
import org.apache.flink.streaming.api.scala._
import org.apache.flink.streaming.api.windowing.assigners.{SlidingEventTimeWindows, TumblingEventTimeWindows}
import org.apache.flink.streaming.api.windowing.time.Time
object ProcessLateDataExample {
  def main(args: Array[String]): Unit = {
    val env = StreamExecutionEnvironment.getExecutionEnvironment
    env.setParallelism(1)
    env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime)

    // Parse "user,url,timestamp" lines from the socket into Event records,
    // then attach bounded-out-of-orderness watermarks (5s max delay).
    val stream = env
      .socketTextStream("hadoop002", 7777)
      .map { line =>
        val fields = line.split(",")
        Event(fields(0).trim, fields(1).trim, fields(2).trim.toLong)
      }
      .assignTimestampsAndWatermarks(new BoundedOutOfOrdernessTimestampExtractor[Event](Time.seconds(5)) {
        override def extractTimestamp(t: Event): Long = t.timestamp
      })

    // Events arriving after watermark + allowed lateness go to this side output.
    val outputTag = OutputTag[Event]("late-data")

    val result = stream
      .keyBy(_.url)
      .window(TumblingEventTimeWindows.of(Time.seconds(10)))
      .allowedLateness(Time.minutes(1)) // keep window state 1 min past the watermark for late updates
      .sideOutputLateData(outputTag)
      .aggregate(new UrlViewCountAgg, new UrlViewCountResult)

    result.print("result")
    stream.print("input")
    result.getSideOutput(outputTag).print("late data")

    env.execute()
  }
}
URL浏览量统计(UrlViewCountExample)
package com.liao.chapter06
import com.liao.chapter05.{ClickSource, Event}
import org.apache.flink.api.common.functions.AggregateFunction
import org.apache.flink.streaming.api.TimeCharacteristic
import org.apache.flink.streaming.api.scala._
import org.apache.flink.streaming.api.scala.function.ProcessWindowFunction
import org.apache.flink.streaming.api.windowing.assigners.SlidingEventTimeWindows
import org.apache.flink.streaming.api.windowing.time.Time
import org.apache.flink.streaming.api.windowing.windows.TimeWindow
import org.apache.flink.util.Collector
/** Per-window view count for one URL, with the window's start/end timestamps (epoch millis). */
case class UrlViewCount(url: String ,count: Long,windowStart :Long,windowEnd: Long)
object UrlViewCountExample {
  def main(args: Array[String]): Unit = {
    val env = StreamExecutionEnvironment.getExecutionEnvironment
    env.setParallelism(1)
    env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime)

    val stream = env.addSource(new ClickSource)
      .assignAscendingTimestamps(_.timestamp)

    // Incremental count per URL, wrapped with window metadata by the process function.
    stream.keyBy(_.url)
      .window(SlidingEventTimeWindows.of(Time.seconds(10), Time.seconds(5)))
      .aggregate(new UrlViewCountAgg, new UrlViewCountResult)
      .print()

    env.execute()
  }

  /** Incremental aggregate: counts events per key/window. */
  class UrlViewCountAgg extends AggregateFunction[Event, Long, Long] {
    override def createAccumulator(): Long = 0L
    override def add(in: Event, acc: Long): Long = acc + 1
    override def getResult(acc: Long): Long = acc

    // Fix: was `???`, which throws NotImplementedError if Flink merges window
    // state (e.g. session windows). Counts merge by addition.
    override def merge(acc: Long, acc1: Long): Long = acc + acc1
  }

  /**
   * Wraps the pre-aggregated count (single element in `elements`) with the
   * key and window bounds into a UrlViewCount record.
   */
  class UrlViewCountResult extends ProcessWindowFunction[Long, UrlViewCount, String, TimeWindow] {
    override def process(url: String, context: Context, elements: Iterable[Long], out: Collector[UrlViewCount]): Unit = {
      // Exactly one element: the result of UrlViewCountAgg for this window.
      val count = elements.iterator.next()
      val start = context.window.getStart
      val end = context.window.getEnd
      out.collect(UrlViewCount(url, count, start, end))
    }
  }
}
WatermarkTest(水印)
package com.liao.chapter06
import java.lang
import com.liao.chapter05.Event
import org.apache.flink.streaming.api.TimeCharacteristic
import org.apache.flink.streaming.api.functions.timestamps.BoundedOutOfOrdernessTimestampExtractor
import org.apache.flink.streaming.api.scala._
import org.apache.flink.streaming.api.scala.function.ProcessWindowFunction
import org.apache.flink.streaming.api.windowing.assigners.TumblingEventTimeWindows
import org.apache.flink.streaming.api.windowing.time.Time
import org.apache.flink.streaming.api.windowing.windows.TimeWindow
import org.apache.flink.util.Collector
object WatermarkTest {
  def main(args: Array[String]): Unit = {
    val env = StreamExecutionEnvironment.getExecutionEnvironment
    env.setParallelism(1)
    env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime)

    // Read "user,url,timestamp" lines from the socket and convert to Event.
    val stream: DataStream[Event] = env
      .socketTextStream("hadoop002", 7777)
      .map { line =>
        val parts = line.split(",")
        Event(parts(0).trim, parts(1).trim, parts(2).toLong)
      }

    stream
      // Bounded-out-of-orderness watermarks: watermark = max timestamp - 5s.
      .assignTimestampsAndWatermarks(new BoundedOutOfOrdernessTimestampExtractor[Event](Time.seconds(5)) {
        override def extractTimestamp(t: Event): Long = t.timestamp
      })
      .keyBy(_.user)
      .window(TumblingEventTimeWindows.of(Time.seconds(10)))
      .process(new WatermarkWindowResult)
      .print()

    env.execute()
  }

  /** Reports per-user event count for each window plus the current watermark at firing time. */
  class WatermarkWindowResult extends ProcessWindowFunction[Event, String, String, TimeWindow] {
    override def process(user: String, context: Context, elements: Iterable[Event], out: Collector[String]): Unit = {
      val start = context.window.getStart
      val end = context.window.getEnd
      val count = elements.size
      // Watermark at the moment the window fired (>= window end for on-time firing).
      val currentWatermark = context.currentWatermark
      out.collect(s"窗口 $start ~ $end , 用户 $user 的活跃度为: $count, 水位线现在位于: $currentWatermark")
    }
  }
}
窗口测试(WindowTest)
package com.liao.chapter06
import com.liao.chapter05.Event
import org.apache.flink.streaming.api.TimeCharacteristic
import org.apache.flink.streaming.api.functions.timestamps.BoundedOutOfOrdernessTimestampExtractor
import org.apache.flink.streaming.api.scala._
import org.apache.flink.streaming.api.windowing.assigners.{EventTimeSessionWindows, SlidingEventTimeWindows, TumblingEventTimeWindows}
import org.apache.flink.streaming.api.windowing.time.Time
object WindowTest {
  def main(args: Array[String]): Unit = {
    val env = StreamExecutionEnvironment.getExecutionEnvironment
    env.setParallelism(1)
    env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime)

    val lineDataStream = env.socketTextStream("hadoop002", 7777)

    // Parse "user,url,timestamp" lines; input timestamps are treated as seconds
    // and scaled to millis in extractTimestamp.
    val stream = lineDataStream
      .map { line =>
        val arr = line.split(",")
        Event(arr(0), arr(1), arr(2).toLong)
      }
      // NOTE(review): delay is 30 *milliseconds* while timestamps advance in
      // whole seconds (* 1000L) — presumably intended as a tiny demo delay;
      // confirm the unit is deliberate.
      .assignTimestampsAndWatermarks(new BoundedOutOfOrdernessTimestampExtractor[Event](Time.milliseconds(30)) {
        override def extractTimestamp(t: Event): Long = t.timestamp * 1000L
      })

    // Per-user minimum timestamp within 15s tumbling event-time windows,
    // with 1 min allowed lateness and a side output for anything later still.
    val resultStream = stream
      .map(data => (data.user, data.timestamp))
      .keyBy(_._1)
      .timeWindow(Time.seconds(15))
      .allowedLateness(Time.minutes(1))
      .sideOutputLateData(new OutputTag[(String, Long)]("late"))
      .minBy(1)

    resultStream.print()
    env.execute("window test")
  }
}
窗口测试2(WindowTest2)
package com.liao.chapter06
import com.liao.chapter05.{ClickSource, Event}
import org.apache.flink.streaming.api.TimeCharacteristic
import org.apache.flink.streaming.api.functions.timestamps.BoundedOutOfOrdernessTimestampExtractor
import org.apache.flink.streaming.api.scala._
import org.apache.flink.streaming.api.windowing.assigners.{EventTimeSessionWindows, SlidingEventTimeWindows, TumblingEventTimeWindows, TumblingProcessingTimeWindows}
import org.apache.flink.streaming.api.windowing.time.Time
object WindowTest2 {
  def main(args: Array[String]): Unit = {
    val env = StreamExecutionEnvironment.getExecutionEnvironment
    env.setParallelism(1)
    env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime)

    // Generated click events with ascending event-time timestamps.
    val stream = env
      .addSource(new ClickSource)
      .assignAscendingTimestamps(_.timestamp)

    // Per-user click count over 5s tumbling event-time windows.
    stream
      .map(data => (data.user, 1))
      .keyBy(_._1)
      .window(TumblingEventTimeWindows.of(Time.seconds(5)))
      .reduce { (acc, in) => (in._1, acc._2 + in._2) }
      .print()

    env.execute()
  }
}