netstat -tunpl |grep 9999
kill -9 78728
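1. 解析:主要要明白窗口是从第 0 秒开始对齐的(例如 [0,10)、[5,15)、[10,20) ……)
窗口长度为 10 秒,每 5 秒滑动(计算)一次;水位线 = 当前最大事件时间 - 5 秒
输入 45 时,水位线到达 40,会输出 30 - 40 窗口的数据
输入 50 时,水位线到达 45,会输出 35 - 45 窗口的数据
要想产生迟到数据:把 31 放在 45 之后输入,它所属的窗口(25 - 35、30 - 40)都已触发,所以算迟到
把 36 放在 45 之后输入不会迟到,因为它所属的 35 - 45 窗口还没有触发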
/**
* @author jiasongfan
* @date 2022/6/29
* @apiNote
*/
import org.apache.flink.api.common.eventtime.{SerializableTimestampAssigner, WatermarkStrategy}
import org.apache.flink.streaming.api.scala._
import org.apache.flink.streaming.api.windowing.assigners.{SlidingEventTimeWindows, SlidingProcessingTimeWindows, TumblingEventTimeWindows}
import org.apache.flink.streaming.api.windowing.time.Time
import org.apache.flink.streaming.api.windowing.windows.TimeWindow
import java.time.Duration
/**
 * Event-time word count over a socket text stream using a sliding window,
 * with late records routed to a side output.
 *
 * Each input line is expected to look like `<word> <seconds>`, for example `hello 36`.
 * The second field is multiplied by 1000 and used as the event timestamp in milliseconds.
 */
object Test04 {
  import scala.util.Try

  /**
   * Parses one input line into `(word, eventTimeMillis, 1)`.
   *
   * Fields are separated by one or more spaces; any fields after the second are ignored.
   *
   * @param line raw text line read from the socket
   * @return the parsed record, or `None` when the line is blank, has no timestamp field,
   *         or the timestamp is not a valid integer
   */
  private def parseRecord(line: String): Option[(String, Long, Int)] =
    line.trim.split(" +") match {
      case Array(word, seconds, _*) if word.nonEmpty =>
        Try(seconds.trim.toLong * 1000L).toOption.map(millis => (word, millis, 1))
      case _ =>
        None
    }

  /**
   * Builds and runs the streaming job.
   *
   * @param args command-line arguments (unused)
   */
  def main(args: Array[String]): Unit = {
    val env = StreamExecutionEnvironment.getExecutionEnvironment
    env.setParallelism(1)
    // env.enableCheckpointing(1000)
    val text = env.socketTextStream("hdp1", 9999)

    // 1) Feed the input with nc; the data is simulated by hand.
    // 2) Read the data and assign watermarks with a maximum out-of-orderness of 5 s.
    // 3) Requirement 1: every 5 s, count each word over the last 5 s
    //    (accounting for delayed data) -- time-based tumbling window.
    // 4) Requirement 2: every 5 s, count each word over the last 10 s
    //    (late-data output must be verifiable) -- time-based sliding window.

    // flatMap instead of map: a blank or malformed line typed into nc is skipped
    // rather than throwing inside the operator.
    val v1: DataStream[(String, Long, Int)] =
      text.flatMap(line => parseRecord(line).toList)

    // The event timestamp is the second tuple field (already in milliseconds).
    val timeDS: DataStream[(String, Long, Int)] = v1.assignTimestampsAndWatermarks(
      WatermarkStrategy
        .forBoundedOutOfOrderness[(String, Long, Int)](Duration.ofSeconds(5))
        .withTimestampAssigner(new SerializableTimestampAssigner[(String, Long, Int)] {
          override def extractTimestamp(element: (String, Long, Int), recordTimestamp: Long): Long =
            element._2
        })
    )

    // Tag under which the window operator emits late records.
    val tag1 = new OutputTag[(String, Long, Int)]("late_data")

    val win1DS: WindowedStream[(String, Long, Int), String, TimeWindow] = timeDS
      .keyBy(_._1)
      // Requirement 2: 10 s window sliding every 5 s.
      .window(SlidingEventTimeWindows.of(Time.seconds(10), Time.seconds(5)))
      // Requirement 1 alternative (tumbling 5 s window):
      // .window(TumblingEventTimeWindows.of(Time.seconds(5)))
      .sideOutputLateData(tag1)

    // Sum the count field (tuple index 2) per word and window.
    val sumDS: DataStream[(String, Long, Int)] = win1DS.sum(2)
    sumDS.print()

    // The late-data side output is read from the result of the window operation.
    sumDS.getSideOutput(tag1)
      .print("迟到数据")

    env.execute("Window Stream WordCount")
  }
}