(一)前言-回顾Watermark+AllowedLateness
-
Flink提供了allowedLateness方法,allowedLateness只针对Event Time有效,在Watermark已经一定程度允许延迟数据的基础上,进一步解决了乱序数据的处理问题!
-
allowedLateness主要是改变了窗口的销毁时机,并对该窗口已有的数据做缓存,但这可能使得窗口再次(多次)被触发,相当于对前一次窗口计算结果的不断修正(累加计算或者累加撤回计算);
-
注意再次触发窗口时,状态值会累加,要考虑state在计算时的去重问题。
-
注意再次触发窗口时,同一个key的同一个window结果可能被sink多次(触发多少次补偿计算则会输出多少次sink),因此sink接收端需要注意去重问题
单纯使用Window+AllowedLateness可在延迟数据到来时再次触发Window计算,但延迟比较严重的数据仍然会被丢弃。为此,Flink还提供了侧输出机制(Side Output,通过OutputTag标识),支持将严重迟到的数据收集起来,作为一个全新的数据流。我们可以基于这条严重迟到的数据流再额外做自己的业务逻辑处理(例如:根据数据KEY打上标签,然后启动定时补偿,重新将该KEY的所有数据推入Flink再次计算等)。
(二)AllowedLateness+SideOutPutTag
可以看到,除了AllowedLateness机制导致的Window重复计算外,对高延迟的数据也做了额外的收集,这样就避免了数据的丢失!
(三)DEMO
package com.leilei;
import cn.hutool.core.util.RandomUtil;
import com.alibaba.fastjson.JSON;
import org.apache.flink.api.common.RuntimeExecutionMode;
import org.apache.flink.api.common.eventtime.WatermarkStrategy;
import org.apache.flink.api.common.typeinfo.TypeInformation;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.datastream.WindowedStream;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.source.SourceFunction;
import org.apache.flink.streaming.api.functions.windowing.WindowFunction;
import org.apache.flink.streaming.api.windowing.assigners.TumblingEventTimeWindows;
import org.apache.flink.streaming.api.windowing.time.Time;
import org.apache.flink.streaming.api.windowing.windows.TimeWindow;
import org.apache.flink.util.Collector;
import org.apache.flink.util.OutputTag;
import java.time.Duration;
import java.time.LocalDate;
import java.time.format.DateTimeFormatter;
/**
* @author lei
* @version 1.0
* @date 2021/3/17 20:49
* @desc flink 使用 watermark水位线 +allowedLateness+sideOutPutTag(侧道输出) 既可允许短时间延迟,又可收集延迟较大的数据
*/
public class AllowedLatenessSideOutPutTag {
/**
 * Builds and runs the demo job: an event-time tumbling window over {@link LocationSource}
 * that tolerates moderately late records and routes records that are too late to a side output.
 *
 * @param args command-line arguments (unused)
 * @throws Exception if the job fails to execute
 */
public static void main(String[] args) throws Exception {
    StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
    env.setRuntimeMode(RuntimeExecutionMode.STREAMING);

    // Input: generated locations.
    DataStreamSource<Location> rawLocations = env.addSource(new LocationSource());

    // Event time is taken from devTime; the watermark tolerates 5 seconds of out-of-orderness.
    WatermarkStrategy<Location> watermarkStrategy = WatermarkStrategy
            .<Location>forBoundedOutOfOrderness(Duration.ofSeconds(5))
            .withTimestampAssigner((event, timestamp) -> event.getDevTime());
    SingleOutputStreamOperator<Location> timestampedLocations =
            rawLocations.assignTimestampsAndWatermarks(watermarkStrategy);

    // Tag identifying the side output that receives records too late for their window.
    OutputTag<Location> lateTag = new OutputTag<>("delayLocation", TypeInformation.of(Location.class));

    // 10-second tumbling event-time windows per vehicle, with 5 extra seconds of allowed
    // lateness; anything arriving after that is diverted to lateTag.
    WindowedStream<Location, Integer, TimeWindow> lateTolerantWindows = timestampedLocations
            .keyBy(Location::getVehicleId)
            .window(TumblingEventTimeWindows.of(Time.seconds(10)))
            .allowedLateness(Time.seconds(5))
            .sideOutputLateData(lateTag);

    SingleOutputStreamOperator<String> windowResults = lateTolerantWindows.apply(new AlarmCalcWindow());
    windowResults.print("正常数据>>");

    // TODO: apply real transformations to the late stream; for now it is sunk directly to stderr.
    DataStream<Location> lateLocations = windowResults.getSideOutput(lateTag);
    lateLocations.printToErr("延迟数据>>");

    env.execute();
}
/**
 * Demo source that emits one {@code Location} for vehicle 1 every two seconds until cancelled.
 * Each record's {@code devTime} is set some seconds in the past so that downstream event-time
 * windows see out-of-order and late data.
 */
public static class LocationSource implements SourceFunction<Location> {
    /**
     * Loop guard. Declared volatile because {@link #cancel()} is invoked from a different
     * thread than {@link #run}; without it the emitting loop may never observe the update.
     */
    private volatile boolean running = true;

    /**
     * Emits locations in a loop until {@link #cancel()} is called.
     *
     * @param ctx context used to emit records
     * @throws Exception if the pause between records is interrupted
     */
    @Override
    public void run(SourceContext<Location> ctx) throws Exception {
        while (running) {
            int vehicleId = 1;
            Location location = Location.builder()
                    .vehicleId(vehicleId)
                    .plate("川A000" + vehicleId)
                    .color("黄")
                    .date(Integer.parseInt(LocalDate.now().format(DateTimeFormatter.BASIC_ISO_DATE)))
                    .gpsSpeed(RandomUtil.randomInt(90, 100))
                    .limitSpeed(RandomUtil.randomInt(88, 95))
                    // Back-date the event by a random number of seconds (bounds 5 and 40 as
                    // interpreted by RandomUtil.randomInt); long arithmetic keeps it in millis.
                    .devTime(System.currentTimeMillis() - RandomUtil.randomInt(5, 40) * 1000L)
                    .build();
            ctx.collect(location);
            Thread.sleep(2000);
        }
    }

    /** Signals the emitting loop in {@link #run} to stop after the current iteration. */
    @Override
    public void cancel() {
        running = false;
    }
}
/**
 * Window function that prints the bounds of the fired window and emits every element
 * of the window as a JSON string.
 */
public static class AlarmCalcWindow implements WindowFunction<Location, String, Integer, TimeWindow> {
    /**
     * Logs the window's start and end, then forwards each element serialized as JSON.
     *
     * @param key    key of the window being evaluated
     * @param window the window being evaluated
     * @param input  elements currently in the window
     * @param out    collector receiving one JSON string per element
     */
    @Override
    public void apply(Integer key, TimeWindow window, Iterable<Location> input, Collector<String> out) {
        String firedMessage =
                String.format("窗口执行--开始时间:%s-------结束时间:%s", window.getStart(), window.getEnd());
        System.out.println(firedMessage);
        // TODO: sort the elements by time before emitting
        input.forEach(location -> out.collect(JSON.toJSONString(location)));
    }
}
}