自定义trigger的主要目的是为了等待数据到齐:
代码如下; flink版本1.6
-
public class WatermarkTest {
-
public static void main(String[] args) throws Exception {
-
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
-
env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime);
-
Properties properties = new Properties();
-
properties.setProperty("bootstrap.servers", GlobalConstants.KAFKA_BROKER);
-
properties.setProperty("group.id", "crm_stream_window");
-
properties.setProperty(ConsumerConfig.AUTO_OFFSET_RESET_CONFIG, "latest");
-
DataStream<String> stream =
-
env.addSource(new FlinkKafkaConsumer011<>("test", new SimpleStringSchema(), properties));
-
DataStream<Tuple3<String, Long, Integer>> inputMap = stream.map(new MapFunction<String, Tuple3<String, Long, Integer>>() {
-
private static final long serialVersionUID = -8812094804806854937L;
-
@Override
-
public Tuple3<String, Long, Integer> map(String value) throws Exception {
-
return new Tuple3<>(value.split("\\W+")[0], Long.valueOf(value.split("\\W+")[1]), Integer.valueOf(value.split("\\W+")[2]));
-
}
-
});
-
DataStream<Tuple3<String, Long, Integer>> watermark =
-
inputMap.assignTimestampsAndWatermarks(new AssignerWithPeriodicWatermarks<Tuple3<String, Long, Integer>>() {
-
private static final long serialVersionUID = 8252616297345284790L;
-
Long currentMaxTimestamp = 0L;
-
Long maxOutOfOrderness = 2000L;//最大允许的乱序时间是10s
-
Watermark watermark = null;
-
SimpleDateFormat format = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss.SSS");
-
@Nullable
-
@Override
-
public Watermark getCurrentWatermark() {
-
watermark = new Watermark(currentMaxTimestamp - maxOutOfOrderness);
-
return watermark;
-
}
-
@Override
-
public long extractTimestamp(Tuple3<String, Long, Integer> element, long previousElementTimestamp) {
-
Long timestamp = element.f1;
-
currentMaxTimestamp = Math.max(timestamp, currentMaxTimestamp);
-
System.out.println("timestamp : " + element.f1 + "|" + format.format(element.f1) + " currentMaxTimestamp : " + currentMaxTimestamp + "|" + format.format(currentMaxTimestamp) + "," + " watermark : " + watermark.getTimestamp() + "|" + format.format(watermark.getTimestamp()));
-
return timestamp;
-
}
-
});
-
OutputTag<Tuple3<String, Long, Integer>> lateOutputTag = new OutputTag<Tuple3<String, Long, Integer>>("late-data") {
-
private static final long serialVersionUID = -1552769100986888698L;
-
};
-
SingleOutputStreamOperator<String> resultStream = watermark
-
.keyBy(0)
-
.window(TumblingEventTimeWindows.of(Time.seconds(3)))
-
.trigger(new Trigger<Tuple3<String, Long, Integer>, TimeWindow>() {
-
private static final long serialVersionUID = 2742133264310093792L;
-
ValueStateDescriptor<Integer> sumStateDescriptor = new ValueStateDescriptor<Integer>("sum", Integer.class);
-
@Override
-
public TriggerResult onElement(Tuple3<String, Long, Integer> element, long timestamp, TimeWindow window, TriggerContext ctx) throws Exception {
-
ValueState<Integer> sumState = ctx.getPartitionedState(sumStateDescriptor);
-
if (null == sumState.value()) {
-
sumState.update(0);
-
}
-
sumState.update(element.f2 + sumState.value());
-
if (sumState.value() >= 2) {
-
//这里可以选择手动处理状态
-
// 默认的trigger发送是TriggerResult.FIRE 不会清除窗口数据
-
return TriggerResult.FIRE_AND_PURGE;
-
}
-
return TriggerResult.CONTINUE;
-
}
-
@Override
-
public TriggerResult onProcessingTime(long time, TimeWindow window, TriggerContext ctx) throws Exception {
-
return TriggerResult.CONTINUE;
-
}
-
@Override
-
public TriggerResult onEventTime(long time, TimeWindow window, TriggerContext ctx) throws Exception {
-
return TriggerResult.CONTINUE;
-
}
-
@Override
-
public void clear(TimeWindow window, TriggerContext ctx) throws Exception {
-
System.out.println("清理窗口状态 窗口内保存值为" + ctx.getPartitionedState(sumStateDescriptor).value());
-
ctx.getPartitionedState(sumStateDescriptor).clear();
-
}
-
})
-
//如果使用allowedLateness会有重复计算的效果
-
//默认的trigger情况下
-
// 在event time>window_end_time+watermark+allowedLateness时会触发窗口的clear
-
// 后续数据如果属于该窗口而且数据的event_time>watermark-allowedLateness 会触发重新计算
-
//
-
//在使用自定义的trigger情况下
-
//同一个窗口内只要满足要求可以不停的触发窗口数据往下流
-
//在event time>window_end_time+watermark+allowedLateness时会触发窗口clear
-
//后续数据如果属于该窗口而且数据的event_time>watermark-allowedLateness 会触发重新计算
-
//
-
//窗口状态的clear只和时间有关与是否自定义trigger无关
-
.allowedLateness(Time.seconds(3))
-
.sideOutputLateData(lateOutputTag)
-
.apply(new WindowFunction<Tuple3<String, Long, Integer>, String, Tuple, TimeWindow>() {
-
private static final long serialVersionUID = 7813420265419629362L;
-
@Override
-
public void apply(Tuple tuple, TimeWindow window, Iterable<Tuple3<String, Long, Integer>> input, Collector<String> out) throws Exception {
-
for (Tuple3<String, Long, Integer> stringLongTuple2 : input) {
-
System.out.println(stringLongTuple2.f1);
-
}
-
SimpleDateFormat format = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss.SSS");
-
out.collect("window " + format.format(window.getStart()) + " window " + format.format(window.getEnd()));
-
}
-
});
-
resultStream.print();
-
// resultStream.getSideOutput(lateOutputTag).print();
-
env.execute("window test");
-
}
比较了自定义trigger和默认的trigger在event time的前提下,watermark和allowedLateness对trigger的影响。
默认trigger加allowedLateness: 迟到数据到达时,窗口中原有的数据会被一起重新触发并输出
-
timestamp : 1461756862000|2016-04-27 19:34:22.000 currentMaxTimestamp : 1461756862000|2016-04-27 19:34:22.000, watermark : -2000|1970-01-01 07:59:58.000
-
timestamp : 1461756863000|2016-04-27 19:34:23.000 currentMaxTimestamp : 1461756863000|2016-04-27 19:34:23.000, watermark : 1461756860000|2016-04-27 19:34:20.000
-
timestamp : 1461756864000|2016-04-27 19:34:24.000 currentMaxTimestamp : 1461756864000|2016-04-27 19:34:24.000, watermark : 1461756861000|2016-04-27 19:34:21.000
-
timestamp : 1461756865000|2016-04-27 19:34:25.000 currentMaxTimestamp : 1461756865000|2016-04-27 19:34:25.000, watermark : 1461756862000|2016-04-27 19:34:22.000
-
timestamp : 1461756866000|2016-04-27 19:34:26.000 currentMaxTimestamp : 1461756866000|2016-04-27 19:34:26.000, watermark : 1461756863000|2016-04-27 19:34:23.000
-
1461756862000
-
1461756863000
-
8> window 2016-04-27 19:34:21.000 window 2016-04-27 19:34:24.000
-
timestamp : 1461756862000|2016-04-27 19:34:22.000 currentMaxTimestamp : 1461756866000|2016-04-27 19:34:26.000, watermark : 1461756864000|2016-04-27 19:34:24.000
-
1461756862000
-
1461756863000
-
1461756862000
-
8> window 2016-04-27 19:34:21.000 window 2016-04-27 19:34:24.000
自定义trigger加allowedLateness: 会将落后的数据直接往下发送
-
timestamp : 1461756862000|2016-04-27 19:34:22.000 currentMaxTimestamp : 1461756862000|2016-04-27 19:34:22.000, watermark : -2000|1970-01-01 07:59:58.000
-
timestamp : 1461756863000|2016-04-27 19:34:23.000 currentMaxTimestamp : 1461756863000|2016-04-27 19:34:23.000, watermark : 1461756860000|2016-04-27 19:34:20.000
-
1461756862000
-
1461756863000
-
8> window 2016-04-27 19:34:21.000 window 2016-04-27 19:34:24.000
-
timestamp : 1461756864000|2016-04-27 19:34:24.000 currentMaxTimestamp : 1461756864000|2016-04-27 19:34:24.000, watermark : 1461756861000|2016-04-27 19:34:21.000
-
timestamp : 1461756865000|2016-04-27 19:34:25.000 currentMaxTimestamp : 1461756865000|2016-04-27 19:34:25.000, watermark : 1461756862000|2016-04-27 19:34:22.000
-
timestamp : 1461756866000|2016-04-27 19:34:26.000 currentMaxTimestamp : 1461756866000|2016-04-27 19:34:26.000, watermark : 1461756863000|2016-04-27 19:34:23.000
-
timestamp : 1461756862000|2016-04-27 19:34:22.000 currentMaxTimestamp : 1461756866000|2016-04-27 19:34:26.000, watermark : 1461756864000|2016-04-27 19:34:24.000
-
1461756862000
-
8> window 2016-04-27 19:34:21.000 window 2016-04-27 19:34:24.000
可以发现两者的不同:默认的trigger会将之前窗口中的数据一起发出,但是自定义的trigger不会将之前的数据发送,而是单独将落后的数据往后发送了,从而避免了数据重复的问题。原因在于两者返回的TriggerResult不同:默认的EventTimeTrigger返回FIRE,不会清除窗口中的数据;自定义的trigger返回FIRE_AND_PURGE,触发后会清除窗口中的数据。
默认trigger加allowedLateness: 会导致窗口原来的数据也会触发
-
timestamp : 1461756861000|2016-04-27 19:34:21.000 currentMaxTimestamp : 1461756861000|2016-04-27 19:34:21.000, watermark : -2000|1970-01-01 07:59:58.000
-
timestamp : 1461756862000|2016-04-27 19:34:22.000 currentMaxTimestamp : 1461756862000|2016-04-27 19:34:22.000, watermark : 1461756859000|2016-04-27 19:34:19.000
-
timestamp : 1461756863000|2016-04-27 19:34:23.000 currentMaxTimestamp : 1461756863000|2016-04-27 19:34:23.000, watermark : 1461756860000|2016-04-27 19:34:20.000
-
timestamp : 1461756864000|2016-04-27 19:34:24.000 currentMaxTimestamp : 1461756864000|2016-04-27 19:34:24.000, watermark : 1461756861000|2016-04-27 19:34:21.000
-
timestamp : 1461756865000|2016-04-27 19:34:25.000 currentMaxTimestamp : 1461756865000|2016-04-27 19:34:25.000, watermark : 1461756862000|2016-04-27 19:34:22.000
-
timestamp : 1461756862000|2016-04-27 19:34:22.000 currentMaxTimestamp : 1461756865000|2016-04-27 19:34:25.000, watermark : 1461756863000|2016-04-27 19:34:23.000
-
1461756861000
-
1461756862000
-
1461756863000
-
1461756862000
-
8> window 2016-04-27 19:34:21.000 window 2016-04-27 19:34:24.000
-
timestamp : 1461756866000|2016-04-27 19:34:26.000 currentMaxTimestamp : 1461756866000|2016-04-27 19:34:26.000, watermark : 1461756863000|2016-04-27 19:34:23.000
-
timestamp : 1461756867000|2016-04-27 19:34:27.000 currentMaxTimestamp : 1461756867000|2016-04-27 19:34:27.000, watermark : 1461756864000|2016-04-27 19:34:24.000
-
timestamp : 1461756868000|2016-04-27 19:34:28.000 currentMaxTimestamp : 1461756868000|2016-04-27 19:34:28.000, watermark : 1461756865000|2016-04-27 19:34:25.000
-
timestamp : 1461756869000|2016-04-27 19:34:29.000 currentMaxTimestamp : 1461756869000|2016-04-27 19:34:29.000, watermark : 1461756866000|2016-04-27 19:34:26.000
-
清理窗口状态 窗口内保存值为4
默认trigger clear()的调用时间: event time到达29s时触发了clear()方法,默认执行的trigger类为EventTimeTrigger
-
timestamp : 146175682000|1974-08-20 04:21:22.000 currentMaxTimestamp : 146175682000|1974-08-20 04:21:22.000, watermark : -2000|1970-01-01 07:59:58.000
-
timestamp : 146175683000|1974-08-20 04:21:23.000 currentMaxTimestamp : 146175683000|1974-08-20 04:21:23.000, watermark : 146175680000|1974-08-20 04:21:20.000
-
timestamp : 146175684000|1974-08-20 04:21:24.000 currentMaxTimestamp : 146175684000|1974-08-20 04:21:24.000, watermark : 146175681000|1974-08-20 04:21:21.000
-
timestamp : 146175685000|1974-08-20 04:21:25.000 currentMaxTimestamp : 146175685000|1974-08-20 04:21:25.000, watermark : 146175682000|1974-08-20 04:21:22.000
-
timestamp : 146175686000|1974-08-20 04:21:26.000 currentMaxTimestamp : 146175686000|1974-08-20 04:21:26.000, watermark : 146175683000|1974-08-20 04:21:23.000
-
146175682000
-
146175683000
-
8> window 1974-08-20 04:21:21.000 window 1974-08-20 04:21:24.000
-
timestamp : 146175687000|1974-08-20 04:21:27.000 currentMaxTimestamp : 146175687000|1974-08-20 04:21:27.000, watermark : 146175684000|1974-08-20 04:21:24.000
-
timestamp : 146175688000|1974-08-20 04:21:28.000 currentMaxTimestamp : 146175688000|1974-08-20 04:21:28.000, watermark : 146175685000|1974-08-20 04:21:25.000
-
timestamp : 146175689000|1974-08-20 04:21:29.000 currentMaxTimestamp : 146175689000|1974-08-20 04:21:29.000, watermark : 146175686000|1974-08-20 04:21:26.000
自定义trigger clear()的调用时间:
-
timestamp : 1461756862000|2016-04-27 19:34:22.000 currentMaxTimestamp : 1461756862000|2016-04-27 19:34:22.000, watermark : -2000|1970-01-01 07:59:58.000
-
timestamp : 1461756863000|2016-04-27 19:34:23.000 currentMaxTimestamp : 1461756863000|2016-04-27 19:34:23.000, watermark : 1461756860000|2016-04-27 19:34:20.000
-
1461756862000
-
1461756863000
-
timestamp : 1461756864000|2016-04-27 19:34:24.000 currentMaxTimestamp : 1461756864000|2016-04-27 19:34:24.000, watermark : 1461756861000|2016-04-27 19:34:21.000
-
timestamp : 1461756865000|2016-04-27 19:34:25.000 currentMaxTimestamp : 1461756865000|2016-04-27 19:34:25.000, watermark : 1461756862000|2016-04-27 19:34:22.000
-
timestamp : 1461756866000|2016-04-27 19:34:26.000 currentMaxTimestamp : 1461756866000|2016-04-27 19:34:26.000, watermark : 1461756863000|2016-04-27 19:34:23.000
-
timestamp : 1461756867000|2016-04-27 19:34:27.000 currentMaxTimestamp : 1461756867000|2016-04-27 19:34:27.000, watermark : 1461756864000|2016-04-27 19:34:24.000
-
timestamp : 1461756868000|2016-04-27 19:34:28.000 currentMaxTimestamp : 1461756868000|2016-04-27 19:34:28.000, watermark : 1461756865000|2016-04-27 19:34:25.000
-
timestamp : 1461756869000|2016-04-27 19:34:29.000 currentMaxTimestamp : 1461756869000|2016-04-27 19:34:29.000, watermark : 1461756866000|2016-04-27 19:34:26.000
-
清理窗口状态 窗口内保存值为2
通过自定义trigger和默认的trigger的比较,可以发现clear()方法的调用只和时间有关
当watermark越过 window_end_time + allowedLateness 时调用。本例中 watermark = 最大event time - 2s(最大乱序时间),所以最大event time达到 24s + 2s + 3s = 29s 时触发clear()
进入event time默认的trigger看看:
-
@PublicEvolving
-
public class EventTimeTrigger extends Trigger<Object, TimeWindow> {
-
private static final long serialVersionUID = 1L;
-
private EventTimeTrigger() {}
-
@Override
-
public TriggerResult onElement(Object element, long timestamp, TimeWindow window, TriggerContext ctx) throws Exception {
-
if (window.maxTimestamp() <= ctx.getCurrentWatermark()) {
-
// if the watermark is already past the window fire immediately
-
return TriggerResult.FIRE;
-
} else {
-
// 注册一个事件时间的定时器,触发onEventTime
-
ctx.registerEventTimeTimer(window.maxTimestamp());
-
return TriggerResult.CONTINUE;
-
}
-
}
-
@Override
-
public TriggerResult onEventTime(long time, TimeWindow window, TriggerContext ctx) {
-
//可以看到当触发onEventTime方法时只是将数据往下发送了
-
return time == window.maxTimestamp() ?
-
TriggerResult.FIRE :
-
TriggerResult.CONTINUE;
-
}
-
@Override
-
public TriggerResult onProcessingTime(long time, TimeWindow window, TriggerContext ctx) throws Exception {
-
return TriggerResult.CONTINUE;
-
}
-
@Override
-
public void clear(TimeWindow window, TriggerContext ctx) throws Exception {
-
//删除事件时间的定时器
-
ctx.deleteEventTimeTimer(window.maxTimestamp());
-
}
-
@Override
-
public boolean canMerge() {
-
return true;
-
}
-
@Override
-
public void onMerge(TimeWindow window,
-
OnMergeContext ctx) {
-
// only register a timer if the watermark is not yet past the end of the merged window
-
// this is in line with the logic in onElement(). If the watermark is past the end of
-
// the window onElement() will fire and setting a timer here would fire the window twice.
-
long windowMaxTimestamp = window.maxTimestamp();
-
if (windowMaxTimestamp > ctx.getCurrentWatermark()) {
-
ctx.registerEventTimeTimer(windowMaxTimestamp);
-
}
-
}
-
@Override
-
public String toString() {
-
return "EventTimeTrigger()";
-
}
-
/**
-
* Creates an event-time trigger that fires once the watermark passes the end of the window.
-
*
-
* <p>Once the trigger fires all elements are discarded. Elements that arrive late immediately
-
* trigger window evaluation with just this one element.
-
*/
-
public static EventTimeTrigger create() {
-
return new EventTimeTrigger();
-
}
-
}
到此有些疑问:窗口中元素的清除是在什么类中实现的?是何时清除的?(自我理解:按理说应该是在调用clear()方法时清除窗口数据,因为此时窗口结束时间已经比watermark-allowedLateness小了)