自定义trigger的主要目的是为了等待数据到齐:
代码如下; flink版本1.6
-
public class WatermarkTest { -
public static void main(String[] args) throws Exception { -
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment(); -
env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime); -
Properties properties = new Properties(); -
properties.setProperty("bootstrap.servers", GlobalConstants.KAFKA_BROKER); -
properties.setProperty("group.id", "crm_stream_window"); -
properties.setProperty(ConsumerConfig.AUTO_OFFSET_RESET_CONFIG, "latest"); -
DataStream<String> stream = -
env.addSource(new FlinkKafkaConsumer011<>("test", new SimpleStringSchema(), properties)); -
DataStream<Tuple3<String, Long, Integer>> inputMap = stream.map(new MapFunction<String, Tuple3<String, Long, Integer>>() { -
private static final long serialVersionUID = -8812094804806854937L; -
@Override -
public Tuple3<String, Long, Integer> map(String value) throws Exception { -
return new Tuple3<>(value.split("\\W+")[0], Long.valueOf(value.split("\\W+")[1]), Integer.valueOf(value.split("\\W+")[2])); -
} -
}); -
DataStream<Tuple3<String, Long, Integer>> watermark = -
inputMap.assignTimestampsAndWatermarks(new AssignerWithPeriodicWatermarks<Tuple3<String, Long, Integer>>() { -
private static final long serialVersionUID = 8252616297345284790L; -
Long currentMaxTimestamp = 0L; -
Long maxOutOfOrderness = 2000L;//最大允许的乱序时间是10s -
Watermark watermark = null; -
SimpleDateFormat format = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss.SSS"); -
@Nullable -
@Override -
public Watermark getCurrentWatermark() { -
watermark = new Watermark(currentMaxTimestamp - maxOutOfOrderness); -
return watermark; -
} -
@Override -
public long extractTimestamp(Tuple3<String, Long, Integer> element, long previousElementTimestamp) { -
Long timestamp = element.f1; -
currentMaxTimestamp = Math.max(timestamp, currentMaxTimestamp); -
System.out.println("timestamp : " + element.f1 + "|" + format.format(element.f1) + " currentMaxTimestamp : " + currentMaxTimestamp + "|" + format.format(currentMaxTimestamp) + "," + " watermark : " + watermark.getTimestamp() + "|" + format.format(watermark.getTimestamp())); -
return timestamp; -
} -
}); -
OutputTag<Tuple3<String, Long, Integer>> lateOutputTag = new OutputTag<Tuple3<String, Long, Integer>>("late-data") { -
private static final long serialVersionUID = -1552769100986888698L; -
}; -
SingleOutputStreamOperator<String> resultStream = watermark -
.keyBy(0) -
.window(TumblingEventTimeWindows.of(Time.seconds(3))) -
.trigger(new Trigger<Tuple3<String, Long, Integer>, TimeWindow>() { -
private static final long serialVersionUID = 2742133264310093792L; -
ValueStateDescriptor<Integer> sumStateDescriptor = new ValueStateDescriptor<Integer>("sum", Integer.class); -
@Override -
public TriggerResult onElement(Tuple3<String, Long, Integer> element, long timestamp, TimeWindow window, TriggerContext ctx) throws Exception { -
ValueState<Integer> sumState = ctx.getPartitionedState(sumStateDescriptor); -
if (null == sumState.value()) { -
sumState.update(0); -
} -
sumState.update(element.f2 + sumState.value()); -
if (sumState.value() >= 2) { -
//这里可以选择手动处理状态 -
// 默认的trigger发送是TriggerResult.FIRE 不会清除窗口数据 -
return TriggerResult.FIRE_AND_PURGE; -
} -
return TriggerResult.CONTINUE; -
} -
@Override -
public TriggerResult onProcessingTime(long time, TimeWindow window, TriggerContext ctx) throws Exception { -
return TriggerResult.CONTINUE; -
} -
@Override -
public TriggerResult onEventTime(long time, TimeWindow window, TriggerContext ctx) throws Exception { -
return TriggerResult.CONTINUE; -
} -
@Override -
public void clear(TimeWindow window, TriggerContext ctx) throws Exception { -
System.out.println("清理窗口状态 窗口内保存值为" + ctx.getPartitionedState(sumStateDescriptor).value()); -
ctx.getPartitionedState(sumStateDescriptor).clear(); -
} -
}) -
//如果使用allowedLateness会有重复计算的效果 -
//默认的trigger情况下 -
// 在event time>window_end_time+watermark+allowedLateness时会触发窗口的clear -
// 后续数据如果属于该窗口而且数据的event_time>watermark-allowedLateness 会触发重新计算 -
// -
//在使用自定义的trigger情况下 -
//同一个窗口内只要满足要求可以不停的触发窗口数据往下流 -
//在event time>window_end_time+watermark+allowedLateness时会触发窗口clear -
//后续数据如果属于该窗口而且数据的event_time>watermark-allowedLateness 会触发重新计算 -
// -
//窗口状态的clear只和时间有关与是否自定义trigger无关 -
.allowedLateness(Time.seconds(3)) -
.sideOutputLateData(lateOutputTag) -
.apply(new WindowFunction<Tuple3<String, Long, Integer>, String, Tuple, TimeWindow>() { -
private static final long serialVersionUID = 7813420265419629362L; -
@Override -
public void apply(Tuple tuple, TimeWindow window, Iterable<Tuple3<String, Long, Integer>> input, Collector<String> out) throws Exception { -
for (Tuple3<String, Long, Integer> stringLongTuple2 : input) { -
System.out.println(stringLongTuple2.f1); -
} -
SimpleDateFormat format = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss.SSS"); -
out.collect("window " + format.format(window.getStart()) + " window " + format.format(window.getEnd())); -
} -
}); -
resultStream.print(); -
// resultStream.getSideOutput(lateOutputTag).print(); -
env.execute("window test"); -
}
比较了自定义trigger和默认的trigger在event time的前提下,watermark和allowedLateness对trigger的影响。
默认trigger加allowedLateness: 会导致窗口原来的数据也会触发
-
timestamp : 1461756862000|2016-04-27 19:34:22.000 currentMaxTimestamp : 1461756862000|2016-04-27 19:34:22.000, watermark : -2000|1970-01-01 07:59:58.000 -
timestamp : 1461756863000|2016-04-27 19:34:23.000 currentMaxTimestamp : 1461756863000|2016-04-27 19:34:23.000, watermark : 1461756860000|2016-04-27 19:34:20.000 -
timestamp : 1461756864000|2016-04-27 19:34:24.000 currentMaxTimestamp : 1461756864000|2016-04-27 19:34:24.000, watermark : 1461756861000|2016-04-27 19:34:21.000 -
timestamp : 1461756865000|2016-04-27 19:34:25.000 currentMaxTimestamp : 1461756865000|2016-04-27 19:34:25.000, watermark : 1461756862000|2016-04-27 19:34:22.000 -
timestamp : 1461756866000|2016-04-27 19:34:26.000 currentMaxTimestamp : 1461756866000|2016-04-27 19:34:26.000, watermark : 1461756863000|2016-04-27 19:34:23.000 -
1461756862000 -
1461756863000 -
8> window 2016-04-27 19:34:21.000 window 2016-04-27 19:34:24.000 -
timestamp : 1461756862000|2016-04-27 19:34:22.000 currentMaxTimestamp : 1461756866000|2016-04-27 19:34:26.000, watermark : 1461756864000|2016-04-27 19:34:24.000 -
1461756862000 -
1461756863000 -
1461756862000 -
8> window 2016-04-27 19:34:21.000 window 2016-04-27 19:34:24.000
自定义trigger加allowedLateness: 会将落后的数据直接往下发送
-
timestamp : 1461756862000|2016-04-27 19:34:22.000 currentMaxTimestamp : 1461756862000|2016-04-27 19:34:22.000, watermark : -2000|1970-01-01 07:59:58.000 -
timestamp : 1461756863000|2016-04-27 19:34:23.000 currentMaxTimestamp : 1461756863000|2016-04-27 19:34:23.000, watermark : 1461756860000|2016-04-27 19:34:20.000 -
1461756862000 -
1461756863000 -
8> window 2016-04-27 19:34:21.000 window 2016-04-27 19:34:24.000 -
timestamp : 1461756864000|2016-04-27 19:34:24.000 currentMaxTimestamp : 1461756864000|2016-04-27 19:34:24.000, watermark : 1461756861000|2016-04-27 19:34:21.000 -
timestamp : 1461756865000|2016-04-27 19:34:25.000 currentMaxTimestamp : 1461756865000|2016-04-27 19:34:25.000, watermark : 1461756862000|2016-04-27 19:34:22.000 -
timestamp : 1461756866000|2016-04-27 19:34:26.000 currentMaxTimestamp : 1461756866000|2016-04-27 19:34:26.000, watermark : 1461756863000|2016-04-27 19:34:23.000 -
timestamp : 1461756862000|2016-04-27 19:34:22.000 currentMaxTimestamp : 1461756866000|2016-04-27 19:34:26.000, watermark : 1461756864000|2016-04-27 19:34:24.000 -
1461756862000 -
8> window 2016-04-27 19:34:21.000 window 2016-04-27 19:34:24.000
可以发现两者的不同:默认的trigger会将窗口中之前的数据连同迟到的数据一起再次发出,而自定义的trigger不会重发之前的数据,只会把迟到的数据单独往下游发送,从而避免了数据重复的问题。原因在于两者触发方式不同:默认trigger返回TriggerResult.FIRE(触发后不清除窗口数据),自定义trigger返回TriggerResult.FIRE_AND_PURGE(触发后清除窗口数据)。
默认trigger加allowedLateness: 会导致窗口原来的数据也会触发
-
timestamp : 1461756861000|2016-04-27 19:34:21.000 currentMaxTimestamp : 1461756861000|2016-04-27 19:34:21.000, watermark : -2000|1970-01-01 07:59:58.000 -
timestamp : 1461756862000|2016-04-27 19:34:22.000 currentMaxTimestamp : 1461756862000|2016-04-27 19:34:22.000, watermark : 1461756859000|2016-04-27 19:34:19.000 -
timestamp : 1461756863000|2016-04-27 19:34:23.000 currentMaxTimestamp : 1461756863000|2016-04-27 19:34:23.000, watermark : 1461756860000|2016-04-27 19:34:20.000 -
timestamp : 1461756864000|2016-04-27 19:34:24.000 currentMaxTimestamp : 1461756864000|2016-04-27 19:34:24.000, watermark : 1461756861000|2016-04-27 19:34:21.000 -
timestamp : 1461756865000|2016-04-27 19:34:25.000 currentMaxTimestamp : 1461756865000|2016-04-27 19:34:25.000, watermark : 1461756862000|2016-04-27 19:34:22.000 -
timestamp : 1461756862000|2016-04-27 19:34:22.000 currentMaxTimestamp : 1461756865000|2016-04-27 19:34:25.000, watermark : 1461756863000|2016-04-27 19:34:23.000 -
1461756861000 -
1461756862000 -
1461756863000 -
1461756862000 -
8> window 2016-04-27 19:34:21.000 window 2016-04-27 19:34:24.000 -
timestamp : 1461756866000|2016-04-27 19:34:26.000 currentMaxTimestamp : 1461756866000|2016-04-27 19:34:26.000, watermark : 1461756863000|2016-04-27 19:34:23.000 -
timestamp : 1461756867000|2016-04-27 19:34:27.000 currentMaxTimestamp : 1461756867000|2016-04-27 19:34:27.000, watermark : 1461756864000|2016-04-27 19:34:24.000 -
timestamp : 1461756868000|2016-04-27 19:34:28.000 currentMaxTimestamp : 1461756868000|2016-04-27 19:34:28.000, watermark : 1461756865000|2016-04-27 19:34:25.000 -
timestamp : 1461756869000|2016-04-27 19:34:29.000 currentMaxTimestamp : 1461756869000|2016-04-27 19:34:29.000, watermark : 1461756866000|2016-04-27 19:34:26.000 -
清理窗口状态 窗口内保存值为4
默认trigger clear()的调用时间: 在29s的时候触发的默认的clear方法,默认执行的类名(EventTimeTrigger)
-
timestamp : 146175682000|1974-08-20 04:21:22.000 currentMaxTimestamp : 146175682000|1974-08-20 04:21:22.000, watermark : -2000|1970-01-01 07:59:58.000 -
timestamp : 146175683000|1974-08-20 04:21:23.000 currentMaxTimestamp : 146175683000|1974-08-20 04:21:23.000, watermark : 146175680000|1974-08-20 04:21:20.000 -
timestamp : 146175684000|1974-08-20 04:21:24.000 currentMaxTimestamp : 146175684000|1974-08-20 04:21:24.000, watermark : 146175681000|1974-08-20 04:21:21.000 -
timestamp : 146175685000|1974-08-20 04:21:25.000 currentMaxTimestamp : 146175685000|1974-08-20 04:21:25.000, watermark : 146175682000|1974-08-20 04:21:22.000 -
timestamp : 146175686000|1974-08-20 04:21:26.000 currentMaxTimestamp : 146175686000|1974-08-20 04:21:26.000, watermark : 146175683000|1974-08-20 04:21:23.000 -
146175682000 -
146175683000 -
8> window 1974-08-20 04:21:21.000 window 1974-08-20 04:21:24.000 -
timestamp : 146175687000|1974-08-20 04:21:27.000 currentMaxTimestamp : 146175687000|1974-08-20 04:21:27.000, watermark : 146175684000|1974-08-20 04:21:24.000 -
timestamp : 146175688000|1974-08-20 04:21:28.000 currentMaxTimestamp : 146175688000|1974-08-20 04:21:28.000, watermark : 146175685000|1974-08-20 04:21:25.000 -
timestamp : 146175689000|1974-08-20 04:21:29.000 currentMaxTimestamp : 146175689000|1974-08-20 04:21:29.000, watermark : 146175686000|1974-08-20 04:21:26.000
自定义trigger clear()的调用时间: 同样是在29s的时候触发了clear方法(下面日志中的“清理窗口状态”由自定义trigger的clear()打印)
-
timestamp : 1461756862000|2016-04-27 19:34:22.000 currentMaxTimestamp : 1461756862000|2016-04-27 19:34:22.000, watermark : -2000|1970-01-01 07:59:58.000 -
timestamp : 1461756863000|2016-04-27 19:34:23.000 currentMaxTimestamp : 1461756863000|2016-04-27 19:34:23.000, watermark : 1461756860000|2016-04-27 19:34:20.000 -
1461756862000 -
1461756863000 -
timestamp : 1461756864000|2016-04-27 19:34:24.000 currentMaxTimestamp : 1461756864000|2016-04-27 19:34:24.000, watermark : 1461756861000|2016-04-27 19:34:21.000 -
timestamp : 1461756865000|2016-04-27 19:34:25.000 currentMaxTimestamp : 1461756865000|2016-04-27 19:34:25.000, watermark : 1461756862000|2016-04-27 19:34:22.000 -
timestamp : 1461756866000|2016-04-27 19:34:26.000 currentMaxTimestamp : 1461756866000|2016-04-27 19:34:26.000, watermark : 1461756863000|2016-04-27 19:34:23.000 -
timestamp : 1461756867000|2016-04-27 19:34:27.000 currentMaxTimestamp : 1461756867000|2016-04-27 19:34:27.000, watermark : 1461756864000|2016-04-27 19:34:24.000 -
timestamp : 1461756868000|2016-04-27 19:34:28.000 currentMaxTimestamp : 1461756868000|2016-04-27 19:34:28.000, watermark : 1461756865000|2016-04-27 19:34:25.000 -
timestamp : 1461756869000|2016-04-27 19:34:29.000 currentMaxTimestamp : 1461756869000|2016-04-27 19:34:29.000, watermark : 1461756866000|2016-04-27 19:34:26.000 -
清理窗口状态 窗口内保存值为2
通过自定义trigger和默认的trigger的比较,可以发现clear()方法的调用只和时间有关
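当watermark越过 window_end_time + allowedLateness 时调用。本例中 watermark = 当前最大event time - maxOutOfOrderness(2s),窗口结束时间为24s,allowedLateness为3s,所以当event time到达 24s + 3s + 2s = 29s 时触发clear()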
进入event time默认的trigger看看:
-
@PublicEvolving -
public class EventTimeTrigger extends Trigger<Object, TimeWindow> { -
private static final long serialVersionUID = 1L; -
private EventTimeTrigger() {} -
@Override -
public TriggerResult onElement(Object element, long timestamp, TimeWindow window, TriggerContext ctx) throws Exception { -
if (window.maxTimestamp() <= ctx.getCurrentWatermark()) { -
// if the watermark is already past the window fire immediately -
return TriggerResult.FIRE; -
} else { -
// 注册一个事件时间的定时器,触发onEventTime -
ctx.registerEventTimeTimer(window.maxTimestamp()); -
return TriggerResult.CONTINUE; -
} -
} -
@Override -
public TriggerResult onEventTime(long time, TimeWindow window, TriggerContext ctx) { -
//可以看到当触发onEventTime方法时只是将数据往下发送了 -
return time == window.maxTimestamp() ? -
TriggerResult.FIRE : -
TriggerResult.CONTINUE; -
} -
@Override -
public TriggerResult onProcessingTime(long time, TimeWindow window, TriggerContext ctx) throws Exception { -
return TriggerResult.CONTINUE; -
} -
@Override -
public void clear(TimeWindow window, TriggerContext ctx) throws Exception { -
//删除事件时间的定时器 -
ctx.deleteEventTimeTimer(window.maxTimestamp()); -
} -
@Override -
public boolean canMerge() { -
return true; -
} -
@Override -
public void onMerge(TimeWindow window, -
OnMergeContext ctx) { -
// only register a timer if the watermark is not yet past the end of the merged window -
// this is in line with the logic in onElement(). If the watermark is past the end of -
// the window onElement() will fire and setting a timer here would fire the window twice. -
long windowMaxTimestamp = window.maxTimestamp(); -
if (windowMaxTimestamp > ctx.getCurrentWatermark()) { -
ctx.registerEventTimeTimer(windowMaxTimestamp); -
} -
} -
@Override -
public String toString() { -
return "EventTimeTrigger()"; -
} -
/** -
* Creates an event-time trigger that fires once the watermark passes the end of the window. -
* -
* <p>Once the trigger fires all elements are discarded. Elements that arrive late immediately -
* trigger window evaluation with just this one element. -
*/ -
public static EventTimeTrigger create() { -
return new EventTimeTrigger(); -
} -
}
到此有些疑问窗口中元素的清除是在什么类中实现的?何时清除的?(自我理解:按理说是应该在调用clear()方法时清除窗口数据,因为此时窗口结束时间已经比watermark-allowedLateness小了)

1115

被折叠的 条评论
为什么被折叠?



