Flink 自定义trigger

93 篇文章 9 订阅

 

自定义trigger的主要目的是为了等待数据到齐:

代码如下(Flink 版本 1.6):

 
  1. public class WatermarkTest {

  2. public static void main(String[] args) throws Exception {

  3. StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();

  4. env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime);

  5. Properties properties = new Properties();

  6. properties.setProperty("bootstrap.servers", GlobalConstants.KAFKA_BROKER);

  7. properties.setProperty("group.id", "crm_stream_window");

  8. properties.setProperty(ConsumerConfig.AUTO_OFFSET_RESET_CONFIG, "latest");

  9. DataStream<String> stream =

  10. env.addSource(new FlinkKafkaConsumer011<>("test", new SimpleStringSchema(), properties));

  11. DataStream<Tuple3<String, Long, Integer>> inputMap = stream.map(new MapFunction<String, Tuple3<String, Long, Integer>>() {

  12. private static final long serialVersionUID = -8812094804806854937L;

  13.  
  14. @Override

  15. public Tuple3<String, Long, Integer> map(String value) throws Exception {

  16. return new Tuple3<>(value.split("\\W+")[0], Long.valueOf(value.split("\\W+")[1]), Integer.valueOf(value.split("\\W+")[2]));

  17. }

  18. });

  19. DataStream<Tuple3<String, Long, Integer>> watermark =

  20. inputMap.assignTimestampsAndWatermarks(new AssignerWithPeriodicWatermarks<Tuple3<String, Long, Integer>>() {

  21.  
  22. private static final long serialVersionUID = 8252616297345284790L;

  23. Long currentMaxTimestamp = 0L;

  24. Long maxOutOfOrderness = 2000L;//最大允许的乱序时间是10s

  25. Watermark watermark = null;

  26. SimpleDateFormat format = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss.SSS");

  27.  
  28. @Nullable

  29. @Override

  30. public Watermark getCurrentWatermark() {

  31. watermark = new Watermark(currentMaxTimestamp - maxOutOfOrderness);

  32. return watermark;

  33. }

  34.  
  35. @Override

  36. public long extractTimestamp(Tuple3<String, Long, Integer> element, long previousElementTimestamp) {

  37. Long timestamp = element.f1;

  38. currentMaxTimestamp = Math.max(timestamp, currentMaxTimestamp);

  39. System.out.println("timestamp : " + element.f1 + "|" + format.format(element.f1) + " currentMaxTimestamp : " + currentMaxTimestamp + "|" + format.format(currentMaxTimestamp) + "," + " watermark : " + watermark.getTimestamp() + "|" + format.format(watermark.getTimestamp()));

  40. return timestamp;

  41. }

  42. });

  43.  
  44. OutputTag<Tuple3<String, Long, Integer>> lateOutputTag = new OutputTag<Tuple3<String, Long, Integer>>("late-data") {

  45. private static final long serialVersionUID = -1552769100986888698L;

  46. };

  47.  
  48. SingleOutputStreamOperator<String> resultStream = watermark

  49. .keyBy(0)

  50. .window(TumblingEventTimeWindows.of(Time.seconds(3)))

  51. .trigger(new Trigger<Tuple3<String, Long, Integer>, TimeWindow>() {

  52. private static final long serialVersionUID = 2742133264310093792L;

  53. ValueStateDescriptor<Integer> sumStateDescriptor = new ValueStateDescriptor<Integer>("sum", Integer.class);

  54.  
  55. @Override

  56. public TriggerResult onElement(Tuple3<String, Long, Integer> element, long timestamp, TimeWindow window, TriggerContext ctx) throws Exception {

  57. ValueState<Integer> sumState = ctx.getPartitionedState(sumStateDescriptor);

  58. if (null == sumState.value()) {

  59. sumState.update(0);

  60. }

  61. sumState.update(element.f2 + sumState.value());

  62. if (sumState.value() >= 2) {

  63. //这里可以选择手动处理状态

  64. // 默认的trigger发送是TriggerResult.FIRE 不会清除窗口数据

  65. return TriggerResult.FIRE_AND_PURGE;

  66. }

  67. return TriggerResult.CONTINUE;

  68. }

  69.  
  70. @Override

  71. public TriggerResult onProcessingTime(long time, TimeWindow window, TriggerContext ctx) throws Exception {

  72. return TriggerResult.CONTINUE;

  73. }

  74.  
  75. @Override

  76. public TriggerResult onEventTime(long time, TimeWindow window, TriggerContext ctx) throws Exception {

  77. return TriggerResult.CONTINUE;

  78. }

  79.  
  80. @Override

  81. public void clear(TimeWindow window, TriggerContext ctx) throws Exception {

  82. System.out.println("清理窗口状态 窗口内保存值为" + ctx.getPartitionedState(sumStateDescriptor).value());

  83. ctx.getPartitionedState(sumStateDescriptor).clear();

  84. }

  85. })

  86. //如果使用allowedLateness会有重复计算的效果

  87. //默认的trigger情况下

  88. // 在event time>window_end_time+watermark+allowedLateness时会触发窗口的clear

  89. // 后续数据如果属于该窗口而且数据的event_time>watermark-allowedLateness 会触发重新计算

  90. //

  91. //在使用自定义的trigger情况下

  92. //同一个窗口内只要满足要求可以不停的触发窗口数据往下流

  93. //在event time>window_end_time+watermark+allowedLateness时会触发窗口clear

  94. //后续数据如果属于该窗口而且数据的event_time>watermark-allowedLateness 会触发重新计算

  95. //

  96. //窗口状态的clear只和时间有关与是否自定义trigger无关

  97. .allowedLateness(Time.seconds(3))

  98. .sideOutputLateData(lateOutputTag)

  99. .apply(new WindowFunction<Tuple3<String, Long, Integer>, String, Tuple, TimeWindow>() {

  100. private static final long serialVersionUID = 7813420265419629362L;

  101.  
  102. @Override

  103. public void apply(Tuple tuple, TimeWindow window, Iterable<Tuple3<String, Long, Integer>> input, Collector<String> out) throws Exception {

  104. for (Tuple3<String, Long, Integer> stringLongTuple2 : input) {

  105. System.out.println(stringLongTuple2.f1);

  106. }

  107. SimpleDateFormat format = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss.SSS");

  108. out.collect("window " + format.format(window.getStart()) + " window " + format.format(window.getEnd()));

  109. }

  110. });

  111.  
  112. resultStream.print();

  113. // resultStream.getSideOutput(lateOutputTag).print();

  114. env.execute("window test");

  115. }

比较了自定义trigger和默认的trigger在event time的前提下,watermark和allowedLateness对trigger的影响。

默认trigger加allowedLateness: 会导致窗口原来的数据也会触发

 
  1. timestamp : 1461756862000|2016-04-27 19:34:22.000 currentMaxTimestamp : 1461756862000|2016-04-27 19:34:22.000, watermark : -2000|1970-01-01 07:59:58.000

  2. timestamp : 1461756863000|2016-04-27 19:34:23.000 currentMaxTimestamp : 1461756863000|2016-04-27 19:34:23.000, watermark : 1461756860000|2016-04-27 19:34:20.000

  3. timestamp : 1461756864000|2016-04-27 19:34:24.000 currentMaxTimestamp : 1461756864000|2016-04-27 19:34:24.000, watermark : 1461756861000|2016-04-27 19:34:21.000

  4. timestamp : 1461756865000|2016-04-27 19:34:25.000 currentMaxTimestamp : 1461756865000|2016-04-27 19:34:25.000, watermark : 1461756862000|2016-04-27 19:34:22.000

  5. timestamp : 1461756866000|2016-04-27 19:34:26.000 currentMaxTimestamp : 1461756866000|2016-04-27 19:34:26.000, watermark : 1461756863000|2016-04-27 19:34:23.000

  6. 1461756862000

  7. 1461756863000

  8. 8> window  2016-04-27 19:34:21.000   window  2016-04-27 19:34:24.000

  9. timestamp : 1461756862000|2016-04-27 19:34:22.000 currentMaxTimestamp : 1461756866000|2016-04-27 19:34:26.000, watermark : 1461756864000|2016-04-27 19:34:24.000

  10. 1461756862000

  11. 1461756863000

  12. 1461756862000

  13. 8> window  2016-04-27 19:34:21.000   window  2016-04-27 19:34:24.000

自定义trigger加allowedLateness: 会将落后的数据直接往下发送

 
  1. timestamp : 1461756862000|2016-04-27 19:34:22.000 currentMaxTimestamp : 1461756862000|2016-04-27 19:34:22.000, watermark : -2000|1970-01-01 07:59:58.000

  2. timestamp : 1461756863000|2016-04-27 19:34:23.000 currentMaxTimestamp : 1461756863000|2016-04-27 19:34:23.000, watermark : 1461756860000|2016-04-27 19:34:20.000

  3. 1461756862000

  4. 1461756863000

  5. 8> window 2016-04-27 19:34:21.000 window 2016-04-27 19:34:24.000

  6. timestamp : 1461756864000|2016-04-27 19:34:24.000 currentMaxTimestamp : 1461756864000|2016-04-27 19:34:24.000, watermark : 1461756861000|2016-04-27 19:34:21.000

  7. timestamp : 1461756865000|2016-04-27 19:34:25.000 currentMaxTimestamp : 1461756865000|2016-04-27 19:34:25.000, watermark : 1461756862000|2016-04-27 19:34:22.000

  8. timestamp : 1461756866000|2016-04-27 19:34:26.000 currentMaxTimestamp : 1461756866000|2016-04-27 19:34:26.000, watermark : 1461756863000|2016-04-27 19:34:23.000

  9. timestamp : 1461756862000|2016-04-27 19:34:22.000 currentMaxTimestamp : 1461756866000|2016-04-27 19:34:26.000, watermark : 1461756864000|2016-04-27 19:34:24.000

  10. 1461756862000

  11. 8> window 2016-04-27 19:34:21.000 window 2016-04-27 19:34:24.000

可以发现两者的不同,默认的trigger会将之前窗口中的数据一起发出,但是自定义的trigger不会将之前的数据发送,而是单独将落后的数据往后发送了。避免数据的重复的问题(trigger发送数据的方式不同)。

默认trigger加allowedLateness: 会导致窗口原来的数据也会触发

 
  1. timestamp : 1461756861000|2016-04-27 19:34:21.000 currentMaxTimestamp : 1461756861000|2016-04-27 19:34:21.000, watermark : -2000|1970-01-01 07:59:58.000

  2. timestamp : 1461756862000|2016-04-27 19:34:22.000 currentMaxTimestamp : 1461756862000|2016-04-27 19:34:22.000, watermark : 1461756859000|2016-04-27 19:34:19.000

  3. timestamp : 1461756863000|2016-04-27 19:34:23.000 currentMaxTimestamp : 1461756863000|2016-04-27 19:34:23.000, watermark : 1461756860000|2016-04-27 19:34:20.000

  4. timestamp : 1461756864000|2016-04-27 19:34:24.000 currentMaxTimestamp : 1461756864000|2016-04-27 19:34:24.000, watermark : 1461756861000|2016-04-27 19:34:21.000

  5. timestamp : 1461756865000|2016-04-27 19:34:25.000 currentMaxTimestamp : 1461756865000|2016-04-27 19:34:25.000, watermark : 1461756862000|2016-04-27 19:34:22.000

  6. timestamp : 1461756862000|2016-04-27 19:34:22.000 currentMaxTimestamp : 1461756865000|2016-04-27 19:34:25.000, watermark : 1461756863000|2016-04-27 19:34:23.000

  7. 1461756861000

  8. 1461756862000

  9. 1461756863000

  10. 1461756862000

  11. 8> window 2016-04-27 19:34:21.000 window 2016-04-27 19:34:24.000

  12. timestamp : 1461756866000|2016-04-27 19:34:26.000 currentMaxTimestamp : 1461756866000|2016-04-27 19:34:26.000, watermark : 1461756863000|2016-04-27 19:34:23.000

  13. timestamp : 1461756867000|2016-04-27 19:34:27.000 currentMaxTimestamp : 1461756867000|2016-04-27 19:34:27.000, watermark : 1461756864000|2016-04-27 19:34:24.000

  14. timestamp : 1461756868000|2016-04-27 19:34:28.000 currentMaxTimestamp : 1461756868000|2016-04-27 19:34:28.000, watermark : 1461756865000|2016-04-27 19:34:25.000

  15. timestamp : 1461756869000|2016-04-27 19:34:29.000 currentMaxTimestamp : 1461756869000|2016-04-27 19:34:29.000, watermark : 1461756866000|2016-04-27 19:34:26.000

  16. 清理窗口状态 窗口内保存值为4

默认trigger clear()的调用时间: 在29s的时候触发的默认的clear方法,默认执行的类名(EventTimeTrigger)

 
  1. timestamp : 146175682000|1974-08-20 04:21:22.000 currentMaxTimestamp : 146175682000|1974-08-20 04:21:22.000, watermark : -2000|1970-01-01 07:59:58.000

  2. timestamp : 146175683000|1974-08-20 04:21:23.000 currentMaxTimestamp : 146175683000|1974-08-20 04:21:23.000, watermark : 146175680000|1974-08-20 04:21:20.000

  3. timestamp : 146175684000|1974-08-20 04:21:24.000 currentMaxTimestamp : 146175684000|1974-08-20 04:21:24.000, watermark : 146175681000|1974-08-20 04:21:21.000

  4. timestamp : 146175685000|1974-08-20 04:21:25.000 currentMaxTimestamp : 146175685000|1974-08-20 04:21:25.000, watermark : 146175682000|1974-08-20 04:21:22.000

  5. timestamp : 146175686000|1974-08-20 04:21:26.000 currentMaxTimestamp : 146175686000|1974-08-20 04:21:26.000, watermark : 146175683000|1974-08-20 04:21:23.000

  6. 146175682000

  7. 146175683000

  8. 8> window 1974-08-20 04:21:21.000 window 1974-08-20 04:21:24.000

  9. timestamp : 146175687000|1974-08-20 04:21:27.000 currentMaxTimestamp : 146175687000|1974-08-20 04:21:27.000, watermark : 146175684000|1974-08-20 04:21:24.000

  10. timestamp : 146175688000|1974-08-20 04:21:28.000 currentMaxTimestamp : 146175688000|1974-08-20 04:21:28.000, watermark : 146175685000|1974-08-20 04:21:25.000

  11. timestamp : 146175689000|1974-08-20 04:21:29.000 currentMaxTimestamp : 146175689000|1974-08-20 04:21:29.000, watermark : 146175686000|1974-08-20 04:21:26.000

自定义trigger clear()的调用时间:

 
  1. timestamp : 1461756862000|2016-04-27 19:34:22.000 currentMaxTimestamp : 1461756862000|2016-04-27 19:34:22.000, watermark : -2000|1970-01-01 07:59:58.000

  2. timestamp : 1461756863000|2016-04-27 19:34:23.000 currentMaxTimestamp : 1461756863000|2016-04-27 19:34:23.000, watermark : 1461756860000|2016-04-27 19:34:20.000

  3. 1461756862000

  4. 1461756863000

  5. timestamp : 1461756864000|2016-04-27 19:34:24.000 currentMaxTimestamp : 1461756864000|2016-04-27 19:34:24.000, watermark : 1461756861000|2016-04-27 19:34:21.000

  6. timestamp : 1461756865000|2016-04-27 19:34:25.000 currentMaxTimestamp : 1461756865000|2016-04-27 19:34:25.000, watermark : 1461756862000|2016-04-27 19:34:22.000

  7. timestamp : 1461756866000|2016-04-27 19:34:26.000 currentMaxTimestamp : 1461756866000|2016-04-27 19:34:26.000, watermark : 1461756863000|2016-04-27 19:34:23.000

  8. timestamp : 1461756867000|2016-04-27 19:34:27.000 currentMaxTimestamp : 1461756867000|2016-04-27 19:34:27.000, watermark : 1461756864000|2016-04-27 19:34:24.000

  9. timestamp : 1461756868000|2016-04-27 19:34:28.000 currentMaxTimestamp : 1461756868000|2016-04-27 19:34:28.000, watermark : 1461756865000|2016-04-27 19:34:25.000

  10. timestamp : 1461756869000|2016-04-27 19:34:29.000 currentMaxTimestamp : 1461756869000|2016-04-27 19:34:29.000, watermark : 1461756866000|2016-04-27 19:34:26.000

  11. 清理窗口状态 窗口内保存值为2

通过自定义trigger和默认的trigger的比较,可以发现clear()方法的调用只和时间有关

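当 watermark 越过 window_end_time + allowedLateness 时调用;本例中 watermark = 最大 event time − maxOutOfOrderness(2s),也就是最大 event time 达到 window_end_time + maxOutOfOrderness + allowedLateness(24s + 2s + 3s = 29s)时调用。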

进入event time默认的trigger看看:

 
  1. @PublicEvolving

  2. public class EventTimeTrigger extends Trigger<Object, TimeWindow> {

  3. private static final long serialVersionUID = 1L;

  4.  
  5. private EventTimeTrigger() {}

  6.  
  7. @Override

  8. public TriggerResult onElement(Object element, long timestamp, TimeWindow window, TriggerContext ctx) throws Exception {

  9. if (window.maxTimestamp() <= ctx.getCurrentWatermark()) {

  10. // if the watermark is already past the window fire immediately

  11. return TriggerResult.FIRE;

  12. } else {

  13. // 注册一个事件时间的定时器,触发onEventTime

  14. ctx.registerEventTimeTimer(window.maxTimestamp());

  15. return TriggerResult.CONTINUE;

  16. }

  17. }

  18.  
  19. @Override

  20. public TriggerResult onEventTime(long time, TimeWindow window, TriggerContext ctx) {

  21. //可以看到当触发onEventTime方法时只是将数据往下发送了

  22. return time == window.maxTimestamp() ?

  23. TriggerResult.FIRE :

  24. TriggerResult.CONTINUE;

  25. }

  26.  
  27. @Override

  28. public TriggerResult onProcessingTime(long time, TimeWindow window, TriggerContext ctx) throws Exception {

  29. return TriggerResult.CONTINUE;

  30. }

  31.  
  32. @Override

  33. public void clear(TimeWindow window, TriggerContext ctx) throws Exception {

  34. //删除事件时间的定时器

  35. ctx.deleteEventTimeTimer(window.maxTimestamp());

  36. }

  37.  
  38. @Override

  39. public boolean canMerge() {

  40. return true;

  41. }

  42.  
  43. @Override

  44. public void onMerge(TimeWindow window,

  45. OnMergeContext ctx) {

  46. // only register a timer if the watermark is not yet past the end of the merged window

  47. // this is in line with the logic in onElement(). If the watermark is past the end of

  48. // the window onElement() will fire and setting a timer here would fire the window twice.

  49. long windowMaxTimestamp = window.maxTimestamp();

  50. if (windowMaxTimestamp > ctx.getCurrentWatermark()) {

  51. ctx.registerEventTimeTimer(windowMaxTimestamp);

  52. }

  53. }

  54.  
  55. @Override

  56. public String toString() {

  57. return "EventTimeTrigger()";

  58. }

  59.  
  60. /**

  61. * Creates an event-time trigger that fires once the watermark passes the end of the window.

  62. *

  63. * <p>Once the trigger fires all elements are discarded. Elements that arrive late immediately

  64. * trigger window evaluation with just this one element.

  65. */

  66. public static EventTimeTrigger create() {

  67. return new EventTimeTrigger();

  68. }

  69. }

到此有些疑问窗口中元素的清除是在什么类中实现的?何时清除的?(自我理解:按理说是应该在调用clear()方法时清除窗口数据,因为此时窗口结束时间已经比watermark-allowedLateness小了)

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值