上篇:基于flink的Keyed的CountWindow聚合操作
这四种窗口分别是:
- NoKeyedProcessingTime滚动窗口
- KeyedProcessTime滚动窗口
- NoKeyedEventTime滚动窗口
- KeyedEventTime滚动窗口
1、NoKeyedProcessingTime滚动窗口
不分组,按照ProcessingTime划分为滚动窗口,然后调用reduce对窗口内的数据进行聚合
采用老api:setStreamTimeCharacteristic实现
直接代码:
package cn._51doit.flink.day04;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.streaming.api.TimeCharacteristic;
import org.apache.flink.streaming.api.datastream.*;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.windowing.time.Time;
import org.apache.flink.streaming.api.windowing.windows.TimeWindow;
/**
 * Non-keyed tumbling window driven by processing time (legacy API).
 *
 * <p>Reads one integer per line from a socket, places all records into 5-second
 * windows without calling keyBy, and prints the sum of every window. The time
 * characteristic is selected through the legacy {@code setStreamTimeCharacteristic}
 * call.
 */
public class ProcessingTimeTumblingWindowAllDemo {

    /**
     * Builds and runs the job.
     *
     * @param args unused
     * @throws Exception if the job fails
     */
    public static void main(String[] args) throws Exception {
        // Local environment that also serves the web UI.
        Configuration conf = new Configuration();
        StreamExecutionEnvironment env = StreamExecutionEnvironment.createLocalEnvironmentWithWebUI(conf);

        // Legacy API: use processing time as the time characteristic.
        env.setStreamTimeCharacteristic(TimeCharacteristic.ProcessingTime);

        DataStreamSource<String> socketLines = env.socketTextStream("Master", 8888);

        // Convert each line to an Integer through a method reference.
        // NOTE(review): the original comment stated that the map output runs with
        // parallelism 4 on the author's machine; nothing here sets it - confirm.
        SingleOutputStreamOperator<Integer> parsed = socketLines.map(Integer::parseInt);

        // No keyBy: every record goes into the same 5-second window.
        AllWindowedStream<Integer, TimeWindow> fiveSecondWindows = parsed.timeWindowAll(Time.seconds(5));

        // Sum the integers of each window and print the result.
        fiveSecondWindows.sum(0).print();

        env.execute();
    }
}
控制台打印输出:
由此可见,当时间触发后,才会生成新的窗口,这里的需求是当5秒钟过去后会生成一个新的窗口进行聚合
查看job:http://localhost:8081/#/job/72ca56c8c64cd14ee98b8615f1cf6357/overview
采用新的api:TumblingAlignedProcessingTimeWindows
package cn._51doit.flink.day04;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.streaming.api.datastream.*;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.windowing.assigners.TumblingAlignedProcessingTimeWindows;
import org.apache.flink.streaming.api.windowing.time.Time;
import org.apache.flink.streaming.api.windowing.windows.TimeWindow;
/**
 * Non-keyed tumbling window driven by processing time (window-assigner API).
 *
 * <p>Same job as the legacy variant, but instead of calling
 * {@code setStreamTimeCharacteristic} and {@code timeWindowAll}, the window is
 * declared by passing an explicit assigner to {@code windowAll}.
 *
 * <p>NOTE(review): the keyed examples in this article use
 * {@code TumblingProcessingTimeWindows}; confirm that the aligned assigner used
 * here is intended and supported by the Flink version in use.
 */
public class ProcessingTimeTumblingWindowAllDemo_02 {

    /**
     * Builds and runs the job.
     *
     * @param args unused
     * @throws Exception if the job fails
     */
    public static void main(String[] args) throws Exception {
        // Local environment that also serves the web UI.
        Configuration conf = new Configuration();
        StreamExecutionEnvironment env = StreamExecutionEnvironment.createLocalEnvironmentWithWebUI(conf);

        DataStreamSource<String> socketLines = env.socketTextStream("Master", 8888);

        // Convert each line to an Integer through a method reference.
        // NOTE(review): the original comment stated that the map output runs with
        // parallelism 4 on the author's machine; nothing here sets it - confirm.
        SingleOutputStreamOperator<Integer> parsed = socketLines.map(Integer::parseInt);

        // No keyBy: every record goes into the same 5-second window.
        AllWindowedStream<Integer, TimeWindow> fiveSecondWindows =
                parsed.windowAll(TumblingAlignedProcessingTimeWindows.of(Time.seconds(5)));

        // Sum the integers of each window and print the result.
        fiveSecondWindows.sum(0).print();

        env.execute();
    }
}
2、KeyedProcessTime滚动窗口
先keyBy,再划分ProcessingTime的滚动窗口
需求1:窗口有多个分区,每个分区里有多个组,组里面有数据就输出【在10秒的时间窗口内输出】
第一种方式:调用sum方式实现增量聚合
直接代码
package cn._51doit.flink.day04;
import org.apache.flink.api.common.functions.MapFunction;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.datastream.KeyedStream;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.datastream.WindowedStream;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.windowing.assigners.TumblingProcessingTimeWindows;
import org.apache.flink.streaming.api.windowing.time.Time;
import org.apache.flink.streaming.api.windowing.windows.TimeWindow;
/**
 * Keyed tumbling window driven by processing time (unbounded stream).
 *
 * <p>Reads "word,count" lines from a socket, keys the stream by word, assigns
 * 10-second tumbling processing-time windows and prints the per-key sum of the
 * counts for every window.
 */
public class ProcessingTimeTumblingWindowDemo {

    /** Splits a "word,count" line on commas and returns (word, count). */
    private static class WordCountParser implements MapFunction<String, Tuple2<String, Integer>> {
        @Override
        public Tuple2<String, Integer> map(String value) throws Exception {
            String[] parts = value.split(",");
            String word = parts[0];
            int count = Integer.parseInt(parts[1]);
            return Tuple2.of(word, count);
        }
    }

    /**
     * Builds and runs the job.
     *
     * @param args unused
     * @throws Exception if the job fails
     */
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        StreamExecutionEnvironment env = StreamExecutionEnvironment.createLocalEnvironmentWithWebUI(conf);

        DataStreamSource<String> socketLines = env.socketTextStream("Master", 8888);
        SingleOutputStreamOperator<Tuple2<String, Integer>> wordCounts = socketLines.map(new WordCountParser());

        // Key the stream by the word (field f0).
        KeyedStream<Tuple2<String, Integer>, String> byWord = wordCounts.keyBy(t -> t.f0);

        // Non-keyed window: no keyBy, call windowAll with a window assigner.
        // Keyed window: call keyBy first, then window with a window assigner.
        WindowedStream<Tuple2<String, Integer>, String, TimeWindow> tenSecondWindows =
                byWord.window(TumblingProcessingTimeWindows.of(Time.seconds(10)));

        // Sum field 1 (the count) per key and window, then print.
        tenSecondWindows.sum(1).print();

        env.execute();
    }
}
控制台打印输出:
查看job:http://localhost:8081/#/job/5c6b8e5441faa8bc58409f05d69b9e03/overview
第 2 种方式:调用reduce方式实现增量聚合
直接代码
package cn._51doit.flink.day04;
import org.apache.flink.api.common.functions.MapFunction;
import org.apache.flink.api.common.functions.ReduceFunction;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.datastream.KeyedStream;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.datastream.WindowedStream;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.windowing.assigners.TumblingProcessingTimeWindows;
import org.apache.flink.streaming.api.windowing.time.Time;
import org.apache.flink.streaming.api.windowing.windows.TimeWindow;
/**
 * Keyed tumbling window driven by processing time (unbounded stream), aggregated
 * with {@code reduce}.
 *
 * <p>Reads "word,count" lines from a socket, keys the stream by word, assigns
 * 30-second tumbling processing-time windows and reduces each window by adding
 * the counts. The original author notes that both {@code sum} and {@code reduce}
 * aggregate incrementally.
 */
public class ProcessingTimeTumblingWindowDemo_01 {

    /**
     * Builds and runs the job.
     *
     * @param args unused
     * @throws Exception if the job fails
     */
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        StreamExecutionEnvironment env = StreamExecutionEnvironment.createLocalEnvironmentWithWebUI(conf);

        DataStreamSource<String> socketLines = env.socketTextStream("Master", 8888);

        // Splits a "word,count" line on commas and returns (word, count).
        MapFunction<String, Tuple2<String, Integer>> parser = new MapFunction<String, Tuple2<String, Integer>>() {
            @Override
            public Tuple2<String, Integer> map(String value) throws Exception {
                String[] parts = value.split(",");
                String word = parts[0];
                int count = Integer.parseInt(parts[1]);
                return Tuple2.of(word, count);
            }
        };
        SingleOutputStreamOperator<Tuple2<String, Integer>> wordCounts = socketLines.map(parser);

        // Key the stream by the word (field f0).
        KeyedStream<Tuple2<String, Integer>, String> byWord = wordCounts.keyBy(t -> t.f0);

        // Non-keyed window: no keyBy, call windowAll with a window assigner.
        // Keyed window: call keyBy first, then window with a window assigner.
        WindowedStream<Tuple2<String, Integer>, String, TimeWindow> thirtySecondWindows =
                byWord.window(TumblingProcessingTimeWindows.of(Time.seconds(30)));

        // Adds the incoming count onto the first tuple and returns that same tuple.
        ReduceFunction<Tuple2<String, Integer>> addCounts = new ReduceFunction<Tuple2<String, Integer>>() {
            @Override
            public Tuple2<String, Integer> reduce(Tuple2<String, Integer> acc, Tuple2<String, Integer> next) throws Exception {
                acc.f1 = acc.f1 + next.f1;
                return acc;
            }
        };
        thirtySecondWindows.reduce(addCounts).print();

        env.execute();
    }
}
3、NoKeyedEventTime滚动窗口
不分组,按照EventTime划分为滚动窗口,然后调用sum/reduce对窗口内的数据进行聚合
说明:
- 从flink1.11版本开始,接收旧的时间戳提取器(如BoundedOutOfOrdernessTimestampExtractor)的assignTimestampsAndWatermarks方法被标记为过时
- 如果是1.7~1.10的flink版本,使用assignTimestampsAndWatermarks方法时,未被标记过时!
触发公式:
- 当前分区中数据携带的最大EventTime - 乱序延迟时间 >= 窗口的结束时间,就会触发该窗口
代码:
package cn._51doit.flink.day04;
import org.apache.flink.api.common.functions.MapFunction;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.streaming.api.TimeCharacteristic;
import org.apache.flink.streaming.api.datastream.AllWindowedStream;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.timestamps.BoundedOutOfOrdernessTimestampExtractor;
import org.apache.flink.streaming.api.windowing.assigners.TumblingEventTimeWindows;
import org.apache.flink.streaming.api.windowing.time.Time;
import org.apache.flink.streaming.api.windowing.windows.TimeWindow;
/**
 * Non-keyed tumbling window driven by event time (legacy watermark API).
 *
 * <p>Reads "timestamp,number" lines from a socket, takes the first field as the
 * event timestamp, places all records into 5-second tumbling event-time windows
 * without calling keyBy, and prints the sum of the numbers in each window. Uses
 * the legacy {@code assignTimestampsAndWatermarks} overload that takes a
 * timestamp extractor.
 */
public class EventTimeTumblingWindowAllDemo {

    /**
     * Builds and runs the job.
     *
     * @param args unused
     * @throws Exception if the job fails
     */
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        StreamExecutionEnvironment env = StreamExecutionEnvironment.createLocalEnvironmentWithWebUI(conf);

        // Legacy API: use event time as the time characteristic.
        env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime);

        // Trigger rule stated by the author: a window fires once
        // (largest event time seen in the partition - out-of-orderness delay) >= window end.
        DataStreamSource<String> socketLines = env.socketTextStream("Master", 8888);

        // Extracts the event time as a long (milliseconds, per the original comment)
        // from the first comma-separated field; the allowed out-of-orderness is 0 seconds.
        BoundedOutOfOrdernessTimestampExtractor<String> timestampExtractor =
                new BoundedOutOfOrdernessTimestampExtractor<String>(Time.seconds(0)) {
                    @Override
                    public long extractTimestamp(String element) {
                        String timestampField = element.split(",")[0];
                        return Long.parseLong(timestampField);
                    }
                };

        // The elements of the returned stream are the same strings as before;
        // the call only attaches timestamps and generates watermarks.
        SingleOutputStreamOperator<String> withWatermarks = socketLines.assignTimestampsAndWatermarks(timestampExtractor);

        // Keep only the second field, parsed as an Integer.
        MapFunction<String, Integer> numberParser = new MapFunction<String, Integer>() {
            @Override
            public Integer map(String value) throws Exception {
                String numberField = value.split(",")[1];
                return Integer.parseInt(numberField);
            }
        };
        SingleOutputStreamOperator<Integer> numbers = withWatermarks.map(numberParser);

        // 5-second tumbling event-time windows over the whole (non-keyed) stream.
        AllWindowedStream<Integer, TimeWindow> fiveSecondWindows =
                numbers.windowAll(TumblingEventTimeWindows.of(Time.seconds(5)));

        // Sum the integers of each window and print the result.
        fiveSecondWindows.sum(0).print();

        env.execute();
    }
}
测试发现,无法触发
查看job:http://localhost:8081/#/job/bb868e633149b42c04c970c1fe58a52b/overview
注意:我们在webui页面查看发现,TriggerWindow的并行度只有1,所有数据都会跑到这一个TriggerWindow执行;而它上游生成WaterMark的算子有多个并行分区,TriggerWindow取所有上游分区中最小的WaterMark,所以仅有某一个分区满足触发公式时窗口不会被触发,只有所有分区都满足触发公式后才会触发
4、KeyedEventTime滚动窗口
先keyBy,再划分EventTime的滚动窗口
直接代码:老版本
package cn._51doit.flink.day04;
import org.apache.flink.api.common.functions.MapFunction;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.datastream.KeyedStream;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.datastream.WindowedStream;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.timestamps.BoundedOutOfOrdernessTimestampExtractor;
import org.apache.flink.streaming.api.windowing.assigners.TumblingEventTimeWindows;
import org.apache.flink.streaming.api.windowing.time.Time;
import org.apache.flink.streaming.api.windowing.windows.TimeWindow;
/**
 * Keyed tumbling window driven by event time (unbounded stream, legacy watermark API).
 *
 * <p>Reads "timestamp,word,count" lines from a socket. Timestamps and watermarks
 * are assigned directly on the source stream, then each line is mapped to
 * (word, count), keyed by word, placed into 5-second tumbling event-time windows
 * and summed per key.
 */
public class EventTimeTumblingWindowDemo {

    /** Uses the first comma-separated field as the event timestamp; out-of-orderness is 0 seconds. */
    private static class FirstFieldTimestampExtractor extends BoundedOutOfOrdernessTimestampExtractor<String> {
        FirstFieldTimestampExtractor() {
            super(Time.seconds(0));
        }

        @Override
        public long extractTimestamp(String element) {
            String timestampField = element.split(",")[0];
            return Long.parseLong(timestampField);
        }
    }

    /** Maps a "timestamp,word,count" line to (word, count). */
    private static class WordCountParser implements MapFunction<String, Tuple2<String, Integer>> {
        @Override
        public Tuple2<String, Integer> map(String value) throws Exception {
            String[] parts = value.split(",");
            String word = parts[1];
            int count = Integer.parseInt(parts[2]);
            return Tuple2.of(word, count);
        }
    }

    /**
     * Builds and runs the job.
     *
     * @param args unused
     * @throws Exception if the job fails
     */
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        StreamExecutionEnvironment env = StreamExecutionEnvironment.createLocalEnvironmentWithWebUI(conf);

        DataStreamSource<String> socketLines = env.socketTextStream("Master", 8888);

        SingleOutputStreamOperator<String> withWatermarks =
                socketLines.assignTimestampsAndWatermarks(new FirstFieldTimestampExtractor());

        SingleOutputStreamOperator<Tuple2<String, Integer>> wordCounts = withWatermarks.map(new WordCountParser());

        // Key the stream by the word (field f0).
        KeyedStream<Tuple2<String, Integer>, String> byWord = wordCounts.keyBy(t -> t.f0);

        // Non-keyed window: no keyBy, call windowAll with a window assigner.
        // Keyed window: call keyBy first, then window with a window assigner.
        WindowedStream<Tuple2<String, Integer>, String, TimeWindow> fiveSecondWindows =
                byWord.window(TumblingEventTimeWindows.of(Time.seconds(5)));

        // Aggregate with sum; per the author, a reduce that adds value2.f1 onto
        // value1.f1 and returns value1 could be used instead.
        fiveSecondWindows.sum(1).print();

        env.execute();
    }
}
控制台打印输出:
查看job:http://localhost:8081/#/job/ffcde22da27944eb0350afb4034f0fba/overview
我们在webui页面查看发现:
- TriggerWindow的并行度发现有4个分区,所以只要满足触发公式都会被触发
- 由于程序都跑到TriggerWindow里计算,而TriggerWindow有4个并行度,在执行程序后,输入有4条数据都会被触发,统计触发原则,除非一条数据没有下游,否则将按就近原则触发
- 对于迟到的数据会做过标签不会输出
直接代码:新版本api
package cn._51doit.flink.day04;
import org.apache.flink.api.common.functions.MapFunction;
import org.apache.flink.api.java.tuple.Tuple;
import org.apache.flink.api.java.tuple.Tuple3;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.datastream.KeyedStream;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.datastream.WindowedStream;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.timestamps.BoundedOutOfOrdernessTimestampExtractor;
import org.apache.flink.streaming.api.windowing.assigners.TumblingEventTimeWindows;
import org.apache.flink.streaming.api.windowing.time.Time;
import org.apache.flink.streaming.api.windowing.windows.TimeWindow;
/**
 * Keyed tumbling window driven by event time, with watermarks generated after a map.
 *
 * <p>Reads "timestamp,word,count" lines from a socket and first maps them to
 * (timestamp, word, count) with the map parallelism set to 2. Timestamps and
 * watermarks are assigned on that mapped stream rather than on the source stream.
 * The stream is then projected to (word, count), keyed by position 0, placed into
 * 5-second tumbling event-time windows and summed on position 1.
 */
public class EventTimeTumblingWindowDemo02 {

    /**
     * Builds and runs the job.
     *
     * @param args unused
     * @throws Exception if the job fails
     */
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        StreamExecutionEnvironment env = StreamExecutionEnvironment.createLocalEnvironmentWithWebUI(conf);

        DataStreamSource<String> socketLines = env.socketTextStream("Master", 8888);

        // Map first: parse each line into (timestamp, word, count).
        MapFunction<String, Tuple3<Long, String, Integer>> lineParser =
                new MapFunction<String, Tuple3<Long, String, Integer>>() {
                    @Override
                    public Tuple3<Long, String, Integer> map(String value) throws Exception {
                        String[] parts = value.split(",");
                        return Tuple3.of(Long.parseLong(parts[0]), parts[1], Integer.parseInt(parts[2]));
                    }
                };
        SingleOutputStreamOperator<Tuple3<Long, String, Integer>> parsed =
                socketLines.map(lineParser).setParallelism(2);

        // Generate watermarks on the mapped stream; the event time is field f0 and
        // the allowed out-of-orderness is 0 seconds.
        BoundedOutOfOrdernessTimestampExtractor<Tuple3<Long, String, Integer>> timestampExtractor =
                new BoundedOutOfOrdernessTimestampExtractor<Tuple3<Long, String, Integer>>(Time.seconds(0)) {
                    @Override
                    public long extractTimestamp(Tuple3<Long, String, Integer> element) {
                        return element.f0;
                    }
                };
        SingleOutputStreamOperator<Tuple3<Long, String, Integer>> withWatermarks =
                parsed.assignTimestampsAndWatermarks(timestampExtractor);

        // project(1, 2) keeps only (word, count); keyBy(0) then keys by the word.
        KeyedStream<Tuple, Tuple> byWord = withWatermarks.project(1, 2).keyBy(0);

        // Non-keyed window: no keyBy, call windowAll with a window assigner.
        // Keyed window: call keyBy first, then window with a window assigner.
        WindowedStream<Tuple, Tuple, TimeWindow> fiveSecondWindows =
                byWord.window(TumblingEventTimeWindows.of(Time.seconds(5)));

        // Aggregate with sum on position 1 (the count); reduce would work as well.
        fiveSecondWindows.sum(1).print();

        env.execute();
    }
}
查看job:http://localhost:8081/#/job/e79c400d027d1d6c8d3c1e0323123081/overview
发现,不在Source对应的DataStream生成Watermark,而是先调用map生成一个新的DataStream,在这个新的DataStream生成新的WaterMark
谢谢,分享完毕!