WaterMark:
数据会因为网络延迟和背压等原因产生乱序。对于迟到的数据(late element)不能无限期等待,必须有一种机制保证在特定时间之后触发window进行计算,这个机制就是WaterMark(水位线)。
package kb11.window;
import kb11.beans.SensorReading;
import org.apache.commons.collections.IteratorUtils;
import org.apache.flink.api.common.functions.MapFunction;
import org.apache.flink.api.common.serialization.SimpleStringSchema;
import org.apache.flink.api.java.tuple.Tuple;
import org.apache.flink.api.java.tuple.Tuple4;
import org.apache.flink.streaming.api.TimeCharacteristic;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.datastream.KeyedStream;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.datastream.WindowedStream;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.timestamps.BoundedOutOfOrdernessTimestampExtractor;
import org.apache.flink.streaming.api.functions.windowing.WindowFunction;
import org.apache.flink.streaming.api.windowing.time.Time;
import org.apache.flink.streaming.api.windowing.windows.TimeWindow;
import org.apache.flink.streaming.connectors.kafka.FlinkKafkaConsumer011;
import org.apache.flink.util.Collector;
import org.apache.kafka.clients.consumer.ConsumerConfig;
import java.util.Iterator;
import java.util.Properties;
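/**
* @Author Xulihua
* @Date 2021/7/1
* @Description Flink event-time window demo: reads sensor records from Kafka, assigns watermarks, and counts the records per sensor id in each 15-second window.
*/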
public class Window4 {

    /** How far the watermark lags behind the largest event timestamp seen so far. */
    private static final Time MAX_OUT_OF_ORDERNESS = Time.seconds(2);

    /** Length of each event-time window. */
    private static final Time WINDOW_SIZE = Time.seconds(15);

    /**
     * Builds and runs the job: Kafka topic {@code sensor} -> parse CSV lines -> assign
     * event-time timestamps and watermarks -> key by {@code id} -> 15-second windows ->
     * emit {@code (key, windowStart, windowEnd, elementCount)} per window and print it.
     *
     * @param args unused
     * @throws Exception if the job fails to build or execute
     */
    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        // Parallelism 1 keeps a single watermark source, which makes the demo output easy to follow.
        env.setParallelism(1);
        // Windows are driven by the timestamps carried in the records, not by wall-clock time.
        env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime);

        Properties prop = new Properties();
        prop.setProperty(ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG, "192.168.107.103:9092");
        prop.setProperty(ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG, "org.apache.kafka.common.serialization.StringDeserializer");
        prop.setProperty(ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG, "org.apache.kafka.common.serialization.StringDeserializer");
        prop.setProperty(ConsumerConfig.GROUP_ID_CONFIG, "group_kafka2");
        // Optional consumer settings, left disabled as in the original demo:
        // prop.setProperty(ConsumerConfig.AUTO_OFFSET_RESET_CONFIG, "earliest");
        // prop.setProperty(ConsumerConfig.ENABLE_AUTO_COMMIT_CONFIG, "false");

        FlinkKafkaConsumer011<String> kafkaSource =
                new FlinkKafkaConsumer011<>("sensor", new SimpleStringSchema(), prop);
        DataStreamSource<String> inputStream = env.addSource(kafkaSource);

        SingleOutputStreamOperator<SensorReading> mapStream = inputStream
                .map(new MapFunction<String, SensorReading>() {
                    @Override
                    public SensorReading map(String value) throws Exception {
                        return parseReading(value);
                    }
                })
                // Tolerate out-of-order records: the watermark trails the largest timestamp
                // seen so far by MAX_OUT_OF_ORDERNESS.
                .assignTimestampsAndWatermarks(
                        new BoundedOutOfOrdernessTimestampExtractor<SensorReading>(MAX_OUT_OF_ORDERNESS) {
                            @Override
                            public long extractTimestamp(SensorReading element) {
                                // The extractor must return milliseconds; the factor of 1000
                                // assumes the record timestamp is in seconds (TODO confirm
                                // against the producer).
                                return element.getTimestamp() * 1000L;
                            }
                        });

        // Key once and reuse the keyed stream for the windowed aggregation.
        KeyedStream<SensorReading, Tuple> keyByStream = mapStream.keyBy("id");

        SingleOutputStreamOperator<Tuple4<String, Long, Long, Integer>> applyStream = keyByStream
                .timeWindow(WINDOW_SIZE)
                .apply(new WindowFunction<SensorReading, Tuple4<String, Long, Long, Integer>, Tuple, TimeWindow>() {
                    @Override
                    public void apply(Tuple tuple, TimeWindow window, Iterable<SensorReading> input,
                                      Collector<Tuple4<String, Long, Long, Integer>> out) throws Exception {
                        String key = tuple.getField(0); // e.g. sensor_1
                        long start = window.getStart();
                        long end = window.getEnd();

                        // Count the elements directly instead of copying them into a list first.
                        int size = 0;
                        for (SensorReading ignored : input) {
                            size++;
                        }

                        out.collect(new Tuple4<>(key, start, end, size));
                    }
                });

        applyStream.print("apply");
        env.execute("job");
    }

    /**
     * Parses one comma-separated line of the form {@code id,timestamp,value}.
     * Fields after the third are ignored.
     *
     * @param line raw record text read from Kafka
     * @return the parsed reading
     * @throws IllegalArgumentException if the line has fewer than three fields
     * @throws NumberFormatException if the timestamp or value field is not numeric
     */
    private static SensorReading parseReading(String line) {
        String[] fields = line.split(",");
        if (fields.length < 3) {
            throw new IllegalArgumentException(
                    "Expected 'id,timestamp,value' but got: " + line);
        }
        return new SensorReading(fields[0], Long.parseLong(fields[1]), Double.parseDouble(fields[2]));
    }
}
程序中WaterMark的最大乱序等待时间(延迟)设置为2s,即 WaterMark = 当前最大事件时间 - 2s
以事件发生EventTime为时间语义
执行以下语句:
>sensor_3,1624864152122,36.3
>sensor_3,1624864152125,35.3
>sensor_3,1624864152130,37.8
>sensor_3,1624864152134,35.5
>sensor_3,1624864152135,38.6
以上输入对应WaterMark延迟为0(即把代码中的 Time.seconds(2) 改为 Time.seconds(0))的情况
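窗口的大小为15秒(windowSize = 15000毫秒,offset = 0),根据窗口起始时间公式
start = timestamp - (timestamp - offset + windowSize) % windowSize
计算得出窗口开启时间为1624864152120(秒,程序内部对应的毫秒值为1624864152120000)
窗口关闭时间为1624864152135(秒,程序内部对应的毫秒值为1624864152135000)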
左闭右开 有4条,
符合输出结果
将WaterMark 改为2s
依次输入
>sensor_3,1624864152122,36.3
>sensor_3,1624864152125,35.3
>sensor_3,1624864152130,37.8
>sensor_3,1624864152134,35.5
>sensor_3,1624864152135,38.6
窗口没有输出任何东西(此时WaterMark = 1624864152135 - 2 = 1624864152133,还没有到达窗口关闭时间1624864152135,所以窗口不会触发)
继续输入
>sensor_3,1624864152131,36.7
>sensor_3,1624864152137,35.6
窗口开启时间1624864152120000(毫秒)
窗口关闭时间1624864152135000(毫秒)
数据有5条 比之前多了一条是:
sensor_3,1624864152131,36.7
说明WaterMark生效了:迟到的数据 sensor_3,1624864152131 在窗口触发之前到达,仍然被计入该窗口
如果WaterMark延迟为0,窗口在收到1624864152135时就已经触发,这条迟到的数据就丢失了(当然也可以使用allowedLateness 来保证)