结论:flink的窗口计算是有状态的计算,并且会自动维护状态,将其持久化到外部系统(如 HDFS)
以下是验证过程
import com.mz.iot.utils.DateUtil;
import org.apache.flink.api.common.functions.AggregateFunction;
import org.apache.flink.api.common.functions.FilterFunction;
import org.apache.flink.api.common.functions.MapFunction;
import org.apache.flink.api.common.restartstrategy.RestartStrategies;
import org.apache.flink.api.java.tuple.Tuple;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.api.java.tuple.Tuple3;
import org.apache.flink.runtime.state.filesystem.FsStateBackend;
import org.apache.flink.streaming.api.CheckpointingMode;
import org.apache.flink.streaming.api.TimeCharacteristic;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.datastream.KeyedStream;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.CheckpointConfig;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.timestamps.BoundedOutOfOrdernessTimestampExtractor;
import org.apache.flink.streaming.api.functions.windowing.ProcessWindowFunction;
import org.apache.flink.streaming.api.windowing.assigners.TumblingEventTimeWindows;
import org.apache.flink.streaming.api.windowing.time.Time;
import org.apache.flink.streaming.api.windowing.windows.TimeWindow;
import org.apache.flink.util.Collector;
/**
* 采用奇数偶数求和的案例验证window的state是否自动维护
*/
public class TestWindowState {
public static void main(String[] args) throws Exception {
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
env.setParallelism(1);
env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime);
env.enableCheckpointing(1000, CheckpointingMode.EXACTLY_ONCE);
env.getCheckpointConfig().setCheckpointTimeout(60000L);
env.getCheckpointConfig().setMaxConcurrentCheckpoints(2);
env.getCheckpointConfig().setMinPauseBetweenCheckpoints(100L);
env.getCheckpointConfig().setPreferCheckpointForRecovery(true);
env.getCheckpointConfig().setTolerableCheckpointFailureNumber(1);
env.getCheckpointConfig().enableExternalizedCheckpoints(CheckpointConfig.ExternalizedCheckpointCleanup.RETAIN_ON_CANCELLATION);
env.setStateBackend(new FsStateBackend("hdfs://mz-hadoop-01:8020/ck"));
/**
* 对于使其宕机的数据,重启忽略该数据,就像这个数据从来没有输入过
*/
env.setRestartStrategy(RestartStrategies.fixedDelayRestart(0, 3000L));
/**
* 输入数据格式
* 5,2020-10-01 00:12:08
* 5,2020-10-01 00:13:08
* 5,2020-10-01 00:14:08
*
* 5,2020-10-01 00:15:08
* 5,2020-10-01 00:16:08
* 5,2020-10-01 00:17:08
*
* 5,2020-10-01 00:30:08
*/
DataStreamSource<String> socket = env.socketTextStream("192.168.0.162", 7777);
socket.print("socket stream========>");
SingleOutputStreamOperator<Tuple3<String, Integer, String>> mapStream = socket.map(new MapFunction<String, Tuple3<String, Integer, String>>() {
@Override
public Tuple3<String, Integer, String> map(String value) {
if ("x".equals(value)) {
System.out.println("exit:" + (1 / 0));
}
String[] s = value.split(",");
if (s.length == 2) {
int v = Integer.parseInt(s[0]);
String ts = s[1];
String k = String.valueOf(v % 2);
return Tuple3.of(k, v, ts);
}
return Tuple3.of("null", 0, "9999-99-99 00:00:00");
}
});
mapStream.print("mapStream===>");
SingleOutputStreamOperator<Tuple3<String, Integer, String>> filterStream = mapStream.filter(new FilterFunction<Tuple3<String, Integer, String>>() {
@Override
public boolean filter(Tuple3<String, Integer, String> value) throws Exception {
return !"null".equals(value.f0);
}
}).assignTimestampsAndWatermarks(new BoundedOutOfOrdernessTimestampExtractor<Tuple3<String, Integer, String>>(Time.seconds(1)) {
@Override
public long extractTimestamp(Tuple3<String, Integer, String> element) {
return DateUtil.getMillsFromString(element.f2);
}
});
filterStream.print("filterStream====>");
KeyedStream<Tuple3<String, Integer, String>, Tuple> keyedStream = filterStream
.keyBy(0);
keyedStream.print("keyed stream======>");
SingleOutputStreamOperator<OddEven> windowResult = keyedStream
.window(TumblingEventTimeWindows.of(Time.minutes(15), Time.minutes(0)))
.aggregate(new AggregateFunction<Tuple3<String, Integer, String>, OddEven, OddEven>() {
@Override
public OddEven createAccumulator() {
return new OddEven();
}
@Override
public OddEven add(Tuple3<String, Integer, String> value, OddEven accumulator) {
accumulator.setSum(accumulator.getSum() + value.f1);
return accumulator;
}
@Override
public OddEven getResult(OddEven accumulator) {
return accumulator;
}
@Override
public OddEven merge(OddEven a, OddEven b) {
return null;
}
},
new ProcessWindowFunction<OddEven, OddEven, Tuple, TimeWindow>() {
@Override
public void process(Tuple tuple, Context context, Iterable<OddEven> elements, Collector<OddEven> out) throws Exception {
TimeWindow window = context.window();
String t_start = DateUtil.getDateStrFromMill(window.getStart());
String t_end = DateUtil.getDateStrFromMill(window.getEnd());
System.out.println("t_start:" + t_start + ",t_end:" + t_end);
OddEven e = elements.iterator().next();
e.setT_start(t_start);
out.collect(e);
}
});
windowResult.print("window result====>");
env.execute("test window state with socket stream");
}
public static class OddEven {
private String t_start;
private int sum;
public OddEven(String t_start, int sum) {
this.t_start = t_start;
this.sum = sum;
}
public OddEven() {
}
@Override
public String toString() {
return "OddEven{" +
"t_start='" + t_start + '\'' +
", sum=" + sum +
'}';
}
public String getT_start() {
return t_start;
}
public void setT_start(String t_start) {
this.t_start = t_start;
}
public int getSum() {
return sum;
}
public void setSum(int sum) {
this.sum = sum;
}
}
}
上面的程序是一个奇数偶数分别求和的案例
首先提交job
分别输入第一个一刻钟内的三条数据:
* 5,2020-10-01 00:12:08
* 5,2020-10-01 00:13:08
* 5,2020-10-01 00:14:08
以及触发窗口计算的数据5,2020-10-01 00:15:08
从taskmanager中查看输出,没问题,该统计结果是15
我们接着输入x,使程序宕机
我们查看宕机时刻对应的 checkpoint 点位
重启程序,并使用上次的ck记录
查看taskmanager的log
接着我们输入如下数据
5,2020-10-01 00:16:08
5,2020-10-01 00:17:08
5,2020-10-01 00:30:08
重启后我们输入的数据之和只有10,但是我们看到,15-30分钟时间段内的和flink计算为15,也就是说宕机前的一条数据(00:15:08 那条)也参与了计算,说明窗口状态从 checkpoint 成功恢复,验证完毕