一、介绍
Flink 的状态分为两类:算子状态(Operator State)和按键分区状态(Keyed State)。简单理解,状态就是任务在运行过程中保存的中间结果或数值。
二、按键分区状态(Keyed State)
基于 KeyedStream 上的状态。这个状态是跟特定的 key 绑定的,对 KeyedStream 流上的每一个 key,都对应一个 state。
按键分区状态分为:ValueState、ListState、ReducingState、MapState、AggregatingState
2.1、ValueState
即类型为T的单值状态
package com.xx.state;
import com.xx.entity.WaterSensor;
import com.xx.functions.WaterSensorMapFunction;
import org.apache.flink.api.common.eventtime.WatermarkStrategy;
import org.apache.flink.api.common.state.ValueState;
import org.apache.flink.api.common.state.ValueStateDescriptor;
import org.apache.flink.api.common.typeinfo.Types;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.KeyedProcessFunction;
import org.apache.flink.util.Collector;
import java.time.Duration;
/**
 * Keyed-state example built on {@link ValueState}, a single-value state of type {@code T}.
 *
 * <p>Keyed state is bound to the current key of a keyed stream: every distinct key has its
 * own independent state instance. Operator state, by contrast, is bound to an operator
 * instance and is independent of any key.
 *
 * <p>The job reads sensor readings from a socket, keys them by sensor id and emits an alarm
 * whenever the water level of two consecutive readings of the same sensor differs by more
 * than 10.
 *
 * @author xiaxing
 * @since 2024/3/29 11:10
 */
public class KeyedValueStateDemo {

    /**
     * Builds and runs the streaming job.
     *
     * @param args unused command-line arguments
     * @throws Exception if the job fails to build or execute
     */
    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        env.setParallelism(1);

        // Event time is the sensor timestamp multiplied by 1000 (presumably seconds to
        // milliseconds - confirm against WaterSensor); up to 3 s of out-of-orderness is tolerated.
        WatermarkStrategy<WaterSensor> watermarks = WatermarkStrategy
                .<WaterSensor>forBoundedOutOfOrderness(Duration.ofSeconds(3))
                .withTimestampAssigner((element, ts) -> element.getTs() * 1000L);

        SingleOutputStreamOperator<WaterSensor> sensorDS = env
                .socketTextStream("127.0.0.1", 7777)
                .map(new WaterSensorMapFunction())
                .assignTimestampsAndWatermarks(watermarks);

        sensorDS.keyBy(WaterSensor::getId)
                .process(new VcJumpAlarmFunction())
                .print();

        env.execute();
    }

    /**
     * Emits an alarm when the water level differs from the previous reading of the same key
     * by more than 10.
     */
    private static final class VcJumpAlarmFunction
            extends KeyedProcessFunction<String, WaterSensor, String> {

        /** Water level of the previous reading for the current key. */
        ValueState<Integer> lastVcState;

        @Override
        public void open(Configuration parameters) throws Exception {
            super.open(parameters);
            ValueStateDescriptor<Integer> descriptor =
                    new ValueStateDescriptor<>("lastVcState", Types.INT);
            lastVcState = getRuntimeContext().getState(descriptor);
        }

        @Override
        public void processElement(WaterSensor value, Context ctx, Collector<String> out)
                throws Exception {
            // 1. Previous water level; a key without stored state is treated as 0, so the
            //    very first reading of a key is compared against 0.
            Integer stored = lastVcState.value();
            Integer previousVc = (stored == null) ? 0 : stored;

            // 2. Absolute difference against the current reading; alarm when it exceeds 10.
            int difference = Math.abs(value.getVc() - previousVc);
            if (difference > 10) {
                out.collect("id为:" + value.getId() + ",当前水位值:" + value.getVc() + ",上一条水位值:" + previousVc + ",相差超过10!!!");
            }

            // 3. Remember the current water level for the next reading of this key.
            lastVcState.update(value.getVc());
        }
    }
}
2.2、ListState
即key上的状态值为一个列表
package com.xx.state;
import com.xx.entity.WaterSensor;
import com.xx.functions.WaterSensorMapFunction;
import org.apache.flink.api.common.eventtime.WatermarkStrategy;
import org.apache.flink.api.common.state.ListState;
import org.apache.flink.api.common.state.ListStateDescriptor;
import org.apache.flink.api.common.typeinfo.Types;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.KeyedProcessFunction;
import org.apache.flink.util.Collector;
import java.time.Duration;
import java.util.ArrayList;
import java.util.List;
/**
 * Keyed-state example built on {@link ListState}, where the state of each key is a list.
 *
 * <p>Keyed state is bound to the current key of a keyed stream: every distinct key has its
 * own independent state instance. Operator state, by contrast, is bound to an operator
 * instance and is independent of any key.
 *
 * <p>The job reads sensor readings from a socket, keys them by sensor id and, for every
 * reading, emits the three largest water levels seen so far for that sensor.
 *
 * @author xiaxing
 * @since 2024/3/29 11:10
 */
public class KeyedListStateDemo {

    /** Number of largest water levels kept per key (the output message text says "three"). */
    private static final int TOP_N = 3;

    /**
     * Builds and runs the streaming job.
     *
     * @param args unused command-line arguments
     * @throws Exception if the job fails to build or execute
     */
    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        env.setParallelism(1);
        SingleOutputStreamOperator<WaterSensor> sensorDS = env.socketTextStream("127.0.0.1", 7777)
                .map(new WaterSensorMapFunction())
                .assignTimestampsAndWatermarks(
                        WatermarkStrategy.<WaterSensor>forBoundedOutOfOrderness(Duration.ofSeconds(3))
                                .withTimestampAssigner((element, ts) -> element.getTs() * 1000L));

        // Keep the three largest water levels per sensor id.
        sensorDS.keyBy(WaterSensor::getId).process(new KeyedProcessFunction<String, WaterSensor, String>() {

            /** Largest water levels seen so far for the current key. */
            ListState<Integer> listState;

            @Override
            public void open(Configuration parameters) throws Exception {
                super.open(parameters);
                listState = getRuntimeContext()
                        .getListState(new ListStateDescriptor<>("vcListState", Types.INT));
            }

            @Override
            public void processElement(WaterSensor value, KeyedProcessFunction<String, WaterSensor, String>.Context ctx, Collector<String> out) throws Exception {
                // 1. Append the new water level to the state.
                listState.add(value.getVc());

                // 2. Copy the state into a list and sort it in descending order.
                //    Integer.compare avoids the overflow of the subtraction idiom (o2 - o1),
                //    which yields a wrong order for operands of opposite sign and large magnitude.
                List<Integer> result = new ArrayList<>();
                for (Integer vc : listState.get()) {
                    result.add(vc);
                }
                result.sort((left, right) -> Integer.compare(right, left));

                // 3. Drop everything after the first TOP_N entries; unlike removing a single
                //    index this also holds when the list is more than one element too long.
                if (result.size() > TOP_N) {
                    result.subList(TOP_N, result.size()).clear();
                }
                out.collect("id为:" + value.getId() + ",最大的三个水位值:" + result);

                // 4. Write the trimmed list back as the new state.
                listState.update(result);
            }
        }).print();
        env.execute();
    }
}
2.3、MapState
状态值为一个map
package com.xx.state;
import com.xx.entity.WaterSensor;
import com.xx.functions.WaterSensorMapFunction;
import org.apache.flink.api.common.eventtime.WatermarkStrategy;
import org.apache.flink.api.common.state.MapState;
import org.apache.flink.api.common.state.MapStateDescriptor;
import org.apache.flink.api.common.typeinfo.Types;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.KeyedProcessFunction;
import org.apache.flink.util.Collector;
import java.time.Duration;
/**
 * Keyed-state example built on {@link MapState}, where the state of each key is a map.
 *
 * <p>Keyed state is bound to the current key of a keyed stream: every distinct key has its
 * own independent state instance. Operator state, by contrast, is bound to an operator
 * instance and is independent of any key.
 *
 * <p>The job reads sensor readings from a socket, keys them by sensor id and counts, per
 * sensor, how many times each water-level value has occurred. After every reading the
 * complete count table of that sensor is emitted.
 *
 * @author xiaxing
 * @since 2024/3/29 11:10
 */
public class KeyedMapStateDemo {

    /**
     * Builds and runs the streaming job.
     *
     * @param args unused command-line arguments
     * @throws Exception if the job fails to build or execute
     */
    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        env.setParallelism(1);

        WatermarkStrategy<WaterSensor> watermarks = WatermarkStrategy
                .<WaterSensor>forBoundedOutOfOrderness(Duration.ofSeconds(3))
                .withTimestampAssigner((element, ts) -> element.getTs() * 1000L);

        SingleOutputStreamOperator<WaterSensor> sensorDS = env
                .socketTextStream("127.0.0.1", 7777)
                .map(new WaterSensorMapFunction())
                .assignTimestampsAndWatermarks(watermarks);

        sensorDS.keyBy(WaterSensor::getId)
                .process(new VcOccurrenceCounter())
                .print();

        env.execute();
    }

    /** Counts, per key, the occurrences of every water-level value. */
    private static final class VcOccurrenceCounter
            extends KeyedProcessFunction<String, WaterSensor, String> {

        /** Maps a water-level value to the number of times it was seen for the current key. */
        MapState<Integer, Integer> mapState;

        @Override
        public void open(Configuration parameters) throws Exception {
            super.open(parameters);
            MapStateDescriptor<Integer, Integer> descriptor =
                    new MapStateDescriptor<>("vcMapState", Types.INT, Types.INT);
            mapState = getRuntimeContext().getMapState(descriptor);
        }

        @Override
        public void processElement(WaterSensor value, Context ctx, Collector<String> out)
                throws Exception {
            // Increment the counter of this water level, starting at 1 for a new value.
            Integer vc = value.getVc();
            Integer occurrences = mapState.contains(vc) ? mapState.get(vc) + 1 : 1;
            mapState.put(vc, occurrences);

            // Emit the full count table of the current key.
            StringBuilder report = new StringBuilder("id为:");
            report.append(value.getId());
            for (Integer level : mapState.keys()) {
                report.append(",key:").append(level);
                report.append(",value:").append(mapState.get(level));
            }
            out.collect(report.toString());
        }
    }
}
2.4、ReducingState
这种状态通过用户传入的 reduceFunction,每次调用 add 方法添加值的时候,会调用 reduceFunction,最后合并到一个单一的状态值
package com.xx.state;
import com.xx.entity.WaterSensor;
import com.xx.functions.WaterSensorMapFunction;
import org.apache.flink.api.common.eventtime.WatermarkStrategy;
import org.apache.flink.api.common.functions.ReduceFunction;
import org.apache.flink.api.common.state.ReducingState;
import org.apache.flink.api.common.state.ReducingStateDescriptor;
import org.apache.flink.api.common.typeinfo.Types;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.KeyedProcessFunction;
import org.apache.flink.util.Collector;
import java.time.Duration;
/**
 * Keyed-state example built on {@link ReducingState}.
 *
 * <p>A reducing state folds every value passed to {@code add} into a single stored value by
 * means of a user-supplied {@link ReduceFunction}. Keyed state is bound to the current key of
 * a keyed stream; operator state, by contrast, is bound to an operator instance and is
 * independent of any key.
 *
 * <p>The job reads sensor readings from a socket, keys them by sensor id and emits the
 * running sum of the water levels of each sensor.
 *
 * @author xiaxing
 * @since 2024/3/29 11:10
 */
public class KeyedReducingStateDemo {

    /**
     * Builds and runs the streaming job.
     *
     * @param args unused command-line arguments
     * @throws Exception if the job fails to build or execute
     */
    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        env.setParallelism(1);

        WatermarkStrategy<WaterSensor> watermarks = WatermarkStrategy
                .<WaterSensor>forBoundedOutOfOrderness(Duration.ofSeconds(3))
                .withTimestampAssigner((element, ts) -> element.getTs() * 1000L);

        SingleOutputStreamOperator<WaterSensor> sensorDS = env
                .socketTextStream("127.0.0.1", 7777)
                .map(new WaterSensorMapFunction())
                .assignTimestampsAndWatermarks(watermarks);

        sensorDS.keyBy(WaterSensor::getId)
                .process(new VcSumFunction())
                .print();

        env.execute();
    }

    /** Emits, for every reading, the sum of all water levels seen so far for its key. */
    private static final class VcSumFunction
            extends KeyedProcessFunction<String, WaterSensor, String> {

        /** Running sum of the water levels of the current key. */
        ReducingState<Integer> reducingState;

        @Override
        public void open(Configuration parameters) throws Exception {
            super.open(parameters);
            // Each add() combines the new value with the stored one through Integer::sum.
            ReduceFunction<Integer> sum = Integer::sum;
            ReducingStateDescriptor<Integer> descriptor =
                    new ReducingStateDescriptor<>("vcReduceState", sum, Types.INT);
            reducingState = getRuntimeContext().getReducingState(descriptor);
        }

        @Override
        public void processElement(WaterSensor value, Context ctx, Collector<String> out)
                throws Exception {
            reducingState.add(value.getVc());
            Integer total = reducingState.get();
            out.collect("id为:" + value.getId() + ",水位线总和:" + total);
        }
    }
}
2.5、AggregatingState
与 ReducingState 类似,但聚合逻辑由用户传入的 AggregateFunction 定义,通过累加器进行聚合,输入类型与输出类型可以不同
package com.xx.state;
import com.xx.entity.WaterSensor;
import com.xx.functions.WaterSensorMapFunction;
import org.apache.flink.api.common.eventtime.WatermarkStrategy;
import org.apache.flink.api.common.functions.AggregateFunction;
import org.apache.flink.api.common.state.AggregatingState;
import org.apache.flink.api.common.state.AggregatingStateDescriptor;
import org.apache.flink.api.common.typeinfo.Types;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.KeyedProcessFunction;
import org.apache.flink.util.Collector;
import java.time.Duration;
/**
 * Keyed-state example built on {@link AggregatingState}.
 *
 * <p>An aggregating state folds every value passed to {@code add} into an accumulator by
 * means of a user-supplied {@link AggregateFunction}; the type read back from the state may
 * differ from the type added to it. Keyed state is bound to the current key of a keyed
 * stream; operator state, by contrast, is bound to an operator instance and is independent
 * of any key.
 *
 * <p>The job reads sensor readings from a socket, keys them by sensor id and emits the
 * running average of the water levels of each sensor.
 *
 * @author xiaxing
 * @since 2024/3/29 11:10
 */
public class KeyedAggregatingStateDemo {

    /**
     * Builds and runs the streaming job.
     *
     * @param args unused command-line arguments
     * @throws Exception if the job fails to build or execute
     */
    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        env.setParallelism(1);
        SingleOutputStreamOperator<WaterSensor> sensorDS = env.socketTextStream("127.0.0.1", 7777)
                .map(new WaterSensorMapFunction())
                .assignTimestampsAndWatermarks(
                        WatermarkStrategy.<WaterSensor>forBoundedOutOfOrderness(Duration.ofSeconds(3))
                                .withTimestampAssigner((element, ts) -> element.getTs() * 1000L));

        // Running average of the water level per sensor id.
        sensorDS.keyBy(WaterSensor::getId).process(new KeyedProcessFunction<String, WaterSensor, String>() {

            /** Average water level of the current key; input is a level, output the mean. */
            AggregatingState<Integer, Double> avgVcState;

            @Override
            public void open(Configuration parameters) throws Exception {
                super.open(parameters);
                avgVcState = getRuntimeContext()
                        .getAggregatingState(new AggregatingStateDescriptor<>(
                                "aggregatingState",
                                new AverageVcFunction(),
                                Types.TUPLE(Types.INT, Types.INT)));
            }

            @Override
            public void processElement(WaterSensor value, KeyedProcessFunction<String, WaterSensor, String>.Context ctx, Collector<String> out) throws Exception {
                avgVcState.add(value.getVc());
                out.collect("id为:" + value.getId() + ",平均水位值:" + avgVcState.get());
            }
        }).print();
        env.execute();
    }

    /**
     * Computes the arithmetic mean of integer water levels.
     *
     * <p>The accumulator is a pair {@code (sum, count)}.
     */
    private static final class AverageVcFunction
            implements AggregateFunction<Integer, Tuple2<Integer, Integer>, Double> {

        /** @return an empty accumulator with sum 0 and count 0 */
        @Override
        public Tuple2<Integer, Integer> createAccumulator() {
            return Tuple2.of(0, 0);
        }

        /**
         * @param value       water level to add
         * @param accumulator current {@code (sum, count)}
         * @return a new accumulator with {@code value} added to the sum and the count increased by one
         */
        @Override
        public Tuple2<Integer, Integer> add(Integer value, Tuple2<Integer, Integer> accumulator) {
            return Tuple2.of(accumulator.f0 + value, accumulator.f1 + 1);
        }

        /**
         * @param accumulator current {@code (sum, count)}
         * @return sum divided by count as a double
         */
        @Override
        public Double getResult(Tuple2<Integer, Integer> accumulator) {
            return accumulator.f0 * 1D / accumulator.f1;
        }

        /**
         * Combines two partial accumulators by adding their sums and their counts.
         *
         * <p>Previously this returned {@code null}, which would break any caller that merges
         * accumulators.
         *
         * @param a first partial {@code (sum, count)}
         * @param b second partial {@code (sum, count)}
         * @return the combined {@code (sum, count)}
         */
        @Override
        public Tuple2<Integer, Integer> merge(Tuple2<Integer, Integer> a, Tuple2<Integer, Integer> b) {
            return Tuple2.of(a.f0 + b.f0, a.f1 + b.f1);
        }
    }
}
2.6、状态生存时间
为状态设置生存时间(TTL),状态过期后会被清理,避免状态数据大量积累而浪费资源
package com.xx.state;
import com.xx.entity.WaterSensor;
import com.xx.functions.WaterSensorMapFunction;
import org.apache.flink.api.common.eventtime.WatermarkStrategy;
import org.apache.flink.api.common.state.StateTtlConfig;
import org.apache.flink.api.common.state.ValueState;
import org.apache.flink.api.common.state.ValueStateDescriptor;
import org.apache.flink.api.common.time.Time;
import org.apache.flink.api.common.typeinfo.Types;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.KeyedProcessFunction;
import org.apache.flink.util.Collector;
import java.time.Duration;
/**
 * Keyed-state example showing state time-to-live (TTL) on a {@link ValueState}.
 *
 * <p>Keyed state is bound to the current key of a keyed stream: every distinct key has its
 * own independent state instance. Operator state, by contrast, is bound to an operator
 * instance and is independent of any key.
 *
 * <p>The job reads sensor readings from a socket, keys them by sensor id, prints the value
 * currently held in state for that sensor and then stores the new water level. The state is
 * configured to expire 5 seconds after it was last created or written.
 *
 * @author xiaxing
 * @since 2024/3/29 11:10
 */
public class KeyedValueTtlStateDemo {

    /**
     * Builds and runs the streaming job.
     *
     * @param args unused command-line arguments
     * @throws Exception if the job fails to build or execute
     */
    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        env.setParallelism(1);

        WatermarkStrategy<WaterSensor> watermarks = WatermarkStrategy
                .<WaterSensor>forBoundedOutOfOrderness(Duration.ofSeconds(3))
                .withTimestampAssigner((element, ts) -> element.getTs() * 1000L);

        SingleOutputStreamOperator<WaterSensor> sensorDS = env
                .socketTextStream("127.0.0.1", 7777)
                .map(new WaterSensorMapFunction())
                .assignTimestampsAndWatermarks(watermarks);

        sensorDS.keyBy(WaterSensor::getId)
                .process(new LastVcWithTtlFunction())
                .print();

        env.execute();
    }

    /** Prints the stored water level of the current key, then overwrites it. */
    private static final class LastVcWithTtlFunction
            extends KeyedProcessFunction<String, WaterSensor, String> {

        /** Previous water level of the current key; {@code null} when absent or expired. */
        ValueState<Integer> lastVcState;

        @Override
        public void open(Configuration parameters) throws Exception {
            super.open(parameters);

            StateTtlConfig.Builder ttlBuilder = StateTtlConfig.newBuilder(Time.seconds(5)); // expire after 5 s
            // Creating or writing the state restarts the expiration timer; reads do not.
            ttlBuilder.setUpdateType(StateTtlConfig.UpdateType.OnCreateAndWrite);
            // An expired value is never handed out.
            ttlBuilder.setStateVisibility(StateTtlConfig.StateVisibility.NeverReturnExpired);
            StateTtlConfig ttlConfig = ttlBuilder.build();

            // TTL is switched on at the descriptor before the state handle is obtained.
            ValueStateDescriptor<Integer> descriptor =
                    new ValueStateDescriptor<>("lastVcState", Types.INT);
            descriptor.enableTimeToLive(ttlConfig);
            this.lastVcState = getRuntimeContext().getState(descriptor);
        }

        @Override
        public void processElement(WaterSensor value, Context ctx, Collector<String> out)
                throws Exception {
            Integer storedVc = lastVcState.value();
            out.collect("id为:" + value.getId() + ",状态值:" + storedVc);
            lastVcState.update(value.getVc());
        }
    }
}
三、算子状态(Operator State)
与 Key 无关的 State,与 Operator 绑定的 state,整个 operator 只对应一个 state,常用于 Source 和 Sink 等与外部系统连接的算子上,实际使用不多。
比如Flink中的Kafka Connector,它会在每个 connector 实例中,保存该实例中消费 topic 的所有(partition, offset)映射
算子状态包括:ListState、Broadcast State
3.1、ListState
package com.xx.state;
import org.apache.flink.api.common.functions.MapFunction;
import org.apache.flink.api.common.state.ListState;
import org.apache.flink.api.common.state.ListStateDescriptor;
import org.apache.flink.api.common.typeinfo.Types;
import org.apache.flink.runtime.state.FunctionInitializationContext;
import org.apache.flink.runtime.state.FunctionSnapshotContext;
import org.apache.flink.streaming.api.checkpoint.CheckpointedFunction;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
/**
 * Operator-state example built on {@link ListState}: a map operator that counts the records
 * it has processed and keeps that count in operator state through checkpoints.
 *
 * @author xiaxing
 * @since 2024/3/29 15:34
 */
public class OperatorListStateDemo {

    /**
     * Builds and runs the streaming job.
     *
     * @param args unused command-line arguments
     * @throws Exception if the job fails to build or execute
     */
    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        env.setParallelism(1);
        env.socketTextStream("127.0.0.1", 7777)
                .map(new MyCountMapFunction()).print();
        env.execute();
    }

    /**
     * Maps every input line to the number of lines processed so far, the current line included.
     *
     * <p>The counter lives in a local field; it is copied into operator state on every
     * snapshot and read back from operator state on restore.
     */
    public static class MyCountMapFunction implements MapFunction<String, Long>, CheckpointedFunction {

        /** Number of records processed by this instance. */
        private long count = 0L;

        /** Operator state holding the counter between checkpoints. */
        private ListState<Long> state;

        /**
         * Counts the record and returns the updated count.
         *
         * <p>Pre-increment is required: the post-increment form returned the value before
         * counting, so the first record reported 0 instead of 1.
         *
         * @param value the input line (its content is not inspected)
         * @return the number of records seen so far, including this one
         */
        @Override
        public Long map(String value) throws Exception {
            return ++count;
        }

        /**
         * Copies the local counter into operator state.
         *
         * @param context snapshot context (unused)
         */
        @Override
        public void snapshotState(FunctionSnapshotContext context) throws Exception {
            System.out.println("snapshotState...");
            // Replace the previous content so the state holds exactly the current count.
            state.clear();
            state.add(count);
        }

        /**
         * Obtains the operator state and, on restore, initializes the local counter from it.
         *
         * @param context gives access to the operator state store and the restore flag
         */
        @Override
        public void initializeState(FunctionInitializationContext context) throws Exception {
            System.out.println("initializeState...");
            state = context
                    .getOperatorStateStore()
                    .getListState(new ListStateDescriptor<>("state", Types.LONG));
            // Sum all restored entries into the local counter.
            if (context.isRestored()) {
                for (Long restored : state.get()) {
                    count += restored;
                }
            }
        }
    }
}
3.2、Broadcast State
Broadcast State 是 Flink 1.5 引入的新特性。在开发过程中,如果遇到需要下发/广播配置、规则等低吞吐事件流到下游所有 task 时,就可以使用 Broadcast State 特性。下游的 task 接收这些配置、规则并保存为 BroadcastState,再将这些配置应用到另一个数据流的计算中。
package com.xx.state;
import com.xx.entity.WaterSensor;
import com.xx.functions.WaterSensorMapFunction;
import org.apache.flink.api.common.state.BroadcastState;
import org.apache.flink.api.common.state.MapStateDescriptor;
import org.apache.flink.api.common.state.ReadOnlyBroadcastState;
import org.apache.flink.api.common.typeinfo.Types;
import org.apache.flink.streaming.api.datastream.BroadcastConnectedStream;
import org.apache.flink.streaming.api.datastream.BroadcastStream;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.co.BroadcastProcessFunction;
import org.apache.flink.util.Collector;
/**
 * Operator-state example built on broadcast state.
 *
 * <p>Sensor readings arrive on one socket and a threshold value on another. The threshold
 * stream is broadcast and stored in broadcast state under the key {@code "config"}; every
 * sensor reading whose water level exceeds the stored threshold produces an output line.
 *
 * @author xiaxing
 * @since 2024/3/29 15:34
 */
public class OperatorBroadcastStateDemo {

    /**
     * Builds and runs the streaming job.
     *
     * @param args unused command-line arguments
     * @throws Exception if the job fails to build or execute
     */
    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        env.setParallelism(1);

        // Data stream: sensor readings.
        SingleOutputStreamOperator<WaterSensor> sensorDS = env
                .socketTextStream("127.0.0.1", 7777)
                .map(new WaterSensorMapFunction());

        // Config stream: carries the threshold to broadcast.
        DataStreamSource<String> configSource = env.socketTextStream("127.0.0.1", 8888);

        // Broadcast the config stream, then connect the data stream with it.
        MapStateDescriptor<String, String> broadcastMapState =
                new MapStateDescriptor<>("broadcast-state", Types.STRING, Types.STRING);
        BroadcastStream<String> broadcast = configSource.broadcast(broadcastMapState);
        BroadcastConnectedStream<WaterSensor, String> connect = sensorDS.connect(broadcast);

        connect.process(new ThresholdAlarmFunction(broadcastMapState)).print();

        env.execute();
    }

    /** Compares each reading against the most recently broadcast threshold. */
    private static final class ThresholdAlarmFunction
            extends BroadcastProcessFunction<WaterSensor, String, String> {

        /** Descriptor used to look up the broadcast state on both inputs. */
        private final MapStateDescriptor<String, String> stateDescriptor;

        ThresholdAlarmFunction(MapStateDescriptor<String, String> stateDescriptor) {
            this.stateDescriptor = stateDescriptor;
        }

        /**
         * Handles a record of the data stream.
         *
         * <p>NOTE: the stored config is parsed with {@code Integer.parseInt}, so a
         * non-numeric config value makes this method throw {@code NumberFormatException}.
         */
        @Override
        public void processElement(WaterSensor value, ReadOnlyContext ctx, Collector<String> out)
                throws Exception {
            // Read-only view of the broadcast state; fall back to "0" until a config arrives.
            ReadOnlyBroadcastState<String, String> broadcastState =
                    ctx.getBroadcastState(stateDescriptor);
            String stored = broadcastState.get("config");
            String config = (stored == null) ? "0" : stored;

            int threshold = Integer.parseInt(config);
            if (threshold < value.getVc()) {
                out.collect("水位超过指定的预置:" + config + ",当前水位:" + value.getVc());
            }
        }

        /** Handles a record of the broadcast config stream by storing it as the new threshold. */
        @Override
        public void processBroadcastElement(String value, Context ctx, Collector<String> out)
                throws Exception {
            BroadcastState<String, String> broadcastState = ctx.getBroadcastState(stateDescriptor);
            broadcastState.put("config", value);
        }
    }
}