状态的基础概念
1.State状态
Flink实时计算程序为了保证计算过程中,出现异常可以容错,就要将中间的计算结果数据存储起来,这些中间数据就叫做State
State可以是多种类型的,默认是保存在JobManager的内存中,也可以保存到TaskManager本地文件系统或HDFS这样的分布式文件系统
2.StateBackEnd
用来保存State的存储后端就叫做StateBackEnd,默认是保存在JobManager的内存中,也可以保存在本地文件系统或HDFS这样的分布式文件系统
3.CheckPointing
Flink实时计算为了容错,可以将中间数据定期保存起来,这种定期触发保存中间结果的机制叫CheckPointing. CheckPointing是周期执行的,具体的过程是JobManager定期地向TaskManager中的SubTask发送RPC消息,SubTask将其计算的State保存到StateBackEnd中,并且向JobManager响应CheckPointing是否成功.如果程序出现异常或重启,TaskManager中的SubTask可以从上一次成功的CheckPointing的State恢复
4.重启策略
Flink实时计算程序,为了容错,需要开启CheckPointing,一旦开启CheckPointing,如果没有设置重启策略,默认的重启策略是无限重启,也可以设置其他重启策略.如:重启固定次数且可以延迟执行的策略
5.CheckPointingMode
exactly-once 精确一次性语义,可以保证数据被消费且只消费一次,但是要结合对应的数据源,比如Kafka支持exactly-once
at-least-once 至少消费一次,可能会重复消费,但是效率要比exactly-once高
ValueState
/**
 * Fault-tolerant word count implemented with Flink's {@code ValueState} API.
 *
 * <p>Flink state comes in two flavours: keyed state (available after {@code keyBy})
 * and operator state (no {@code keyBy}). {@code ValueState} is one kind of keyed state.
 *
 * <p>For the running totals to survive a failure, checkpointing must be enabled and the
 * intermediate result must be kept in a Flink state handle rather than in a plain field.
 *
 * <p>Any input line starting with {@code error} deliberately throws, which makes it easy
 * to trigger a restart and observe the counts being restored.
 */
public class ValueStateDemo {

    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();

        // Checkpoint every 10 000 ms.
        env.enableCheckpointing(10000);
        // Explicit restart strategy: at most 3 restarts, waiting 5 seconds before each one.
        env.setRestartStrategy(RestartStrategies.fixedDelayRestart(3, Time.seconds(5)));

        // Source: text lines from a socket.
        DataStreamSource<String> lines = env.socketTextStream("linux01", 7777);

        // Turn each line into (firstToken, 1); lines starting with "error" fail the task.
        SingleOutputStreamOperator<Tuple2<String, Integer>> wordAndOne = lines.map(
                new MapFunction<String, Tuple2<String, Integer>>() {
                    @Override
                    public Tuple2<String, Integer> map(String line) throws Exception {
                        if (line.startsWith("error")) {
                            throw new RuntimeException("输入的数据错误,抛出异常");
                        }
                        return Tuple2.of(line.split(" ")[0], 1);
                    }
                });

        // Partition by word so that every word gets its own ValueState slot.
        KeyedStream<Tuple2<String, Integer>, String> keyedStream = wordAndOne.keyBy(tp -> tp.f0);

        SingleOutputStreamOperator<Tuple2<String, Integer>> result = keyedStream.map(
                new RichMapFunction<Tuple2<String, Integer>, Tuple2<String, Integer>>() {

                    /** Running count for the current key. */
                    private ValueState<Integer> countState;

                    /** Creates the state handle, or re-attaches to restored state. */
                    @Override
                    public void open(Configuration parameters) throws Exception {
                        ValueStateDescriptor<Integer> descriptor =
                                new ValueStateDescriptor<>("wc_value_state", Types.INT);
                        countState = getRuntimeContext().getState(descriptor);
                    }

                    /** Adds the incoming count to the stored total and emits the new total. */
                    @Override
                    public Tuple2<String, Integer> map(Tuple2<String, Integer> input) throws Exception {
                        // A null value means this key has not been seen before.
                        Integer previous = countState.value();
                        int total = input.f1 + (previous == null ? 0 : previous);

                        countState.update(total);
                        input.f1 = total;
                        return input;
                    }
                });

        result.print();
        env.execute();
    }
}
MapState
/**
 * Accumulates the amount of money per city within each province using Flink's
 * {@code MapState}.
 *
 * <p>Sample input (one record per line, {@code province,city,amount}):
 * <pre>
 * 辽宁省,沈阳市,8000
 * 辽宁省,大连市,7000
 * 辽宁省,鞍山市,6000
 * 辽宁省,鞍山市,8000
 * </pre>
 */
public class MapStateDemo {

    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();

        // Checkpoint every 10 000 ms.
        env.enableCheckpointing(10000);
        // Failure-rate restart strategy: up to 3 failures per 30-second interval,
        // with a 5-second delay before each restart.
        env.setRestartStrategy(
                RestartStrategies.failureRateRestart(3, Time.seconds(30), Time.seconds(5)));

        // Source: text lines from a socket.
        DataStreamSource<String> lines = env.socketTextStream("linux01", 7777);

        // Parse "province,city,amount" into a typed tuple.
        SingleOutputStreamOperator<Tuple3<String, String, Integer>> tpStream = lines.map(
                new MapFunction<String, Tuple3<String, String, Integer>>() {
                    @Override
                    public Tuple3<String, String, Integer> map(String line) throws Exception {
                        String[] parts = line.split(",");
                        return Tuple3.of(parts[0], parts[1], Integer.parseInt(parts[2]));
                    }
                });

        // Key by province; the per-city totals live in the MapState of that key.
        KeyedStream<Tuple3<String, String, Integer>, String> keyedStream =
                tpStream.keyBy(tp -> tp.f0);

        SingleOutputStreamOperator<Tuple3<String, String, Integer>> result =
                keyedStream.map(new CityAmountFunction());

        result.print();
        env.execute();
    }

    /** Keeps a city -> running total map for the current province key. */
    private static class CityAmountFunction
            extends RichMapFunction<Tuple3<String, String, Integer>, Tuple3<String, String, Integer>> {

        private MapState<String, Integer> mapState;

        /** Creates the state handle, or re-attaches to restored state. */
        @Override
        public void open(Configuration parameters) throws Exception {
            MapStateDescriptor<String, Integer> descriptor =
                    new MapStateDescriptor<>("cityAmount_state", Types.STRING, Types.INT);
            mapState = getRuntimeContext().getMapState(descriptor);
        }

        /** Adds the record's amount to the city's stored total and emits the new total. */
        @Override
        public Tuple3<String, String, Integer> map(Tuple3<String, String, Integer> input)
                throws Exception {
            // A null entry means this city has not been seen before under the current key.
            Integer previous = mapState.get(input.f1);
            int total = input.f2 + (previous == null ? 0 : previous);

            mapState.put(input.f1, total);
            input.f2 = total;
            return input;
        }
    }
}
ListState
/**
 * Tracks the five most recent events of every user using Flink's {@code ListState}.
 *
 * <p>Sample input (one record per line, {@code userId,event}):
 * <pre>
 * u001,view
 * u001,pay
 * u001,view
 * u001,view
 * u001,pay
 * u002,view
 * </pre>
 */
public class ListStateDemo {

    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();

        // Checkpoint every 10 000 ms.
        env.enableCheckpointing(10000);

        // Source: text lines from a socket.
        DataStreamSource<String> lines = env.socketTextStream("linux01", 7777);

        // Parse "userId,event"; the line is split once instead of once per field.
        SingleOutputStreamOperator<Tuple2<String, String>> tpStream = lines
                .map(line -> {
                    String[] fields = line.split(",");
                    return Tuple2.of(fields[0], fields[1]);
                })
                .returns(new TypeHint<Tuple2<String, String>>() {});

        // Key by user id so that each user has an independent event list.
        KeyedStream<Tuple2<String, String>, String> keyedStream = tpStream.keyBy(tp -> tp.f0);

        SingleOutputStreamOperator<Tuple2<String, List<String>>> result =
                keyedStream.map(new UserEventFunction());

        result.print();
        env.execute();
    }

    /** Appends each event to the user's list and keeps only the most recent ones. */
    private static class UserEventFunction
            extends RichMapFunction<Tuple2<String, String>, Tuple2<String, List<String>>> {

        /** Maximum number of events retained per user. */
        private static final int MAX_EVENTS = 5;

        private ListState<String> listState;

        /** Creates the state handle, or re-attaches to restored state. */
        @Override
        public void open(Configuration parameters) throws Exception {
            ListStateDescriptor<String> stateDescriptor =
                    new ListStateDescriptor<>("userEvent_state", Types.STRING);
            listState = getRuntimeContext().getListState(stateDescriptor);
        }

        /**
         * Records the incoming event and returns the user's most recent events,
         * oldest first.
         *
         * @param input (userId, event)
         * @return (userId, at most {@code MAX_EVENTS} latest events)
         */
        @Override
        public Tuple2<String, List<String>> map(Tuple2<String, String> input) throws Exception {
            listState.add(input.f1);

            // ListState.get() is only declared to return an Iterable, so copy it instead
            // of casting to ArrayList. The copy also keeps the emitted list independent
            // of the state's own storage.
            List<String> events = new ArrayList<>();
            for (String event : listState.get()) {
                events.add(event);
            }

            if (events.size() > MAX_EVENTS) {
                events = new ArrayList<>(events.subList(events.size() - MAX_EVENTS, events.size()));
                // Write the trimmed list back explicitly; mutating a list obtained from
                // get() is not a supported way to change the stored state.
                listState.update(events);
            }
            return Tuple2.of(input.f0, events);
        }
    }
}
简单案例--定义两个状态
/**
 * Counts, per activity and event type, the number of occurrences and the number of
 * distinct users, using two pieces of keyed state in one function.
 *
 * <p>Sample input (one record per line, {@code userId,activityId,event}; whitespace
 * around the fields is ignored):
 * <pre>
 * user01, activity01, view
 * user01,activity01,join
 * user01,activity02,view
 * user02,activity02,view
 * user02,activity02,view
 * user03,activity02,view
 * user02,activity02,join
 * user03,activity01,view
 * </pre>
 *
 * <p>Every occurrence increments the count; users are de-duplicated by user id.
 * Final values for the sample above ({@code activityId,event,count,distinctUsers}):
 * <pre>
 * activity01,view,2,2
 * activity01,join,1,1
 * activity02,view,4,3
 * activity02,join,1,1
 * </pre>
 */
public class ActivityCountDemo {

    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();

        // Checkpoint every 10 000 ms.
        env.enableCheckpointing(10000);

        // Source: text lines from a socket.
        DataStreamSource<String> lines = env.socketTextStream("linux01", 7777);

        // Parse "userId,activityId,event". Fields are trimmed because the documented
        // input may contain spaces after the commas; without trimming, " activity01"
        // and "activity01" would be treated as different keys.
        SingleOutputStreamOperator<Tuple3<String, String, String>> tpStream
                = lines.map(new MapFunction<String, Tuple3<String, String, String>>() {
                    @Override
                    public Tuple3<String, String, String> map(String line) throws Exception {
                        String[] fields = line.split(",");
                        String uid = fields[0].trim();
                        String aid = fields[1].trim();
                        String event = fields[2].trim();
                        return Tuple3.of(uid, aid, event);
                    }
                });

        // Key by (activityId, event).
        KeyedStream<Tuple3<String, String, String>, Tuple2<String, String>> keyedStream
                = tpStream.keyBy(new KeySelector<Tuple3<String, String, String>, Tuple2<String, String>>() {
                    @Override
                    public Tuple2<String, String> getKey(Tuple3<String, String, String> tp) throws Exception {
                        return Tuple2.of(tp.f1, tp.f2);
                    }
                });

        SingleOutputStreamOperator<Tuple4<String, String, Integer, Integer>> result =
                keyedStream.process(new ActivityCountFunction());

        result.print();
        env.execute();
    }

    /** Maintains an occurrence counter and a set of seen user ids for each key. */
    private static class ActivityCountFunction
            extends KeyedProcessFunction<Tuple2<String, String>, Tuple3<String, String, String>,
                    Tuple4<String, String, Integer, Integer>> {

        /** Number of records seen for the current key. */
        private ValueState<Integer> countState;
        /** User ids seen for the current key. */
        private ValueState<HashSet<String>> uidState;

        /** Creates both state handles, or re-attaches to restored state. */
        @Override
        public void open(Configuration parameters) throws Exception {
            ValueStateDescriptor<Integer> countStateDescriptor =
                    new ValueStateDescriptor<Integer>("count_state", Types.INT);
            ValueStateDescriptor<HashSet<String>> uidStateDescriptor =
                    new ValueStateDescriptor<>("uid_state",
                            TypeInformation.of(new TypeHint<HashSet<String>>() {}));

            countState = getRuntimeContext().getState(countStateDescriptor);
            uidState = getRuntimeContext().getState(uidStateDescriptor);
        }

        /**
         * Updates both states for the incoming record and emits
         * (activityId, event, count, distinctUsers).
         */
        @Override
        public void processElement(Tuple3<String, String, String> input, Context ctx,
                                   Collector<Tuple4<String, String, Integer, Integer>> out) throws Exception {
            // Occurrence count; null means this key has not been seen before.
            Integer count = countState.value();
            if (count == null) {
                count = 0;
            }
            count += 1;
            countState.update(count);

            // Distinct users.
            HashSet<String> uids = uidState.value();
            if (uids == null) {
                uids = new HashSet<>();
            }
            uids.add(input.f0);
            // Always write the set back: whether value() hands out the stored object
            // itself or a copy depends on the state backend, so in-place mutation
            // alone must not be relied on.
            uidState.update(uids);

            Tuple2<String, String> key = ctx.getCurrentKey();
            out.collect(Tuple4.of(key.f0, key.f1, count, uids.size()));
        }
    }
}