5. 状态：AggregatingState<IN, OUT> 与 ReducingState<T> 的区别 - CSDN博客

本文链接：https://blog.csdn.net/stable_zl/article/details/115610803

一、状态是什么

由一个任务维护，并且用来计算某个结果的所有数据，都属于这个任务的状态
可以认为状态就是一个本地变量，可以被任务的业务逻辑访问
Flink 会进行状态管理，包括状态一致性、故障处理以及高效存储和访问，以便开发人员可以专注于应用程序的逻辑

二、状态的两种类型

**KeyedState：**根据数据流中定义的 key 来维护和访问，有如下的数据结构：
- ValueState<T>: 保存一个可以更新和检索的值（如上所述，每个值都对应到当前的输入数据的 key，因此算子接收到的每个 key 都可能对应一个值）。这个值可以通过 update(T) 进行更新，通过 T value() 进行检索。
- ListState<T>: 保存一个元素的列表。可以往这个列表中追加数据，并在当前的列表上进行检索。可以通过 add(T) 或者 addAll(List<T>) 进行添加元素，通过 Iterable<T> get() 获得整个列表。还可以通过 update(List<T>) 覆盖当前的列表。
- ReducingState<T>: 保存一个单值，表示添加到状态的所有值的聚合。接口与 ListState 类似，但使用 add(T) 增加元素，会使用提供的 ReduceFunction 进行聚合。
- AggregatingState<IN, OUT>: 保留一个单值，表示添加到状态的所有值的聚合。与 ReducingState 不同的是，聚合结果的类型可以与添加到状态的元素类型不同。接口与 ListState 类似，但使用 add(IN) 添加的元素会用指定的 AggregateFunction 进行聚合。
- MapState<UK, UV>: 维护了一个映射列表。你可以添加键值对到状态中，也可以获得反映当前所有映射的迭代器。使用 put(UK, UV) 或者 putAll(Map<UK, UV>) 添加映射。使用 get(UK) 检索特定 key。使用 entries()、keys() 和 values() 分别检索映射、键和值的可迭代视图。你还可以通过 isEmpty() 来判断是否包含任何键值对。
**OperatorState（算子状态）：**作用范围限定为算子任务，即同一个并行子任务处理的所有数据共享同一份状态。最常用的数据结构是 ListState（此外还有 UnionListState 和 BroadcastState）。

三、使用

KeyedState

import org.apache.flink.api.common.functions.RichFlatMapFunction
import org.apache.flink.api.common.state.{ListState, ListStateDescriptor, MapState, MapStateDescriptor, ReducingState, ReducingStateDescriptor, ValueState, ValueStateDescriptor}
import org.apache.flink.api.scala.{createTypeInformation, getCallLocationName}
import org.apache.flink.configuration.Configuration
import org.apache.flink.streaming.api.scala.StreamExecutionEnvironment
import org.apache.flink.util.Collector

/**
 * Demo entry point for keyed state: feeds six records that all share key 1L
 * through [[CountWindowAverage]] and prints whatever it emits.
 */
object KeyedStateTest extends App {
  val env = StreamExecutionEnvironment.getExecutionEnvironment

  // Every record uses the same key (1L), so all of them share one keyed-state entry.
  private val samples: List[(Long, Long)] =
    List(3L, 5L, 7L, 4L, 2L, 4L).map(v => (1L, v))

  val source = env
    .fromCollection(samples)
    .keyBy(record => record._1)
    .flatMap(new CountWindowAverage())
    .print()
  // CountWindowAverage emits one average per pair of inputs, so the emitted
  // records are (1,4), (1,5), (1,3): (3+5)/2, (7+4)/2 and (2+4)/2 with Long division.

  env.execute("ExampleKeyedState")
}

/**
 * Rich flat-map function that keeps a per-key `(count, sum)` pair in keyed
 * `ValueState` and, once two elements have been seen for a key, emits
 * `(key, sum / count)` (Long division) and resets the state.
 */
class CountWindowAverage extends RichFlatMapFunction[(Long, Long), (Long, Long)] {
  // Keyed state holding (number of elements seen, running sum); assigned in open().
  private var sum: ValueState[(Long, Long)] = _

  /** Obtains the state handle from the runtime context. */
  override def open(parameters: Configuration): Unit = {
    val descriptor =
      new ValueStateDescriptor[(Long, Long)]("average", classOf[(Long, Long)])
    sum = getRuntimeContext.getState(descriptor)
  }

  /**
   * Folds `value._2` into the running `(count, sum)` for the current key and
   * emits the average when the count reaches 2.
   *
   * @param value input record; `_1` is the key, `_2` the number to average
   * @param out   collector receiving `(key, average)`
   */
  override def flatMap(value: (Long, Long), out: Collector[(Long, Long)]): Unit = {
    // value() yields null when nothing is stored for the current key; start from zero then.
    val (seenSoFar, totalSoFar) = Option(sum.value()).getOrElse((0L, 0L))

    val count = seenSoFar + 1
    val total = totalSoFar + value._2
    sum.update((count, total))

    if (count >= 2) {
      out.collect((value._1, total / count))
      // Start a fresh pair for this key.
      sum.clear()
    }
  }
}

OperatorState（算子状态）

import org.apache.flink.api.common.functions.RichMapFunction
import org.apache.flink.api.common.state.{ListState, ListStateDescriptor, ValueStateDescriptor}
import org.apache.flink.api.common.typeinfo.{TypeHint, TypeInformation}
import org.apache.flink.api.scala.createTypeInformation
import org.apache.flink.runtime.state.{FunctionInitializationContext, FunctionSnapshotContext}
import org.apache.flink.streaming.api.checkpoint.CheckpointedFunction
import org.apache.flink.streaming.api.scala.StreamExecutionEnvironment

import scala.collection.mutable.ListBuffer

/**
 * Demo entry point for operator state: reads text lines from a socket on
 * localhost:9999, runs them through [[MyMapState]] with a threshold of 2 and
 * prints the results.
 */
object OperateStateTest {
  def main(args: Array[String]): Unit = {
    val env = StreamExecutionEnvironment.getExecutionEnvironment
    env.setParallelism(1)

    env
      .socketTextStream("localhost", 9999)
      .map(new MyMapState(2))
      .print()

    env.execute()
  }

}

/**
 * Map function that buffers incoming strings and, once at least `threshold`
 * of them are buffered, returns their concatenation and empties the buffer;
 * otherwise it returns the placeholder `"not reach threshold"`.
 *
 * The buffer is persisted as operator (non-keyed) list state: to use operator
 * state a function implements `CheckpointedFunction`, writing the buffer out
 * in `snapshotState` and reading it back in `initializeState`.
 *
 * @param threshold number of buffered elements that triggers a flush
 */
class MyMapState(threshold: Int = 1) extends RichMapFunction[String, String] with CheckpointedFunction {

  // Handle to the operator list state. @transient keeps it OUT of the serialized
  // function object; it is (re)assigned in initializeState on every (re)start.
  @transient
  private var checkpointedState: ListState[String] = _

  // In-memory working buffer; copied into checkpointedState on each snapshot.
  private val bufferedElements = ListBuffer[String]()

  /** Buffers `value`; returns the concatenated buffer once the threshold is reached. */
  override def map(value: String): String = {
    bufferedElements += value

    // `>=` rather than `==`: after a restore the buffer may already hold more than
    // `threshold` elements, and an equality check would then never fire again.
    if (bufferedElements.size >= threshold) {
      val ret = bufferedElements.mkString
      bufferedElements.clear()
      ret
    } else {
      "not reach threshold"
    }
  }

  /** Registers the operator list state and, on recovery, reloads the buffer from it. */
  override def initializeState(context: FunctionInitializationContext): Unit = {
    val descriptor = new ListStateDescriptor[String](
      "buffered-elements",
      TypeInformation.of(new TypeHint[String]() {})
    )

    checkpointedState = context.getOperatorStateStore.getListState(descriptor)

    // Obtaining the state through the runtime context instead would create keyed state:
    // checkpointedState = getRuntimeContext.getListState(new ListStateDescriptor[String]("listState", classOf[String]))

    // Without this step, elements buffered at checkpoint time are lost after a failure.
    if (context.isRestored) {
      val restored = checkpointedState.get().iterator()
      while (restored.hasNext) {
        bufferedElements += restored.next()
      }
    }
  }

  /** Replaces the checkpointed list with the current contents of the buffer. */
  override def snapshotState(context: FunctionSnapshotContext): Unit = {
    checkpointedState.clear()
    for (elem <- bufferedElements) {
      checkpointedState.add(elem)
    }
  }
}