Flink开发笔记二
1. WindowFunction
2. Watermark
2.1. 分布式的Watermark
2.2. DataStream和其他流之间的转换
2.3. Window的抽象概念
2.4. Window Trigger
2.5. Window Evictor
2.6. 1.11版本Watermark写法
// Watermark strategy: tolerate events arriving up to 3 seconds out of order and
// take the event time from LoginEvent.timestamp (multiplied by 1000 here).
// The assigner is written as an explicit anonymous class rather than a lambda.
val loginWatermarks: WatermarkStrategy[LoginEvent] = WatermarkStrategy
  .forBoundedOutOfOrderness[LoginEvent](Duration.ofSeconds(3))
  .withTimestampAssigner(new SerializableTimestampAssigner[LoginEvent] {
    override def extractTimestamp(element: LoginEvent, recordTimestamp: Long): Long =
      element.timestamp * 1000L
  })

// Parse each comma-separated line into a LoginEvent, then attach timestamps and watermarks.
val dataStream: DataStream[LoginEvent] = inputStream
  .map { line =>
    val fields: Array[String] = line.split(",")
    LoginEvent(fields(0).toLong, fields(1), fields(2), fields(3).toLong)
  }
  .assignTimestampsAndWatermarks(loginWatermarks)
- 注意: scala2.11有点问题,需要配置
参考链接
3. State 状态管理
3.1. 算子状态(operator state)
3.2. 键控状态(keyed state)
3.3. 状态编程
// Read raw text lines from the socket and parse each one into a SensorReading.
val inputStream: DataStream[String] = env.socketTextStream("192.168.1.27", 9999)
val dataStream: DataStream[SensorReading] = inputStream.map { line =>
  val fields: Array[String] = line.split(",")
  SensorReading(fields(0), fields(1).toLong, fields(2).toDouble)
}

// Requirement: raise an alert when a sensor's temperature jumps by more than 10 degrees
// between two consecutive readings.
val warningStream: DataStream[(String, Double, Double)] = dataStream
  .keyBy(_.id)
  // Option 1: .flatMap(new TimeChangeAlert(10.0))
  // Option 2: stateful function (only usable after keyBy),
  //           fun: (T, Option[S]) => (TraversableOnce[R], Option[S])
  .flatMapWithState[(String, Double, Double), Double] {
    // No stored temperature yet: emit nothing and remember the current temperature.
    case (reading, None) =>
      (List.empty, Some(reading.temperature))
    // Compare against the stored temperature, then store the current one.
    case (reading, Some(previous)) =>
      val alerts =
        if (math.abs(reading.temperature - previous) > 10.0) {
          List((reading.id, previous, reading.temperature))
        } else {
          List.empty
        }
      (alerts, Some(reading.temperature))
  }

warningStream.print()
/**
 * Custom RichFlatMapFunction that emits (id, previous temperature, current temperature)
 * when the absolute difference between two consecutive readings exceeds `threshold`.
 *
 * @param threshold temperature difference above which an alert tuple is emitted
 */
class TimeChangeAlert(threshold: Double) extends RichFlatMapFunction[SensorReading, (String, Double, Double)] {

  // Temperature of the previous reading.
  lazy val lastTempState: ValueState[Double] =
    getRuntimeContext.getState(new ValueStateDescriptor[Double]("lastTemp", classOf[Double]))

  // Switch that is set to true once a reading has been processed; the comparison is
  // skipped while it still reads as false (an unset state is expected to read as false).
  lazy val firstState: ValueState[Boolean] =
    getRuntimeContext.getState(new ValueStateDescriptor[Boolean]("isFirst", classOf[Boolean]))

  override def flatMap(value: SensorReading, out: Collector[(String, Double, Double)]): Unit = {
    val previousTemp: Double = lastTempState.value()
    val seenBefore: Boolean = firstState.value()

    if (!seenBefore) {
      // Nothing to compare against yet: just flip the switch.
      firstState.update(true)
    } else if ((value.temperature - previousTemp).abs > threshold) {
      out.collect((value.id, previousTemp, value.temperature))
    }

    // Always remember the current temperature for the next comparison.
    lastTempState.update(value.temperature)
  }
}
4. ProcessFunction API(底层 API)
4.1. 如果传感器 10秒内连续上升,则需要报警
package com.xiaofan.apitest.window
import com.xiaofan.apitest.source.SensorReading
import org.apache.flink.api.common.state.{ValueState, ValueStateDescriptor}
import org.apache.flink.configuration.Configuration
import org.apache.flink.streaming.api.functions.KeyedProcessFunction
import org.apache.flink.streaming.api.scala._
import org.apache.flink.util.Collector
/**
 * Demo job: reads sensor lines from a socket and prints a warning whenever
 * TempIncrementWarning reports a sensor whose temperature kept rising for 10 seconds.
 */
object ProcessFunctionTest {
  def main(args: Array[String]): Unit = {
    val env: StreamExecutionEnvironment = StreamExecutionEnvironment.getExecutionEnvironment

    // Each input line is "id,timestamp,temperature".
    val readings: DataStream[SensorReading] = env
      .socketTextStream("192.168.1.27", 9999)
      .map { line =>
        val fields: Array[String] = line.split(",")
        SensorReading(fields(0), fields(1).toLong, fields(2).toDouble)
      }

    // Requirement: warn when a sensor's temperature rises continuously for 10 seconds.
    val warnings: DataStream[String] = readings
      .keyBy(_.id)
      .process(new TempIncrementWarning(10000L))

    warnings.print()
    env.execute("process function test")
  }
}
/**
 * Custom KeyedProcessFunction that emits a warning when a sensor's temperature keeps
 * rising for `interval` milliseconds of processing time.
 *
 * A processing-time timer is registered when a rising reading arrives and no timer is
 * pending. Any reading that is not higher than the previous one cancels the pending
 * timer. If the timer fires, the warning is emitted and a new timer may be registered
 * by the next rising reading.
 *
 * @param interval how long (in milliseconds) the temperature must keep rising
 */
class TempIncrementWarning(interval: Long) extends KeyedProcessFunction[String, SensorReading, String] {

  // Previous temperature of the current key, used for the rise/fall comparison.
  // NOTE(review): an unset state is expected to read as 0.0, so a first reading above 0
  // is treated as a rise.
  lazy val lastTempState: ValueState[Double] =
    getRuntimeContext.getState(new ValueStateDescriptor[Double]("last-temp", classOf[Double]))

  // Timestamp of the pending timer, kept so the timer can be deleted. It is the single
  // source of truth for "a timer is pending": an unset/cleared state is expected to read
  // as 0L, which is used as the "no timer" marker.
  lazy val timerTsState: ValueState[Long] =
    getRuntimeContext.getState(new ValueStateDescriptor[Long]("timer-ts", classOf[Long]))

  override def processElement(value: SensorReading, ctx: KeyedProcessFunction[String, SensorReading, String]#Context, out: Collector[String]): Unit = {
    val lastTemp: Double = lastTempState.value()
    val timerTs: Long = timerTsState.value()
    val timerPending: Boolean = timerTs != 0L

    if (value.temperature > lastTemp) {
      // Temperature is rising: start the countdown unless one is already running.
      if (!timerPending) {
        val ts: Long = ctx.timerService().currentProcessingTime() + interval
        ctx.timerService().registerProcessingTimeTimer(ts)
        timerTsState.update(ts)
      }
    } else if (timerPending) {
      // Temperature did not rise: cancel the running countdown.
      ctx.timerService().deleteProcessingTimeTimer(timerTs)
      timerTsState.clear()
    }

    // Remember the current temperature for the next comparison.
    lastTempState.update(value.temperature)
  }

  override def onTimer(timestamp: Long, ctx: KeyedProcessFunction[String, SensorReading, String]#OnTimerContext, out: Collector[String]): Unit = {
    out.collect(s"传感器 ${ctx.getCurrentKey}的温度连续${interval / 1000}秒连续上升")
    // The timer has fired, so nothing is pending any more; clearing the timestamp lets
    // the next rising reading register a fresh timer.
    timerTsState.clear()
  }
}
5. 侧输出流
package com.xiaofan.apitest.window
import com.xiaofan.apitest.source.SensorReading
import org.apache.flink.streaming.api.functions.ProcessFunction
import org.apache.flink.streaming.api.scala._
import org.apache.flink.util.Collector
/**
 * Demo job: splits sensor readings with SplitTempProcessor(30.0) and prints the main
 * output with the "high" prefix and the "low" side output with the "low" prefix.
 */
object SideOutputTest {
  def main(args: Array[String]): Unit = {
    val env: StreamExecutionEnvironment = StreamExecutionEnvironment.getExecutionEnvironment

    // Each input line is "id,timestamp,temperature".
    val readings: DataStream[SensorReading] = env
      .socketTextStream("192.168.1.27", 9999)
      .map { line =>
        val fields: Array[String] = line.split(",")
        SensorReading(fields(0), fields(1).toLong, fields(2).toDouble)
      }

    val highTempStream: DataStream[SensorReading] = readings.process(new SplitTempProcessor(30.0))
    // The side output is looked up with a tag that has the same id and element type
    // as the one the processor writes to.
    val lowTempStream: DataStream[(String, Long, Double)] =
      highTempStream.getSideOutput(new OutputTag[(String, Long, Double)]("low"))

    highTempStream.print("high")
    lowTempStream.print("low")

    env.execute("side output test")
  }
}
/**
 * Splits sensor readings by temperature: readings above `threshold` are forwarded to the
 * main output, all other readings are emitted as (id, timestamp, temperature) tuples to
 * the side output tagged "low".
 *
 * @param threshold temperature above which a reading stays on the main output
 */
class SplitTempProcessor(threshold: Double) extends ProcessFunction[SensorReading, SensorReading] {

  // Built once per function instance instead of once per record, so the tag and its
  // type information are not re-created for every low-temperature reading.
  private lazy val lowTempTag: OutputTag[(String, Long, Double)] =
    new OutputTag[(String, Long, Double)]("low")

  override def processElement(value: SensorReading, ctx: ProcessFunction[SensorReading, SensorReading]#Context, out: Collector[SensorReading]): Unit = {
    if (value.temperature > threshold) {
      // Above the threshold: keep the reading on the main output.
      out.collect(value)
    } else {
      // Not above the threshold: route it to the "low" side output.
      ctx.output(lowTempTag, (value.id, value.timestamp, value.temperature))
    }
  }
}
6. Flink的容错机制
6.1. 一致性检查点
6.2. 检查点的实现算法
6.3. 检查点和重启策略配置
// Checkpoint configuration
env.enableCheckpointing(1000)
val checkpointConfig = env.getCheckpointConfig
checkpointConfig.setCheckpointingMode(CheckpointingMode.EXACTLY_ONCE)
checkpointConfig.setCheckpointTimeout(60000L)
checkpointConfig.setMinPauseBetweenCheckpoints(500L)
checkpointConfig.setTolerableCheckpointFailureNumber(3)

// Restart strategy (the two calls show alternatives; a later call replaces an earlier one)
// 1. Fixed delay: at most 3 restart attempts, waiting 10 s between attempts
env.setRestartStrategy(RestartStrategies.fixedDelayRestart(3, 10000L))
// 2. Failure rate: at most 5 failures within a 5-minute interval, waiting 10 s between restarts
env.setRestartStrategy(
  RestartStrategies.failureRateRestart(5, Time.of(5, TimeUnit.MINUTES), Time.of(10, TimeUnit.SECONDS))
)
6.4. 保存点
6.5. 端到端一致性