Flink - 基于时间窗口的操作(Flink - 定时器原理)
文章目录
Timer(定时器)是Flink Streaming API提供的用于感知并利用处理时间/事件时间变化的机制
定时器的特性
Timers are registered on a KeyedStream
Timers are automatically deduplicated
Timers are checkpointed by Flink
Timers can be deleted
KeyedProcessFunction
Context*timerService() 需要一个 TimerService
KeyedProcessOperator*open
将获取到的 InternalTimerService(InternalTimerServiceImpl) 包装为 TimerService
InternalTimerService<VoidNamespace> internalTimerService =
getInternalTimerService("user-timers",
VoidNamespaceSerializer.INSTANCE,
this);
TimerService timerService = new SimpleTimerService(internalTimerService);
-----------------------------------------------------------------------------------------
看看如何获取 InternalTimerService(InternalTimerServiceImpl)
AbstractStreamOperator*getInternalTimerService() 方法获取 InternalTimerService
具体实现由 InternalTimeServiceManager 的实现类 InternalTimeServiceManagerImpl 实现
-----------------------------------------------------------------------------------------
InternalTimeServiceManager(InternalTimeServiceManagerImpl)
用于管理各个 InternalTimeService
InternalTimeServiceManagerImpl
//创建优先队列
private final PriorityQueueSetFactory priorityQueueSetFactory;
KeyGroupedInternalPriorityQueue<TimerHeapInternalTimer<K, N>> createTimerPriorityQueue() {
return priorityQueueSetFactory.create(name, timerSerializer);
}
----------------------------
//保存 InternalTimerServiceImpl
private final Map<String, InternalTimerServiceImpl<K, ?>> timerServices;
//创建 InternalTimerService 四要素 name key namespace triggerable
InternalTimeServiceManagerImpl*getInternalTimerService{
InternalTimerServiceImpl<K, N> timerService =
registerOrGetTimerService((name, timerSerializer) ->{
InternalTimerServiceImpl<K, N> timerService =
(InternalTimerServiceImpl<K, N>) timerServices.get(name);
if (timerService == null){
//创建 InternalTimerServiceImpl
timerService =
new InternalTimerServiceImpl<>(
localKeyGroupRange,
keyContext,
processingTimeService,
//创建 TimerHeapInternalTimer 类型的优先队列 具体实现 HeapPriorityQueueSet
// 用来存放 processing time 定时器 TimerHeapInternalTimer
//按时间戳排序的小顶堆
createTimerPriorityQueue(
"_timer_state/processing_" + name, timerSerializer),
//创建 event time 优先队列 --》 HeapPriorityQueueSet
createTimerPriorityQueue(
"_timer_state/event_" + name, timerSerializer));
//将其加入缓存中
timerServices.put(name, timerService);
}
return timerService;
});
}
//注册 Timer 调用的是 SystemProcessingTimeService registerTimer 方法
timerService.startTimerService(
(timerSerializer.getKeySerializer(),
timerSerializer.getNamespaceSerializer(),
triggerable) -> {
//InternalTimerServiceImpl*startTimerService
//注册 Timer
nextTimer = processingTimeService
.registerTimer(headTimer.getTimestamp(), this::onProcessingTime);
}
);
-----------------------------------------------------------------------------------------
SystemProcessingTimeService*registerTimer
//将计时器的触发延迟1毫秒,以便将语义与水印对齐
//Math.max(processingTimestamp - currentTimestamp, 0) + 1 定时器时间-当前时间+1
long delay =ProcessingTimeServiceUtil.getProcessingTimeDelay(
timestamp, getCurrentProcessingTime());
return timerService.schedule(
//回调其实就是 InternalTimerServiceImpl 的 onProcessingTime 方法
//(事件时间定时器则是在 watermark 推进时由 advanceWatermark 触发,不经过这里的调度)
wrapOnTimerCallback(callback, timestamp),
delay,
TimeUnit.MILLISECONDS);
-----------------------------------------------------------------------------------------
InternalTimerServiceImpl*onProcessingTime/advanceWatermark
//从队列中依次取出已到期的 Timer(TimerHeapInternalTimer)
while ((timer = processingTimeTimersQueue.peek()) != null &&
timer.getTimestamp() <= time) {
processingTimeTimersQueue.poll();
keyContext.setCurrentKey(timer.getKey());
//onProcessingTime()方法被触发回调时,
//就会按顺序从队列中获取到时间戳小于等于 time 的所有Timer,
//并挨个执行Triggerable.onProcessingTime()方法
//也就是KeyedProcessOperator*onProcessingTime()
//从而执行 用户自定义的onTimer()逻辑
triggerTarget.onProcessingTime(timer);
}
// while 循环结束后,timer 指向队列中尚未到期的堆顶定时器(队列为空时为 null)。
// 此时通过 processingTimeService 实现类 SystemProcessingTimeService 中的 registerTimer 方法,
// 将这个堆顶 timer 的触发时间注册到ScheduledThreadPoolExecutor线程池中,
// 实现再次延迟调用当前 InternalTimerServiceImpl#onProcessingTime,以此实现while逻辑的不断执行,即优先级队列的不断遍历
if (timer != null && nextTimer == null) {
nextTimer = processingTimeService.registerTimer(
timer.getTimestamp(), this::onProcessingTime);
}
-----------------------------------------------------------------------------------------
===============================================================
=================================================================
===========================
//注册TimeTimer 实际上是调用InternalTimerServiceImpl*registerProcessingTimeTimer/registerEventTimeTimer
SimpleTimerService*registerProcessingTimeTimer(long time) {
internalTimerService.registerProcessingTimeTimer(VoidNamespace.INSTANCE, time);
}
SimpleTimerService*registerEventTimeTimer(long time) {
internalTimerService.registerEventTimeTimer(VoidNamespace.INSTANCE, time);
}
//注册定时器 InternalTimerServiceImpl*registerProcessingTimeTimer/registerEventTimeTimer
registerProcessingTimeTimer{
// 定时器入队
// 如果if条件满足,则证明入队成功且入队的是小顶堆的堆顶元素,要针对小顶堆堆顶元素创建延迟调用
if (processingTimeTimersQueue.add(
new TimerHeapInternalTimer<>(
time,(K) keyContext.getCurrentKey(), namespace)))
{
long nextTriggerTime = oldHead != null ? oldHead.getTimestamp() : Long.MAX_VALUE;
// check if we need to re-schedule our timer to earlier
if (time < nextTriggerTime) {
if (nextTimer != null) {
nextTimer.cancel(false);
}
nextTimer = processingTimeService
.registerTimer(time, this::onProcessingTime);
}
}
}
HeapPriorityQueueSet*add
{
// 定时器入队
// 获取 keygroup 对应的hashmap,并往其中插入定时器,
// 如果key、namespace、time 均相同,则不让其入队 TimerHeapInternalTimer*equals 方法
// 这里一旦super.add(element)返回true,
// 则表明当前插入的是小顶堆的堆顶元素,需要针对堆顶元素建立延迟调度器
return getDedupMapForElement(element).putIfAbsent(element, element) == null
&& super.add(element);
}
HeapPriorityQueueSet*getDedupMapForElement
{
// keyedstream 在shuffle 时,是将一批key放入到一个key group中,
// 然后根据key group 进行 shuffle 的
// 该方法拿到当前key所在keygroup,一个keygroup维护了一个hashmap,获取该hashmap
int keyGroup =KeyGroupRangeAssignment.assignToKeyGroup(
keyExtractor.extractKeyFromElement(element),
totalNumberOfKeyGroups);
return getDedupMapForKeyGroup(keyGroup);
}
AbstractHeapPriorityQueue*add
{
// 定时器入队
addInternal(toAdd);
// 如果入队后的定时器是堆顶节点,则返回true,
// 后面的逻辑会根据这里是否返回true,
// 来判断是否需要建立ScheduledThreadPoolExecutor延迟调用
// 换言之,延迟调用只会根据堆顶节点来建立
return toAdd.getInternalIndex() == getHeadElementIndex();
}
HeapPriorityQueue*addInternal{
// 定时器入队
final int newSize = increaseSizeByOne();
// 先将定时器插入数组
moveElementToIdx(element, newSize);
// 然后对有序的队列进行siftup操作,以保持小顶堆的特性
// 这里有个有意思的现象:小顶堆按照定时器中的时间戳来比大小,
// 如果后注册的定时器时间都不小于先注册的定时器时间(时间单调递增的场景),
// 底层数组就是一个单调递增的序列,此时 siftup 操作不会做任何调整,天然能保持小顶堆的特性。
// 但定时器并不保证按时间戳递增的顺序注册(例如乱序数据或用户注册任意时间的定时器),这时 siftup 会把新元素上浮到正确的位置。
siftUp(newSize);
}
/**
 * Demo job that exercises event-time sliding windows (and therefore event-time timers).
 *
 * <p>Pipeline, in order:
 * <ol>
 *   <li>read text lines from the socket {@code hadoop102:9099};</li>
 *   <li>parse each line as {@code "<key> <seconds>"} into a {@code (key, timestampMillis)} tuple;</li>
 *   <li>assign timestamps from the tuple's second field, with zero allowed out-of-orderness;</li>
 *   <li>key by the tuple's first field;</li>
 *   <li>apply a sliding event-time window of 5 seconds that slides every 2 seconds;</li>
 *   <li>emit, per fired window, the key, window bounds, element count and current watermark;</li>
 *   <li>print the result.</li>
 * </ol>
 */
public class Win {

    /**
     * Builds and runs the streaming job.
     *
     * @param args unused
     * @throws Exception if job execution fails
     */
    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        // Single parallel instance so that the printed output is easy to follow.
        env.setParallelism(1);

        DataStreamSource<String> streamSource = env
                .socketTextStream("hadoop102", 9099);

        SingleOutputStreamOperator<Tuple2<String, Long>> flatMapStream = streamSource
                .flatMap(
                        new FlatMapFunction<String, Tuple2<String, Long>>() {
                            @Override
                            public void flatMap(String value, Collector<Tuple2<String, Long>> out) throws Exception {
                                // Input line format: "<key> <seconds>"; the second token is
                                // scaled by 1000 to obtain a millisecond timestamp.
                                String[] elems = value.split(" ");
                                out.collect(Tuple2.of(elems[0], Long.parseLong(elems[1]) * 1000));
                            }
                        }
                );

        SingleOutputStreamOperator<Tuple2<String, Long>> withWatermarkStream = flatMapStream
                .assignTimestampsAndWatermarks(
                        WatermarkStrategy
                                .<Tuple2<String, Long>>forBoundedOutOfOrderness(Duration.ofSeconds(0L))
                                .withTimestampAssigner(
                                        new SerializableTimestampAssigner<Tuple2<String, Long>>() {
                                            @Override
                                            public long extractTimestamp(Tuple2<String, Long> element, long recordTimestamp) {
                                                // The event timestamp is carried in the tuple itself.
                                                return element.f1;
                                            }
                                        }
                                )
                );

        KeyedStream<Tuple2<String, Long>, String> keyedStream = withWatermarkStream
                .keyBy(
                        new KeySelector<Tuple2<String, Long>, String>() {
                            @Override
                            public String getKey(Tuple2<String, Long> value) throws Exception {
                                return value.f0;
                            }
                        }
                );

        WindowedStream<Tuple2<String, Long>, String, TimeWindow> winStream = keyedStream
                .window(SlidingEventTimeWindows.of(Time.seconds(5L), Time.seconds(2L)));

        SingleOutputStreamOperator<String> countWinStream = winStream
                .process(
                        new ProcessWindowFunction<Tuple2<String, Long>, String, String, TimeWindow>() {
                            @Override
                            public void process(String key, Context context, Iterable<Tuple2<String, Long>> elements, Collector<String> out) throws Exception {
                                Timestamp winStart = new Timestamp(context.window().getStart());
                                Timestamp winEnd = new Timestamp(context.window().getEnd());

                                // Count by iterating. Spliterator#getExactSizeIfKnown() returns -1
                                // whenever the Iterable's spliterator is not SIZED (which is the
                                // case for the default Iterable#spliterator()), so it cannot be
                                // relied on to report the number of elements in the window.
                                long count = 0L;
                                for (Tuple2<String, Long> ignored : elements) {
                                    count++;
                                }

                                out.collect("key = " + key + ", win [ " + winStart + " - " + winEnd + " ), 窗口有 " + count + " 条元素,当前的 watermark = " + context.currentWatermark());
                            }
                        }
                );

        countWinStream
                .print();

        env.execute();
    }
}
定时器创建流程
注册定时器 org.apache.flink.streaming.api.windowing.triggers.EventTimeTrigger#onElement
org.apache.flink.streaming.runtime.operators.windowing.WindowOperator.Context#registerEventTimeTimer
org.apache.flink.streaming.api.operators.InternalTimerServiceImpl#registerEventTimeTimer
org.apache.flink.streaming.api.operators.TimerHeapInternalTimer
org.apache.flink.streaming.api.operators.InternalTimer
InternalTimerService 定时器服务的创建过程
在 org.apache.flink.streaming.runtime.operators.windowing.WindowOperator#open方法中调用 getInternalTimerService方法创建 InternalTimerService
org.apache.flink.streaming.api.operators.AbstractStreamOperator#getInternalTimerService
InternalTimerService 可以获取当前的处理时间和事件时间,并且可以设置定时器。一个算子可以拥有多个定时器服务,每个定时器服务都有自己的 namespace 序列化器,不同的定时器服务通过获取时传入的字符串 name 来区分(Timers are registered on a KeyedStream)
InternalTimerService 是由 InternalTimeServiceManager 管理并创建的。
通过 org.apache.flink.streaming.api.operators.InternalTimeServiceManagerImpl#getInternalTimerService 获取 InternalTimerService
org.apache.flink.streaming.api.operators.InternalTimeServiceManagerImpl#registerOrGetTimerService
通过 PriorityQueueSetFactory 工厂创建优先队列 org.apache.flink.streaming.api.operators.InternalTimeServiceManagerImpl#createTimerPriorityQueue
通过 HeapPriorityQueuesManager 管理器获取org.apache.flink.runtime.state.heap.HeapKeyedStateBackend#create
org.apache.flink.runtime.state.HeapPriorityQueuesManager#createOrUpdate
返回 org.apache.flink.streaming.api.operators.InternalTimeServiceManagerImpl#registerOrGetTimerService 方法中将获取到的 InternalTimerServiceImpl 放入 timerServices map 中
到此我们可以了解到
1、InternalTimerService 是由 InternalTimeServiceManager 管理并创建的,管理器的默认实现为 InternalTimeServiceManagerImpl,由它创建 InternalTimerServiceImpl
2、KeyGroupedInternalPriorityQueue 优先队列由 PriorityQueueSetFactory 工厂创建,由 HeapPriorityQueuesManager 管理器管理
3、默认创建的优先队列为 HeapPriorityQueueSet,并且其被保存在状态中
4、InternalTimerService 和 KeyGroupedInternalPriorityQueue 都分别由一个本地 map 管理
回到 org.apache.flink.streaming.api.operators.InternalTimeServiceManagerImpl#getInternalTimerService 方法中启动创建好的定时器服务
org.apache.flink.streaming.api.operators.InternalTimerServiceImpl#startTimerService
定时器在 WindowOperator 中的初始化
org.apache.flink.streaming.runtime.tasks.StreamTask#restoreGates
初始化定时器管理器 org.apache.flink.streaming.api.operators.InternalTimeServiceManagerImpl
org.apache.flink.streaming.runtime.tasks.RegularOperatorChain#initializeStateAndOpenOperators
org.apache.flink.streaming.runtime.tasks.OperatorChain#getAllOperators(boolean)
org.apache.flink.streaming.runtime.tasks.OperatorChain#getAllOperators(boolean)
遍历到 WindowOperator
org.apache.flink.streaming.api.operators.AbstractStreamOperator#initializeState(org.apache.flink.streaming.api.operators.StreamTaskStateInitializer)
org.apache.flink.streaming.api.operators.StreamTaskStateInitializerImpl#streamOperatorStateContext
初始化 Keyed State Backend
org.apache.flink.streaming.api.operators.StreamTaskStateInitializerImpl#keyedStatedBackend
org.apache.flink.runtime.state.StateBackend#createKeyedStateBackend(org.apache.flink.runtime.execution.Environment, org.apache.flink.api.common.JobID, java.lang.String, org.apache.flink.api.common.typeutils.TypeSerializer, int, org.apache.flink.runtime.state.KeyGroupRange, org.apache.flink.runtime.query.TaskKvStateRegistry, org.apache.flink.runtime.state.ttl.TtlTimeProvider, org.apache.flink.metrics.MetricGroup, java.util.Collection<org.apache.flink.runtime.state.KeyedStateHandle>, org.apache.flink.core.fs.CloseableRegistry, double)
–>
org.apache.flink.runtime.state.hashmap.HashMapStateBackend#createKeyedStateBackend
org.apache.flink.runtime.state.heap.HeapKeyedStateBackendBuilder#build
org.apache.flink.runtime.state.heap.HeapKeyedStateBackend#HeapKeyedStateBackend
回到org.apache.flink.streaming.api.operators.StreamTaskStateInitializerImpl#streamOperatorStateContext
org.apache.flink.streaming.api.operators.InternalTimeServiceManagerImpl#create
回到 org.apache.flink.streaming.runtime.tasks.RegularOperatorChain#initializeStateAndOpenOperators 执行 windowOperator 的 open 方法
回到 【InternalTimerService 定时器服务的创建过程】
定时器添加到队列
注册定时器 org.apache.flink.streaming.runtime.operators.windowing.WindowOperator.Context#registerEventTimeTimer
org.apache.flink.streaming.api.operators.InternalTimerServiceImpl#registerEventTimeTimer
org.apache.flink.streaming.api.operators.TimerHeapInternalTimer#TimerHeapInternalTimer
将创建好的 timer 放入优先队列中 org.apache.flink.runtime.state.heap.HeapPriorityQueueSet#add
通过 org.apache.flink.streaming.api.operators.TimerHeapInternalTimer#equals 方法比较是否是同一个 timer
org.apache.flink.runtime.state.heap.AbstractHeapPriorityQueue#add
org.apache.flink.runtime.state.heap.HeapPriorityQueue#addInternal
定时器触发过程
处理时间触发 org.apache.flink.streaming.api.operators.InternalTimerServiceImpl#onProcessingTime
事件时间触发 org.apache.flink.streaming.api.operators.InternalTimerServiceImpl#advanceWatermark
触发定时器 org.apache.flink.streaming.runtime.operators.windowing.WindowOperator#onEventTime
向下游发送新的 watermark
org.apache.flink.streaming.api.operators.AbstractStreamOperator#processWatermark(org.apache.flink.streaming.api.watermark.Watermark)