Flink Checkpoint

深入理解 Flink 容错机制 https://blog.csdn.net/yuchuanchen/article/details/102821659
Flink 如何保存状态数据 https://blog.csdn.net/yuchuanchen/article/details/102941569
flink checkpoint 流程分析 https://blog.csdn.net/yuchuanchen/article/details/106123588
flink checkpoint 同步异步过程 https://www.jianshu.com/p/9c587bd491fc
flink checkpoint 存储策略源码分析(同步异步源码) https://blog.csdn.net/yuchuanchen/article/details/106668994
flink state restore 流程源码分析 https://blog.csdn.net/yuchuanchen/article/details/107006015

ck同步异步阶段核心代码

SubtaskCheckpointCoordinatorImpl.checkpointState
/*************************************************
* TODO_MA  ->
*  注释: 拍摄快照
*   将每个 operatorId 对应的 OperatorSnapshotFutures 创建完成
*
*  同步阶段
*  以增量 RocksIncrementalSnapshotStrategy 为例
*    a.对state做深拷贝。
*         takeDBNativeCheckpoint(snapshotDirectory);
*    b.将写操作封装在异步的FutureTask中
*         FutureTask的作用包括:1)打开输入流2)写入状态的元数据信息3)写入状态4)关闭输入流
*         new RocksDBIncrementalSnapshotOperation()
*         RunnableFuture<SnapshotResult<KeyedStateHandle>> snapshotOperation.toAsyncSnapshotFutureTask(cancelStreamRegistry)
*         包装成 Future 返回
*
*/
if(takeSnapshotSync(snapshotFutures, metadata, metrics, options, operatorChain, isCanceled)) {

	/*************************************************
	 * TODO_MA  ->
	 *  注释:如果 同步 Checkpoint 执行成功,AsyncCheckpointRunnable 最后会调用 TaskStateManagerImpl
	 *  的 reportTaskStateSnapshots 方法向 JobManager 发送 AcknowledgeCheckpoint 消息。
	 *
	 * 异步阶段
	 *    a.执行同步阶段创建的FutureTask
	 *       OperatorSnapshotFinalizer finalizedSnapshots = new OperatorSnapshotFinalizer(snapshotInProgress);
	 *         SnapshotResult<KeyedStateHandle> keyedManaged = FutureUtils.runIfNotDoneAndGet(snapshotFutures.getKeyedStateManagedFuture());
	 *           FutureTask.run() ->  call()  ->  AsyncSnapshotCallable.call() ->  RocksDBIncrementalSnapshotOperation.callInternal();
	 *    b.向Checkpoint Coordinator发送ACK响应
	 *        reportCompletedSnapshotStates
	 *
	 */
	finishAndReportAsync(snapshotFutures, metadata, metrics, options);
}

在 ExecutionGraphBuilder 构建 ExecutionGraph 的时候,会生成
CheckpointCoordinatorConfiguration 对象,用来保存 JobGraph 中的 snapshotSettings 参数,最终
交给 ExecutionGraph.enableCheckpointing() 方法,该方法主要完成以下几件事:

1、解析 ExecutionGraph 中的各种 ExecutionVertex,设置到 tasksToTrigger,
tasksToWaitFor,tasksToCommitTo 数组中
2、注册了 CheckpointStatsTracker 组件,用来汇总 Checkpoint 的统计信息。
3、创建 CheckpointFailureManager,管理 checkpoint 失败后的策略
4、创建定时器 checkpointCoordinatorTimer(ScheduledExecutorService),用于定时触发
checkpoint
5、创建 CheckpointCoordinator,并注册 CheckpointCoordinatorDeActivator

JobMaster构造方法中
JobMaster.createScheduler() -> DefaultSchedulerFactory.createInstance() -> new DefaultScheduler() 
-> super -> SchedulerBase.createAndRestoreExecutionGraph() -> SchedulerBase.createExecutionGraph() 
-> ExecutionGraphBuilder.buildGraph() -> ExecutionGraph.enableCheckpointing()
	/**
	 * Configures checkpointing for this ExecutionGraph by building a {@code CheckpointCoordinator}
	 * and wiring up its collaborators.
	 *
	 * <p>In order, this method: checks that the job is still in {@code CREATED} state and that no
	 * coordinator exists yet; flattens the three job-vertex lists into {@code ExecutionVertex} arrays;
	 * builds the operator-coordinator checkpoint contexts; stores the stats tracker; creates the
	 * failure manager and the single-threaded "Checkpoint Timer" executor; creates the coordinator;
	 * registers the master hooks and the stats tracker on it; and, unless the configured interval is
	 * {@code Long.MAX_VALUE}, registers the coordinator's activator/deactivator as a job status listener.
	 *
	 * @param chkConfig              checkpoint settings; read here for the tolerable failure number and the interval
	 * @param verticesToTrigger      job vertices whose execution vertices are passed as the tasks to trigger
	 * @param verticesToWaitFor      job vertices whose execution vertices are passed as the tasks to wait for
	 * @param verticesToCommitTo     job vertices whose execution vertices are passed as the tasks to commit to
	 * @param masterHooks            hooks to register on the coordinator; a rejected hook only produces a warning
	 * @param checkpointIDCounter    handed to the coordinator unchanged
	 * @param checkpointStore        handed to the coordinator unchanged
	 * @param checkpointStateBackend handed to the coordinator; its simple class name is stored as the backend name
	 * @param statsTracker           must not be null; stored on this graph and set on the coordinator
	 */
	public void enableCheckpointing(CheckpointCoordinatorConfiguration chkConfig, List<ExecutionJobVertex> verticesToTrigger,
									List<ExecutionJobVertex> verticesToWaitFor, List<ExecutionJobVertex> verticesToCommitTo, List<MasterTriggerRestoreHook<?>> masterHooks,
									CheckpointIDCounter checkpointIDCounter, CompletedCheckpointStore checkpointStore, StateBackend checkpointStateBackend,
									CheckpointStatsTracker statsTracker) {

		// Preconditions: the job must not have started and checkpointing must not be enabled twice.
		checkState(state == JobStatus.CREATED, "Job must be in CREATED state");
		checkState(checkpointCoordinator == null, "checkpointing already enabled");

		// The execution vertices that take part in checkpointing, one array per role.
		// NOTE(review): the original notes map these to triggerVertices / ackVertices / commitVertices
		// in the JobGraph snapshot settings -- not visible in this method, confirm against the caller.
		final ExecutionVertex[] triggerTasks = collectExecutionVertices(verticesToTrigger);
		final ExecutionVertex[] ackTasks = collectExecutionVertices(verticesToWaitFor);
		final ExecutionVertex[] commitTasks = collectExecutionVertices(verticesToCommitTo);

		// Checkpoint contexts for the operator coordinators.
		final Collection<OperatorCoordinatorCheckpointContext> coordinatorContexts = buildOpCoordinatorCheckpointContexts();

		checkpointStatsTracker = checkNotNull(statsTracker, "CheckpointStatsTracker");

		// Failure manager: constructed with the tolerable-failure number from the config and a callback
		// that fails the job. Both callbacks hop onto the JobMaster main-thread executor before failing.
		// NOTE(review): the original notes say the tolerable number defaults to -1 when unset and that
		// exceeding it triggers job-level failure handling -- that logic lives in CheckpointFailureManager.
		final CheckpointFailureManager cpFailureManager = new CheckpointFailureManager(
			chkConfig.getTolerableCheckpointFailureNumber(),
			new CheckpointFailureManager.FailJobCallback() {
				@Override
				public void failJob(Throwable cause) {
					getJobMasterMainThreadExecutor().execute(() -> failGlobal(cause));
				}

				@Override
				public void failJobDueToTaskFailure(Throwable cause, ExecutionAttemptID failingTask) {
					getJobMasterMainThreadExecutor().execute(() -> failGlobalIfExecutionIsStillRunning(cause, failingTask));
				}
			});

		checkState(checkpointCoordinatorTimer == null);

		// Single-threaded scheduled executor named "Checkpoint Timer"; it is handed to the coordinator
		// below (wrapped in an adapter) as its timer.
		final ThreadGroup timerThreadGroup = Thread.currentThread().getThreadGroup();
		checkpointCoordinatorTimer = Executors.newSingleThreadScheduledExecutor(
			new DispatcherThreadFactory(timerThreadGroup, "Checkpoint Timer"));

		// create the coordinator that triggers and commits checkpoints and holds the state
		checkpointCoordinator = new CheckpointCoordinator(
			jobInformation.getJobId(),
			chkConfig,
			triggerTasks,
			ackTasks,
			commitTasks,
			coordinatorContexts,
			checkpointIDCounter,
			checkpointStore,
			checkpointStateBackend,
			ioExecutor,
			new ScheduledExecutorServiceAdapter(checkpointCoordinatorTimer),
			SharedStateRegistry.DEFAULT_FACTORY,
			cpFailureManager);

		// register the master hooks on the checkpoint coordinator; a hook the coordinator refuses
		// is reported with a warning and otherwise ignored
		for(MasterTriggerRestoreHook<?> masterHook : masterHooks) {
			final boolean registered = checkpointCoordinator.addMasterHook(masterHook);
			if(!registered) {
				LOG.warn("Trying to register multiple checkpoint hooks with the name: {}", masterHook.getIdentifier());
			}
		}

		// The coordinator reports into the same stats tracker that this graph keeps.
		checkpointCoordinator.setCheckpointStatsTracker(checkpointStatsTracker);

		// interval of max long value indicates disable periodic checkpoint,
		// the CheckpointActivatorDeactivator should be created only if the interval is not max value
		final boolean periodicCheckpointing = chkConfig.getCheckpointInterval() != Long.MAX_VALUE;
		if(periodicCheckpointing) {
			// the periodic checkpoint scheduler is activated and deactivated as a result of
			// job status changes (running -> on, all other states -> off)
			registerJobStatusListener(checkpointCoordinator.createActivatorDeactivator());
		}

		this.stateBackendName = checkpointStateBackend.getClass().getSimpleName();
	}

CheckpointCoordinator 的触发机制,核心入口是:registerJobStatusListener(checkpointCoordinator.createActivatorDeactivator());
checkpointCoordinator.createActivatorDeactivator() 方法返回的是一个 JobStatusListener,具体实
现是:CheckpointCoordinatorDeActivator,它的作用是:当监听到 Job 的状态为
JobStatus.RUNNING 的时候,就开始执行 CheckpointCoordinatorDeActivator.jobStatusChanges()
的回调处理。而具体的间隔时间,一般都由用户自己设置。

CheckpointCoordinator checkpoint 执行源码详解

CheckpointCoordinator.startCheckpointScheduler();
CheckpointCoordinator.scheduleTriggerWithDelay(getRandomInitDelay());
// 定时调度任务
timer.scheduleAtFixedRate(new ScheduledTrigger(), initDelay,baseInterval, TimeUnit.MILLISECONDS);
ScheduledTrigger.run();
// 任务定时,其实就是定时调用 CheckpointCoordinator 的 triggerCheckpoint() 方法触发 checkpoint
CheckpointCoordinator.triggerCheckpoint(true);
CheckpointCoordinator.triggerCheckpoint(checkpointProperties, null, isPeriodic,false);
CheckpointCoordinator.startTriggeringCheckpoint(CheckpointTriggerRequest){
	CheckpointCoordinator.initializeCheckpoint(...);
	CheckpointCoordinator.createPendingCheckpoint(....);
	CheckpointCoordinator.snapshotTaskState();
}
-> snapshotTaskState -> Execution.triggerCheckpoint(checkpointID,timestamp, checkpointOptions);
  -> Execution.triggerCheckpointHelper(checkpointId, timestamp, checkpointOptions,false);
    -> TaskManagerGateway taskManagerGateway = slot.getTaskManagerGateway();
    -> taskManagerGateway.triggerCheckpoint(.....);
// 发送 Checkpoint RPC 给对应的 Source
-> taskExecutorGateway.triggerCheckpoint(....);

到此为止,JobMaster 终于把 Checkpoint 请求发送给了执行 Source ExecutionVertex 的 TaskManager 节点。
在这个过程中,可能会取消 checkpoint:
1、coordinator 处于 shutdown 状态
2、周期性 checkpoint 调度被取消 (periodicScheduling=false),一般 periodicScheduling
= false 时,是因为用户手动触发了 savepoint
3、当前有排队的 checkpoint 请求
4、当前 pendingCheckpoints 数量达到设定的上限
5、与上一次 checkpoint 间隔小于设定的最小值,如果间隔太小,会取消并重新设定调度器
6、job 的所有 Source ExecutionVertex 没有全部处于 RUNNING 状态

当 TaskManager 接收到 checkpoint 请求的时候,TaskManager 端的 checkpoint 分为两种情况:
1、SourceStreamTask
2、其他 StreamTask
SourceStreamTask 所在的 TaskExecutor 收到 trigger checkpoint 消息,继续进行 checkpoint,核心入口是:
TaskExecutor.triggerCheckpoint(
 -> task.triggerCheckpointBarrier -> invokable.triggerCheckpointAsync -> super.triggerCheckpointAsync
  -> StreamTask.triggerCheckpointAsync
    内部通过 MailBox 模型来调度执行
	/**
	 * Schedules a checkpoint trigger on the task's mailbox and returns a future for its outcome.
	 *
	 * <p>The actual work is not done on the calling thread: it is wrapped in a mail and handed to
	 * {@code mainMailboxExecutor}. When that mail runs, it records how far behind the checkpoint
	 * timestamp the task currently is and then calls {@code triggerCheckpoint}.
	 *
	 * @param checkpointMetaData      metadata of the checkpoint; its timestamp is used to measure the start delay
	 * @param checkpointOptions       options forwarded to {@code triggerCheckpoint}
	 * @param advanceToEndOfEventTime flag forwarded to {@code triggerCheckpoint}
	 * @return a future completed with the boolean returned by {@code triggerCheckpoint}, or completed
	 *         exceptionally with the exception it threw
	 */
	public Future<Boolean> triggerCheckpointAsync(CheckpointMetaData checkpointMetaData, CheckpointOptions checkpointOptions,
												  boolean advanceToEndOfEventTime) {

		final CompletableFuture<Boolean> outcome = new CompletableFuture<>();

		// The checkpoint is packaged as a mail; the trailing arguments are the mail's description format and its values.
		mainMailboxExecutor.execute(() -> {
			// Delay between the checkpoint's timestamp and now, clamped at zero and converted from millis to nanos.
			final long startDelayMillis = Math.max(0, System.currentTimeMillis() - checkpointMetaData.getTimestamp());
			latestAsyncCheckpointStartDelayNanos = 1_000_000 * startDelayMillis;

			final boolean triggered;
			try {
				// Run the checkpoint itself.
				triggered = triggerCheckpoint(checkpointMetaData, checkpointOptions, advanceToEndOfEventTime);
			} catch(Exception ex) {
				// Report the failure both via the Future result but also to the mailbox
				outcome.completeExceptionally(ex);
				throw ex;
			}
			outcome.complete(triggered);
		}, "checkpoint %s with %s", checkpointMetaData, checkpointOptions);

		return outcome;
	}
	-> triggerCheckpoint
		private boolean triggerCheckpoint(CheckpointMetaData checkpointMetaData, CheckpointOptions checkpointOptions,
									  boolean advanceToEndOfEventTime) throws Exception {
		try {
			// TODO_MA 注释: 如果我们注入检查点,则无法对齐
			// No alignment if we inject a checkpoint
			CheckpointMetrics checkpointMetrics = new CheckpointMetrics().setAlignmentDurationNanos(0L);

			/*************************************************
			 * TODO
			 * 1、执行 SubtaskCheckpointCoordinatorImpl 的初始化
			 *  注释: 执行 Checkpoint 的初始化
			 */
			subtaskCheckpointCoordinator.initCheckpoint(checkpointMetaData.getCheckpointId(), checkpointOptions);

			/*************************************************
			 * TODO
			 *  注释:2、 执行 Checkpoint 的执行, 主要做两件事情:
			 *  1、创建Checkpoint Barrier并向下游节点广播
			 *  2、触发本节点的快照操作
			 */
			boolean success = performCheckpoint(checkpointMetaData, checkpointOptions, checkpointMetrics, advanceToEndOfEventTime);

			// TODO_MA 注释: 如果未成功,则取消本次 checkpoint
			// 3、通过上述的返回值来判断是否要取消 checkpoint
			if(!success) {
				declineCheckpoint(checkpointMetaData.getCheckpointId());
			}
			return success;
		} 
 -> performCheckpoint -> subtaskCheckpointCoordinator.checkpointState(){

		/*************************************************
		 * TODO
		 *  注释: 第一步
		 *  向下游发送 Barrier 前,给当前 Task 的每个 operator 进行逻辑处理的机会。
		 *  这里会调用当前 Task 中所有 operator 的 prepareSnapshotPreBarrier() 方法
		 */
		// Step (1): Prepare the checkpoint, allow operators to do some pre-barrier work.
		//           The pre-barrier work should be nothing or minimal in the common case.
		operatorChain.prepareSnapshotPreBarrier(metadata.getCheckpointId());

		/*************************************************
		 * TODO
		 *  注释: 第二步
		 *  生成 Barrier 并向下游广播 checkpoint Barrier 消息,下游 Task 收到该消息后就开始进行自己的 checkpoint 流程
		 */
		// Step (2): Send the checkpoint barrier downstream
		operatorChain.broadcastEvent(new CheckpointBarrier(metadata.getCheckpointId(), metadata.getTimestamp(), options),
			options.isUnalignedCheckpoint());

		/*************************************************
		 * TODO
		 *  注释: 第三步
		 *  如果是非对齐 checkpoint
		 *  准备溢写 in-flight buffers 为了 input 和 output
		 */
		// Step (3): Prepare to spill the in-flight buffers for input and output
		if(options.isUnalignedCheckpoint()) {
			prepareInflightDataSnapshot(metadata.getCheckpointId());
		}

		/*************************************************
		 * TODO
		 *  注释: 第四步
		 */
		// Step (4): Take the state snapshot. This should be largely asynchronous, to not impact progress of the streaming topology
		Map<OperatorID, OperatorSnapshotFutures> snapshotFutures = new HashMap<>(operatorChain.getNumberOfOperators());
		try {
			/*************************************************
			 * TODO
			 *  注释: 拍摄快照
			 */
			if(takeSnapshotSync(snapshotFutures, metadata, metrics, options, operatorChain, isCanceled)) {

				/*************************************************
				 * TODO
				 *  注释:如果 Checkpoint 执行成功,AsyncCheckpointRunnable 最后会调用 TaskStateManagerImpl
				 *  的 reportTaskStateSnapshots 方法向 JobManager 发送 AcknowledgeCheckpoint 消息。
				 */
				finishAndReportAsync(snapshotFutures, metadata, metrics, options);
			} else {
				cleanup(snapshotFutures, metadata, metrics, new Exception("Checkpoint declined"));
			}
}
 
 takeSnapshotSync-> buildOperatorSnapshotFutures -> checkpointStreamOperator 
  -> op.snapshotState -> stateHandler.snapshotState -> snapshotState
  // 1、对 StreamOperator 完成 snapshot
streamOperator.snapshotState(snapshotContext);
// 2、针对 Operator 类型的状态执行 snapshot
operatorStateBackend.snapshot(checkpointId, timestamp, factory,
checkpointOptions)
// 3、针对 KeyedState 类型的状态执行 snapshot
keyedStateBackend.snapshot(checkpointId, timestamp, factory, checkpointOptions)

Checkpoint CheckpointCoordinator 端反馈处理

当上述第四步完成的时候,第五步就可以向 JobMaster 进行 checkpoint 状态汇报了。当
TaskExecutor 执行完 checkpoint 之后,会发送反馈消息,由 CheckpointCoordinator 进行处理。
核心入口是:JobMaster.acknowledgeCheckpoint() 方法

 StreamTask.triggerCheckpoint -> performCheckpoint
  -> SubtaskCheckpointCoordinatorImpl.checkpointState
            /*************************************************
			 * TODO ->
			 *  注释: 拍摄快照
			 */
			if(takeSnapshotSync(snapshotFutures, metadata, metrics, options, operatorChain, isCanceled)) {

				/*************************************************
				 * TODO
				 *  注释:如果 Checkpoint 执行成功,AsyncCheckpointRunnable 最后会调用 TaskStateManagerImpl
				 *  的 reportTaskStateSnapshots 方法向 JobManager 发送 AcknowledgeCheckpoint 消息。
				 */
				finishAndReportAsync(snapshotFutures, metadata, metrics, options);
			} 
-> finishAndReportAsync -> AsyncCheckpointRunnable.run 
 -> reportCompletedSnapshotStates -> reportTaskStateSnapshots -> checkpointResponder.acknowledgeCheckpoint
  -> checkpointCoordinatorGateway.acknowledgeCheckpoint(
  ->  JobMaster.acknowledgeCheckpoint();
   ->  SchedulerBase.acknowledgeCheckpoint();
    ->  CheckpointCoordinator.receiveAcknowledgeMessage();
		// 处理 Task 节点返回的 ack 信息
		PendingCheckpoint.acknowledgeTask();
		// 判断该 PendingCheckpoint 该发的和该收到的 ack 是否都已经成功 ack
		if(checkpoint.isFullyAcknowledged()) {
		// TODO_MA 注释: 更改 PendingCheckpoint 为 CompletedCheckpoint
		CheckpointCoordinator.completePendingCheckpoint(checkpoint);
		}

CheckpointCoordinator 收到所有 Task 执行完 snapshot 的 ack 后,向所有需要提交的 task 发送 checkpoint 完成的通知(notifyCheckpointComplete)
-> completePendingCheckpoint -> sendAcknowledgeMessages
	private void sendAcknowledgeMessages(long checkpointId, long timestamp) {
		// commit tasks
		for(ExecutionVertex ev : tasksToCommitTo) {
			Execution ee = ev.getCurrentExecutionAttempt();
			if(ee != null) {

				// TODO_MA 注释: 返回消息
				/**
				 *  向各个节点发送checkpoint完成的消息,此方法很重要
				 *  逐级发送通知到所有的task,StreamTask,再到所有的operator,userFunction
				 *  最后如果userFunction实现了CheckpointListener接口
				 *  逐个调用这些userFunction的notifyCheckpointComplete方法
				 */
				ee.notifyCheckpointComplete(checkpointId, timestamp);
			}
		}

-> taskManagerGateway.notifyCheckpointComplete -> taskExecutorGateway.confirmCheckpoint
 -> task.notifyCheckpointComplete -> invokable.notifyCheckpointCompleteAsync(checkpointID);
  -> notifyCheckpointComplete -> subtaskCheckpointCoordinator.notifyCheckpointComplete
   -> AbstractUdfStreamOperator.notifyCheckpointComplete
   	/**
	 * Forwards the checkpoint-complete notification to the superclass and then to the user function.
	 *
	 * <p>The user function is only notified when it implements {@code CheckpointListener}.
	 *
	 * @param checkpointId id of the checkpoint that completed
	 * @throws Exception propagated from the superclass or from the user function's callback
	 */
	@Override
	public void notifyCheckpointComplete(long checkpointId) throws Exception {
		super.notifyCheckpointComplete(checkpointId);

		// Nothing more to do for user functions that do not listen for checkpoint completion.
		if(!(userFunction instanceof CheckpointListener)) {
			return;
		}

		final CheckpointListener listener = (CheckpointListener) userFunction;
		listener.notifyCheckpointComplete(checkpointId);
	}

Checkpoint State恢复源码剖析

JobMaster.createScheduler();
DefaultSchedulerFactory.createInstance();
new DefaultScheduler();
SchedulerBase.createAndRestoreExecutionGraph();
SchedulerBase.tryRestoreExecutionGraphFromSavepoint();
CheckpointCoordinator.restoreSavepoint();

到达 CheckpointCoordinator 的 restoreSavepoint() 方法,进入 checkpoint state restore 流程。
所以 Checkpoint State Restore 的核心入口是
CheckpointCoordinator.restoreSavepoint();

  • 0
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值