【无标题】

Flink Checkpoint 源码

package org.apache.flink.runtime.jobmaster;
public class JobMaster extends FencedRpcEndpoint<JobMasterId>
        implements JobMasterGateway, JobMasterService {


     private final SchedulerNG schedulerNG;


	  /**
	   * Start callback of the job master: begins execution of the job.
	   *
	   * <p>Any exception raised by {@code startJobExecution()} is wrapped in a
	   * {@code JobMasterException}, handed to {@code handleJobMasterError(...)} and then
	   * rethrown to the caller.
	   *
	   * @throws JobMasterException if the job execution could not be started
	   */
	  @Override
	  protected void onStart() throws JobMasterException {
	      try {
	          startJobExecution();
	      } catch (Exception startFailure) {
	          // Report the failure first, then propagate the very same exception instance.
	          final JobMasterException wrapped =
	                  new JobMasterException("Could not start the JobMaster.", startFailure);
	          handleJobMasterError(wrapped);
	          throw wrapped;
	      }
	  }


	     /**
	      * Runs the start-up sequence for the job: registers the job with the shuffle master,
	      * starts the job master services, logs the start and finally begins scheduling.
	      *
	      * <p>Begins with {@code validateRunsInMainThread()}.
	      *
	      * @throws Exception if any of the start-up steps fails
	      */
	     private void startJobExecution() throws Exception {
	         validateRunsInMainThread();

	         // Register this job (identified by its job ID) with the shuffle master.
	         shuffleMaster.registerJob(new JobShuffleContextImpl(jobGraph.getJobID(), this));

	         startJobMasterServices();

	         log.info(
	                 "Starting execution of job '{}' ({}) under job master id {}.",
	                 jobGraph.getName(),
	                 jobGraph.getJobID(),
	                 getFencingToken());

	         startScheduling();
	     }


	      /** Delegates to the scheduler ({@code schedulerNG}) to start scheduling the job. */
	      private void startScheduling() {
	          schedulerNG.startScheduling();
	      }



package org.apache.flink.runtime.scheduler;
public abstract class SchedulerBase implements SchedulerNG, CheckpointScheduling {

  /**
   * Entry point for scheduling: registers the job metrics, starts all operator coordinators
   * and then hands over to {@code startSchedulingInternal()}.
   *
   * <p>Asserts via the main thread executor that it runs in the main thread.
   */
  public final void startScheduling() {
      mainThreadExecutor.assertRunningInMainThread();

      // Timestamp at which the execution graph entered INITIALIZING, passed on to the metrics.
      final long initializingTimestamp =
              executionGraph.getStatusTimestamp(JobStatus.INITIALIZING);
      registerJobMetrics(
              jobManagerJobMetricGroup,
              executionGraph,
              this::getNumberOfRestarts,
              deploymentStateTimeMetrics,
              executionGraph::registerJobStatusListener,
              initializingTimestamp,
              jobStatusMetricsSettings);

      operatorCoordinatorHandler.startAllOperatorCoordinators();
      startSchedulingInternal();
  }

package org.apache.flink.runtime.scheduler;
public class DefaultScheduler extends SchedulerBase implements SchedulerOperations {

	 /**
	  * Logs the scheduling strategy in use, switches the job to running via
	  * {@code transitionToRunning()} and then starts the scheduling strategy.
	  */
	 @Override
	 protected void startSchedulingInternal() {
	     final String strategyName = schedulingStrategy.getClass().getName();
	     log.info("Starting scheduling with scheduling strategy [{}]", strategyName);

	     transitionToRunning();
	     schedulingStrategy.startScheduling();
	 }

package org.apache.flink.runtime.executiongraph;
public class DefaultExecutionGraph implements ExecutionGraph, InternalExecutionGraphAccessor {

  /**
   * Moves the job from {@code JobStatus.CREATED} to {@code JobStatus.RUNNING}.
   *
   * @throws IllegalStateException if the state transition was not performed
   */
  @Override
  public void transitionToRunning() {
      final boolean switched = transitionState(JobStatus.CREATED, JobStatus.RUNNING);
      if (switched) {
          return;
      }
      throw new IllegalStateException(
              "Job may only be scheduled from state " + JobStatus.CREATED);
  }

package org.apache.flink.runtime.executiongraph;
public class DefaultExecutionGraph implements ExecutionGraph, InternalExecutionGraphAccessor {

/**
 * Attempts the state transition {@code current -> newState} without an associated error.
 *
 * @param current the state the job is expected to be in
 * @param newState the state to switch to
 * @return {@code true} if the transition was performed, {@code false} otherwise
 */
@Override
public boolean transitionState(JobStatus current, JobStatus newState) {
    final Throwable noError = null;
    return transitionState(current, newState, noError);
}

/**
 * Switches the job state from {@code current} to {@code newState} if the job is currently in
 * {@code current}.
 *
 * <p>On a successful switch the transition is logged, the timestamp slot for the new state is
 * updated and the job status listeners are notified.
 *
 * @param current the state the job is expected to be in; must not be a terminal state
 * @param newState the state to switch to
 * @param error optional throwable passed to the log statement, may be {@code null}
 * @return {@code true} if the state was switched, {@code false} if the job was not in
 *     {@code current}
 * @throws IllegalStateException if {@code current} is a terminal state
 */
private boolean transitionState(JobStatus current, JobStatus newState, Throwable error) {
    assertRunningInJobMasterMainThread();

    // Consistency check: a terminal state must never be left again.
    if (current.isTerminalState()) {
        final String message = "Job is trying to leave terminal state " + current;
        LOG.error(message);
        throw new IllegalStateException(message);
    }

    // Somebody else already moved the job on: nothing to do.
    if (state != current) {
        return false;
    }

    // Perform the actual transition.
    state = newState;
    LOG.info(
            "Job {} ({}) switched from state {} to {}.",
            getJobName(),
            getJobID(),
            current,
            newState,
            error);

    stateTimestamps[newState.ordinal()] = System.currentTimeMillis();
    notifyJobStatusChange(newState);
    return true;
}


    /**
     * Informs every registered job status listener that the job switched to {@code newState}.
     *
     * <p>All listeners receive the same timestamp. A failing listener is logged as a warning
     * and does not prevent the remaining listeners from being notified.
     *
     * @param newState the state the job has just switched to
     */
    private void notifyJobStatusChange(JobStatus newState) {
        if (jobStatusListeners.isEmpty()) {
            return;
        }

        final long timestamp = System.currentTimeMillis();
        for (JobStatusListener statusListener : jobStatusListeners) {
            try {
                statusListener.jobStatusChanges(getJobID(), newState, timestamp);
            } catch (Throwable listenerFailure) {
                LOG.warn("Error while notifying JobStatusListener", listenerFailure);
            }
        }
    }


package org.apache.flink.runtime.checkpoint;
public class CheckpointCoordinatorDeActivator implements JobStatusListener {

     /**
      * Reacts to a job status change: the checkpoint scheduler is started when the job
      * switches to {@code JobStatus.RUNNING} and stopped for every other status.
      *
      * @param jobId ID of the job (not used here)
      * @param newJobStatus the status the job switched to
      * @param timestamp timestamp of the status change (not used here)
      */
     public void jobStatusChanges(JobID jobId, JobStatus newJobStatus, long timestamp) {
         if (newJobStatus != JobStatus.RUNNING) {
             // anything but RUNNING stops the trigger for now
             coordinator.stopCheckpointScheduler();
             return;
         }

         // the job is running: start the checkpoint scheduler
         coordinator.startCheckpointScheduler();
     }


package org.apache.flink.runtime.checkpoint
	public class CheckpointCoordinator {


	 public void startCheckpointScheduler() {
        synchronized (lock) {
            if (shutdown) {
                throw new IllegalArgumentException("Checkpoint coordinator is shut down");
            }
            Preconditions.checkState(
                    isPeriodicCheckpointingConfigured(),
                    "Can not start checkpoint scheduler, if no periodic checkpointing is configured");

            // make sure all prior timers are cancelled
            stopCheckpointScheduler();

            periodicScheduling = true;
            currentPeriodicTrigger = scheduleTriggerWithDelay(getRandomInitDelay());
        

     /**
      * Schedules a {@code ScheduledTrigger} on the {@code timer} at a fixed rate.
      *
      * @param initDelay delay before the first run, in milliseconds
      * @return the future returned by the timer for the scheduled trigger; the period between
      *     runs is {@code baseInterval} milliseconds
      */
     private ScheduledFuture<?> scheduleTriggerWithDelay(long initDelay) {
         final Runnable periodicTrigger = new ScheduledTrigger();
         return timer.scheduleAtFixedRate(
                 periodicTrigger, initDelay, baseInterval, TimeUnit.MILLISECONDS);
     }


       private final class ScheduledTrigger implements Runnable {

	        @Override
	        public void run() {
	            try {
	                triggerCheckpoint(true);
	            } catch (Exception e) {
	                LOG.error("Exception while triggering checkpoint for job {}.", job, e);
	            }
	        }
    	}


    /**
     * Creates a checkpoint trigger request and, if it is chosen for execution right away,
     * starts triggering it.
     *
     * @param props properties of the checkpoint to trigger
     * @param externalSavepointLocation external savepoint location, may be {@code null}
     * @param isPeriodic whether the request stems from the periodic trigger
     * @return the completion promise of the created request
     */
    @VisibleForTesting
    public CompletableFuture<CompletedCheckpoint> triggerCheckpoint(
            CheckpointProperties props,
            @Nullable String externalSavepointLocation,
            boolean isPeriodic) {

        final CheckpointTriggerRequest triggerRequest =
                new CheckpointTriggerRequest(props, externalSavepointLocation, isPeriodic);

        // Only the request selected by chooseRequestToExecute (if any) is started now.
        chooseRequestToExecute(triggerRequest)
                .ifPresent(chosen -> startTriggeringCheckpoint(chosen));

        return triggerRequest.onCompletionPromise;
    }

    /**
     * Drives the asynchronous triggering of one checkpoint for the given request.
     *
     * <p>Steps, as chained below:
     *
     * <ol>
     *   <li>run {@code preCheckGlobalState} under the coordinator lock and mark the coordinator
     *       as triggering;
     *   <li>calculate the checkpoint plan;
     *   <li>obtain a checkpoint ID from {@code checkpointIdCounter} (on {@code executor});
     *   <li>create the pending checkpoint (on {@code timer});
     *   <li>initialize the checkpoint storage location (on {@code executor});
     *   <li>set the target location and trigger the operator coordinator checkpoints (on
     *       {@code timer});
     *   <li>snapshot the master state once the coordinator checkpoints are done;
     *   <li>when everything completed, either call {@code triggerCheckpointRequest} or report
     *       the failure via {@code onTriggerFailure}.
     * </ol>
     *
     * <p>Any throwable raised synchronously inside this method is passed to
     * {@code onTriggerFailure(request, throwable)}.
     *
     * @param request the trigger request to execute
     */
    private void startTriggeringCheckpoint(CheckpointTriggerRequest request) {
        try {
            synchronized (lock) {
                preCheckGlobalState(request.isPeriodic);
            }

            // we will actually trigger this checkpoint!
            Preconditions.checkState(!isTriggering);
            isTriggering = true;

            // Single trigger timestamp that is reused for the pending checkpoint and the tasks.
            final long timestamp = System.currentTimeMillis();

            CompletableFuture<CheckpointPlan> checkpointPlanFuture =
                    checkpointPlanCalculator.calculateCheckpointPlan();

            // Remember whether this call is the first one to flip the flag; the value is passed
            // on to initializeCheckpointLocation below.
            boolean initializeBaseLocations = !baseLocationsForCheckpointInitialized;
            baseLocationsForCheckpointInitialized = true;

            // Completed (via FutureUtils.forward below) once both the coordinator checkpoints
            // and the master state snapshot are done.
            CompletableFuture<Void> masterTriggerCompletionPromise = new CompletableFuture<>();

            final CompletableFuture<PendingCheckpoint> pendingCheckpointCompletableFuture =
                    checkpointPlanFuture
                            .thenApplyAsync(
                                    plan -> {
                                        try {
                                            // this must happen outside the coordinator-wide lock,
                                            // because it communicates with external services
                                            // (in HA mode) and may block for a while.
                                            long checkpointID =
                                                    checkpointIdCounter.getAndIncrement();
                                            return new Tuple2<>(plan, checkpointID);
                                        } catch (Throwable e) {
                                            throw new CompletionException(e);
                                        }
                                    },
                                    executor)
                            .thenApplyAsync(
                                    (checkpointInfo) ->
                                            createPendingCheckpoint(
                                                    timestamp,
                                                    request.props,
                                                    checkpointInfo.f0,
                                                    request.isPeriodic,
                                                    checkpointInfo.f1,
                                                    request.getOnCompletionFuture(),
                                                    masterTriggerCompletionPromise),
                                    timer);

            final CompletableFuture<?> coordinatorCheckpointsComplete =
                    pendingCheckpointCompletableFuture
                            .thenApplyAsync(
                                    pendingCheckpoint -> {
                                        try {
                                            // Runs on 'executor', not on 'timer'.
                                            CheckpointStorageLocation checkpointStorageLocation =
                                                    initializeCheckpointLocation(
                                                            pendingCheckpoint.getCheckpointID(),
                                                            request.props,
                                                            request.externalSavepointLocation,
                                                            initializeBaseLocations);
                                            return Tuple2.of(
                                                    pendingCheckpoint, checkpointStorageLocation);
                                        } catch (Throwable e) {
                                            throw new CompletionException(e);
                                        }
                                    },
                                    executor)
                            .thenComposeAsync(
                                    (checkpointInfo) -> {
                                        PendingCheckpoint pendingCheckpoint = checkpointInfo.f0;
                                        if (pendingCheckpoint.isDisposed()) {
                                            // The disposed checkpoint will be handled later,
                                            // skip snapshotting the coordinator states.
                                            // NOTE(review): this returns null from a compose
                                            // function; presumably the composed stage then
                                            // completes exceptionally - confirm this is the
                                            // intended way to reach the failure handling below.
                                            return null;
                                        }
                                        synchronized (lock) {
                                            pendingCheckpoint.setCheckpointTargetLocation(
                                                    checkpointInfo.f1);
                                        }
                                        return OperatorCoordinatorCheckpoints
                                                .triggerAndAcknowledgeAllCoordinatorCheckpointsWithCompletion(
                                                        coordinatorsToCheckpoint,
                                                        pendingCheckpoint,
                                                        timer);
                                    },
                                    timer);

            // We have to take the snapshot of the master hooks after the coordinator checkpoints
            // has completed.
            // This is to ensure the tasks are checkpointed after the OperatorCoordinators in case
            // ExternallyInducedSource is used.
            final CompletableFuture<?> masterStatesComplete =
                    coordinatorCheckpointsComplete.thenComposeAsync(
                            ignored -> {
                                // If the code reaches here, the pending checkpoint is guaranteed to
                                // be not null.
                                // We use FutureUtils.getWithoutException() to make compiler happy
                                // with checked
                                // exceptions in the signature.
                                PendingCheckpoint checkpoint =
                                        FutureUtils.getWithoutException(
                                                pendingCheckpointCompletableFuture);
                                if (checkpoint == null || checkpoint.isDisposed()) {
                                    // The disposed checkpoint will be handled later,
                                    // skip snapshotting the master states.
                                    return null;
                                }
                                return snapshotMasterState(checkpoint);
                            },
                            timer);

            FutureUtils.forward(
                    CompletableFuture.allOf(masterStatesComplete, coordinatorCheckpointsComplete),
                    masterTriggerCompletionPromise);

            FutureUtils.assertNoException(
                    masterTriggerCompletionPromise
                            .handleAsync(
                                    (ignored, throwable) -> {
                                        final PendingCheckpoint checkpoint =
                                                FutureUtils.getWithoutException(
                                                        pendingCheckpointCompletableFuture);

                                        Preconditions.checkState(
                                                checkpoint != null || throwable != null,
                                                "Either the pending checkpoint needs to be created or an error must have occurred.");

                                        if (throwable != null) {
                                            // the initialization might not be finished yet
                                            if (checkpoint == null) {
                                                onTriggerFailure(request, throwable);
                                            } else {
                                                onTriggerFailure(checkpoint, throwable);
                                            }
                                        } else {
                                            // No error so far: go on and trigger the tasks.
                                            triggerCheckpointRequest(
                                                    request, timestamp, checkpoint);
                                        }
                                        return null;
                                    },
                                    timer)
                            .exceptionally(
                                    error -> {
                                        // Errors are rethrown unless the coordinator is shut
                                        // down; during shutdown they are only logged.
                                        if (!isShutdown()) {
                                            throw new CompletionException(error);
                                        } else if (findThrowable(
                                                        error, RejectedExecutionException.class)
                                                .isPresent()) {
                                            LOG.debug("Execution rejected during shutdown");
                                        } else {
                                            LOG.warn("Error encountered during shutdown", error);
                                        }
                                        return null;
                                    }));
        } catch (Throwable throwable) {
            onTriggerFailure(request, throwable);
        }
    }


	 /**
	  * Triggers the tasks for a pending checkpoint, unless the checkpoint is already disposed.
	  *
	  * <p>A disposed checkpoint is reported through {@code onTriggerFailure}. Otherwise the
	  * tasks are triggered; if that fails, the pending checkpoint is aborted on the
	  * {@code timer} under the coordinator lock. Afterwards the coordinator contexts are
	  * told that the source barrier was injected, and the checkpoint is completed right away
	  * if that is already possible.
	  *
	  * @param request the trigger request being executed
	  * @param timestamp the trigger timestamp of the checkpoint
	  * @param checkpoint the pending checkpoint to trigger
	  */
	 private void triggerCheckpointRequest(
	         CheckpointTriggerRequest request, long timestamp, PendingCheckpoint checkpoint) {
	     if (checkpoint.isDisposed()) {
	         final CheckpointException disposedFailure =
	                 new CheckpointException(
	                         CheckpointFailureReason.TRIGGER_CHECKPOINT_FAILURE,
	                         checkpoint.getFailureCause());
	         onTriggerFailure(checkpoint, disposedFailure);
	         return;
	     }

	     triggerTasks(request, timestamp, checkpoint)
	             .exceptionally(
	                     failure -> {
	                         LOG.info(
	                                 "Triggering Checkpoint {} for job {} failed due to {}",
	                                 checkpoint.getCheckpointID(),
	                                 job,
	                                 failure);

	                         // Reuse a CheckpointException as is, wrap anything else.
	                         final CheckpointException cause =
	                                 failure instanceof CheckpointException
	                                         ? (CheckpointException) failure
	                                         : new CheckpointException(
	                                                 CheckpointFailureReason
	                                                         .TRIGGER_CHECKPOINT_FAILURE,
	                                                 failure);

	                         timer.execute(
	                                 () -> {
	                                     synchronized (lock) {
	                                         abortPendingCheckpoint(checkpoint, cause);
	                                     }
	                                 });
	                         return null;
	                     });

	     coordinatorsToCheckpoint.forEach(
	             coordinatorContext ->
	                     coordinatorContext.afterSourceBarrierInjection(
	                             checkpoint.getCheckpointID()));

	     // The tasks may already have finished checkpointing at this point, in which case
	     // the pending checkpoint has to be completed here.
	     final boolean completed = maybeCompleteCheckpoint(checkpoint);
	     if (completed) {
	         onTriggerSuccess();
	     }
	 }

    /**
     * Sends the trigger message for the given pending checkpoint to every task listed in its
     * checkpoint plan as a task to trigger.
     *
     * <p>A full checkpoint is forced when {@code forceFullSnapshot} is set and the request is
     * not a savepoint; otherwise the checkpoint type of the request is used. Synchronous
     * requests use {@code triggerSynchronousSavepoint}, all others {@code triggerCheckpoint}.
     *
     * @param request the trigger request being executed
     * @param timestamp the trigger timestamp of the checkpoint
     * @param checkpoint the pending checkpoint to trigger
     * @return a future over all task acknowledgements, as produced by
     *     {@code FutureUtils.waitForAll}
     */
    private CompletableFuture<Void> triggerTasks(
            CheckpointTriggerRequest request, long timestamp, PendingCheckpoint checkpoint) {
        // no exception, no discarding, everything is OK
        final long checkpointId = checkpoint.getCheckpointID();

        final SnapshotType type =
                (this.forceFullSnapshot && !request.props.isSavepoint())
                        ? CheckpointType.FULL_CHECKPOINT
                        : request.props.getCheckpointType();

        final CheckpointOptions checkpointOptions =
                CheckpointOptions.forConfig(
                        type,
                        checkpoint.getCheckpointStorageLocation().getLocationReference(),
                        isExactlyOnceMode,
                        unalignedCheckpointsEnabled,
                        alignedCheckpointTimeout);

        // send messages to the tasks to trigger their checkpoints
        final List<CompletableFuture<Acknowledge>> acks = new ArrayList<>();
        for (Execution taskExecution : checkpoint.getCheckpointPlan().getTasksToTrigger()) {
            final CompletableFuture<Acknowledge> ack =
                    request.props.isSynchronous()
                            ? taskExecution.triggerSynchronousSavepoint(
                                    checkpointId, timestamp, checkpointOptions)
                            : taskExecution.triggerCheckpoint(
                                    checkpointId, timestamp, checkpointOptions);
            acks.add(ack);
        }
        return FutureUtils.waitForAll(acks);
    }
  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值