flink的checkpoint部分源码阅读二(配置系列)

13 篇文章 0 订阅
9 篇文章 1 订阅

在前面的文章中,我们从官网上了解了 checkpoint 的配置。为了加深大家的印象,这次我们从源码层面看看到底有哪些配置。


前言

Flink 中的每个方法或算子都能够是有状态的。 状态化的方法在处理单个元素/事件 的时候存储数据,让状态成为使各个类型的算子更加精细的重要部分。 为了让状态容错,Flink 需要为状态添加 checkpoint(检查点)。Checkpoint 使得 Flink 能够恢复状态和在流中的位置,从而向应用提供和无故障执行时一样的语义。

源码

源码位置:在flink-streaming-java模块下
包名叫 org.apache.flink.streaming.api.environment

废话不多说,直接上源码,关键的地方给大家注释了一下

package org.apache.flink.streaming.api.environment;

import org.apache.flink.annotation.Experimental;
import org.apache.flink.annotation.Public;
import org.apache.flink.annotation.PublicEvolving;
import org.apache.flink.api.common.JobStatus;
import org.apache.flink.configuration.ReadableConfig;
import org.apache.flink.core.fs.Path;
import org.apache.flink.runtime.state.CheckpointStorage;
import org.apache.flink.runtime.state.StateBackend;
import org.apache.flink.runtime.state.storage.FileSystemCheckpointStorage;
import org.apache.flink.streaming.api.CheckpointingMode;
import org.apache.flink.util.Preconditions;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import javax.annotation.Nullable;

import java.net.URI;
import java.time.Duration;

import static java.util.Objects.requireNonNull;
import static org.apache.flink.runtime.checkpoint.CheckpointFailureManager.UNLIMITED_TOLERABLE_FAILURE_NUMBER;
import static org.apache.flink.runtime.jobgraph.tasks.CheckpointCoordinatorConfiguration.MINIMAL_CHECKPOINT_TIME;
import static org.apache.flink.util.Preconditions.checkNotNull;

/**  捕获所有检查点相关设置的配置。*/
@Public
public class CheckpointConfig implements java.io.Serializable {

    private static final long serialVersionUID = -750378776078908147L;

    private static final Logger LOG = LoggerFactory.getLogger(CheckpointConfig.class);

    /** The default checkpointing mode: exactly once. */
    public static final CheckpointingMode DEFAULT_MODE = CheckpointingMode.EXACTLY_ONCE;

    /** The default timeout of a checkpoint attempt: 10 minutes, expressed in milliseconds. */
    public static final long DEFAULT_TIMEOUT = 10 * 60 * 1000;

    /** The default minimum pause between checkpoints: none. */
    public static final long DEFAULT_MIN_PAUSE_BETWEEN_CHECKPOINTS = 0;

    /** The default limit of concurrently happening checkpoints: one. */
    public static final int DEFAULT_MAX_CONCURRENT_CHECKPOINTS = 1;

    /**
     * Sentinel stored in {@link #tolerableCheckpointFailureNumber} while no tolerable failure
     * number has been configured.
     */
    public static final int UNDEFINED_TOLERABLE_CHECKPOINT_NUMBER = -1;

    /** Default id of the checkpoint whose in-flight data should be ignored on recovery. */
    public static final int DEFAULT_CHECKPOINT_ID_OF_IGNORED_IN_FLIGHT_DATA = -1;

    // ------------------------------------------------------------------------

    /** Checkpointing mode (exactly-once vs. at-least-once). */
    private CheckpointingMode checkpointingMode = DEFAULT_MODE;

    /** Periodic checkpoint triggering interval; a value that is not positive means disabled. */
    private long checkpointInterval = -1; // disabled

    /** Maximum time a checkpoint may take before being discarded. */
    private long checkpointTimeout = DEFAULT_TIMEOUT;

    /** Minimal pause between checkpointing attempts. */
    private long minPauseBetweenCheckpoints = DEFAULT_MIN_PAUSE_BETWEEN_CHECKPOINTS;

    /** Maximum number of checkpoint attempts in progress at the same time. */
    private int maxConcurrentCheckpoints = DEFAULT_MAX_CONCURRENT_CHECKPOINTS;

    /** Flag to force checkpointing in iterative jobs. */
    private boolean forceCheckpointing;

    /** Flag to force unaligned checkpoints, despite iteration feedback or custom partitioners. */
    private boolean forceUnalignedCheckpoints;

    /** Flag to enable unaligned checkpoints. */
    private boolean unalignedCheckpointsEnabled;

    /** Id of the checkpoint whose in-flight data should be ignored on recovery. */
    private long checkpointIdOfIgnoredInFlightData =
            DEFAULT_CHECKPOINT_ID_OF_IGNORED_IN_FLIGHT_DATA;

    /**
     * Delay, measured from the start of a checkpoint, after which an aligned checkpoint (AC)
     * switches to an unaligned checkpoint (UC).
     */
    private Duration alignedCheckpointTimeout =
            ExecutionCheckpointingOptions.ALIGNED_CHECKPOINT_TIMEOUT.defaultValue();

    /** Flag to enable approximate local recovery. */
    private boolean approximateLocalRecovery;

    /**
     * Cleanup behaviour for externalized (persisted) checkpoints; {@code null} while
     * externalized checkpoints are not enabled.
     */
    private ExternalizedCheckpointCleanup externalizedCheckpointCleanup;

    /**
     * Task would not fail if there is an error in their checkpointing.
     *
     * <p>{@link #tolerableCheckpointFailureNumber} would always overrule this deprecated field if
     * they have conflicts.
     *
     * @deprecated Use {@link #tolerableCheckpointFailureNumber}.
     */
    @Deprecated private boolean failOnCheckpointingErrors = true;

    /** Determines if a job will fallback to checkpoint when there is a more recent savepoint. */
    private boolean preferCheckpointForRecovery = false;

    /**
     * Determines the threshold that we tolerance declined checkpoint failure number. The default
     * value is -1 meaning undetermined and not set via {@link
     * #setTolerableCheckpointFailureNumber(int)}.
     */
    private int tolerableCheckpointFailureNumber = UNDEFINED_TOLERABLE_CHECKPOINT_NUMBER;

    /**
     * The checkpoint storage for this application. This field is marked as transient because it may
     * contain user-code.
     */
    private transient CheckpointStorage storage;

    /**
     * Creates a copy of the provided {@link CheckpointConfig}.
     *
     * <p>Every setting is copied field by field. This includes the deprecated {@code
     * failOnCheckpointingErrors} flag, so that {@link #isFailOnCheckpointingErrors()} reports the
     * same value on the copy as on the original. The {@link CheckpointStorage} is copied by
     * reference, i.e. both configs point at the same storage object afterwards.
     *
     * @param checkpointConfig the config to copy; must not be {@code null}.
     * @throws NullPointerException if {@code checkpointConfig} is {@code null}.
     */
    public CheckpointConfig(final CheckpointConfig checkpointConfig) {
        checkNotNull(checkpointConfig);

        this.checkpointInterval = checkpointConfig.checkpointInterval;
        this.checkpointingMode = checkpointConfig.checkpointingMode;
        this.checkpointTimeout = checkpointConfig.checkpointTimeout;
        this.maxConcurrentCheckpoints = checkpointConfig.maxConcurrentCheckpoints;
        this.minPauseBetweenCheckpoints = checkpointConfig.minPauseBetweenCheckpoints;
        this.preferCheckpointForRecovery = checkpointConfig.preferCheckpointForRecovery;
        // Copy the deprecated flag together with the number that overrules it; otherwise the
        // copy would silently fall back to the field default (true).
        this.failOnCheckpointingErrors = checkpointConfig.failOnCheckpointingErrors;
        this.tolerableCheckpointFailureNumber = checkpointConfig.tolerableCheckpointFailureNumber;
        this.unalignedCheckpointsEnabled = checkpointConfig.isUnalignedCheckpointsEnabled();
        this.alignedCheckpointTimeout = checkpointConfig.alignedCheckpointTimeout;
        this.approximateLocalRecovery = checkpointConfig.isApproximateLocalRecoveryEnabled();
        this.externalizedCheckpointCleanup = checkpointConfig.externalizedCheckpointCleanup;
        this.forceCheckpointing = checkpointConfig.forceCheckpointing;
        this.forceUnalignedCheckpoints = checkpointConfig.forceUnalignedCheckpoints;
        // Shared by reference: the storage object itself is not cloned.
        this.storage = checkpointConfig.getCheckpointStorage();
        this.checkpointIdOfIgnoredInFlightData =
                checkpointConfig.getCheckpointIdOfIgnoredInFlightData();
    }

    /**
     * Creates a config in which every setting carries its default value. Periodic checkpointing
     * stays disabled until a checkpoint interval is set.
     */
    public CheckpointConfig() {}

    // ------------------------------------------------------------------------

    /**
     * Turns periodic checkpointing off by resetting the checkpoint interval to {@code -1}.
     *
     * <p>After this call {@link #isCheckpointingEnabled()} returns {@code false}.
     */
    public void disableCheckpointing() {
        final long disabledInterval = -1;
        this.checkpointInterval = disabledInterval;
    }

    /**
     * Tells whether checkpointing is enabled, i.e. whether the checkpoint interval is positive.
     *
     * @return {@code true} if checkpointing is enabled, {@code false} otherwise.
     */
    public boolean isCheckpointingEnabled() {
        final boolean hasPositiveInterval = this.checkpointInterval > 0;
        return hasPositiveInterval;
    }

    /**
     * Returns the configured checkpointing mode (exactly-once vs. at-least-once).
     *
     * @return The checkpointing mode.
     */
    public CheckpointingMode getCheckpointingMode() {
        return this.checkpointingMode;
    }

    /**
     * Stores the checkpointing mode (exactly-once vs. at-least-once).
     *
     * @param checkpointingMode The checkpointing mode; must not be {@code null}.
     * @throws NullPointerException if {@code checkpointingMode} is {@code null}.
     */
    public void setCheckpointingMode(CheckpointingMode checkpointingMode) {
        // Reject null before touching the field so the previous mode survives a bad call.
        requireNonNull(checkpointingMode);
        this.checkpointingMode = checkpointingMode;
    }

    /**
     * Returns the interval at which checkpoints are periodically scheduled.
     *
     * <p>This is the base interval only. Actual triggering may be delayed by the settings
     * {@link #getMaxConcurrentCheckpoints()} and {@link #getMinPauseBetweenCheckpoints()}.
     *
     * @return The checkpoint interval, in milliseconds.
     */
    public long getCheckpointInterval() {
        return this.checkpointInterval;
    }

    /**
     * Defines the interval at which checkpoints are periodically scheduled.
     *
     * <p>This is the base interval only. Actual triggering may be delayed by the settings
     * {@link #setMaxConcurrentCheckpoints(int)} and {@link
     * #setMinPauseBetweenCheckpoints(long)}.
     *
     * @param checkpointInterval The checkpoint interval, in milliseconds.
     * @throws IllegalArgumentException if the interval is smaller than the minimal checkpoint
     *     time.
     */
    public void setCheckpointInterval(long checkpointInterval) {
        final boolean belowMinimum = checkpointInterval < MINIMAL_CHECKPOINT_TIME;
        if (belowMinimum) {
            final String message =
                    String.format(
                            "Checkpoint interval must be larger than or equal to %s ms",
                            MINIMAL_CHECKPOINT_TIME);
            throw new IllegalArgumentException(message);
        }
        this.checkpointInterval = checkpointInterval;
    }

    /**
     * Returns the maximum time that a checkpoint may take before being discarded.
     *
     * @return The checkpoint timeout, in milliseconds.
     */
    public long getCheckpointTimeout() {
        return this.checkpointTimeout;
    }

    /**
     * Defines the maximum time that a checkpoint may take before being discarded.
     *
     * @param checkpointTimeout The checkpoint timeout, in milliseconds.
     * @throws IllegalArgumentException if the timeout is smaller than the minimal checkpoint
     *     time.
     */
    public void setCheckpointTimeout(long checkpointTimeout) {
        final boolean belowMinimum = checkpointTimeout < MINIMAL_CHECKPOINT_TIME;
        if (belowMinimum) {
            final String message =
                    String.format(
                            "Checkpoint timeout must be larger than or equal to %s ms",
                            MINIMAL_CHECKPOINT_TIME);
            throw new IllegalArgumentException(message);
        }
        this.checkpointTimeout = checkpointTimeout;
    }

    /**
     * Returns the minimal pause between checkpointing attempts.
     *
     * <p>The value defines how soon the checkpoint coordinator may trigger another checkpoint
     * once doing so has become possible with respect to the maximum number of concurrent
     * checkpoints (see {@link #getMaxConcurrentCheckpoints()}).
     *
     * @return The minimal pause before the next checkpoint is triggered.
     */
    public long getMinPauseBetweenCheckpoints() {
        return this.minPauseBetweenCheckpoints;
    }

    /**
     * Defines the minimal pause between checkpointing attempts.
     *
     * <p>The value defines how soon the checkpoint coordinator may trigger another checkpoint
     * once doing so has become possible with respect to the maximum number of concurrent
     * checkpoints (see {@link #setMaxConcurrentCheckpoints(int)}).
     *
     * <p>With a maximum of one concurrent checkpoint, this setting effectively guarantees a
     * minimum amount of time during which no checkpoint is in progress at all.
     *
     * @param minPauseBetweenCheckpoints The minimal pause before the next checkpoint is triggered.
     * @throws IllegalArgumentException if the pause is negative.
     */
    public void setMinPauseBetweenCheckpoints(long minPauseBetweenCheckpoints) {
        if (minPauseBetweenCheckpoints >= 0) {
            this.minPauseBetweenCheckpoints = minPauseBetweenCheckpoints;
            return;
        }
        throw new IllegalArgumentException("Pause value must be zero or positive");
    }

    /**
     * Returns the maximum number of checkpoint attempts that may be in progress at the same time.
     *
     * <p>If this value is <i>n</i>, no checkpoint is triggered while <i>n</i> checkpoint attempts
     * are in flight. One of them has to finish or expire before the next one can be triggered.
     *
     * @return The maximum number of concurrent checkpoint attempts.
     */
    public int getMaxConcurrentCheckpoints() {
        return this.maxConcurrentCheckpoints;
    }

    /**
     * Defines the maximum number of checkpoint attempts that may be in progress at the same time.
     *
     * <p>If this value is <i>n</i>, no checkpoint is triggered while <i>n</i> checkpoint attempts
     * are in flight. One of them has to finish or expire before the next one can be triggered.
     *
     * @param maxConcurrentCheckpoints The maximum number of concurrent checkpoint attempts.
     * @throws IllegalArgumentException if the value is smaller than one.
     */
    public void setMaxConcurrentCheckpoints(int maxConcurrentCheckpoints) {
        final boolean atLeastOne = maxConcurrentCheckpoints >= 1;
        if (!atLeastOne) {
            throw new IllegalArgumentException(
                    "The maximum number of concurrent attempts must be at least one.");
        }
        this.maxConcurrentCheckpoints = maxConcurrentCheckpoints;
    }

    /**
     * Tells whether checkpointing is forced, despite currently non-checkpointable iteration
     * feedback.
     *
     * @return {@code true} if checkpointing is forced, {@code false} otherwise.
     * @deprecated This will be removed once iterations properly participate in checkpointing.
     */
    @Deprecated
    @PublicEvolving
    public boolean isForceCheckpointing() {
        return this.forceCheckpointing;
    }

    /**
     * Sets whether checkpointing is forced, despite currently non-checkpointable iteration
     * feedback.
     *
     * @param forceCheckpointing The flag to force checkpointing.
     * @deprecated This will be removed once iterations properly participate in checkpointing.
     */
    @Deprecated
    @PublicEvolving
    public void setForceCheckpointing(boolean forceCheckpointing) {
        final boolean forced = forceCheckpointing;
        this.forceCheckpointing = forced;
    }

    /**
     * Tells whether unaligned checkpoints are forced, despite iteration feedback.
     *
     * @return {@code true} if unaligned checkpoints are forced, {@code false} otherwise.
     */
    @PublicEvolving
    public boolean isForceUnalignedCheckpoints() {
        return this.forceUnalignedCheckpoints;
    }

    /**
     * Sets whether unaligned checkpoints are forced, despite currently non-checkpointable
     * iteration feedback or custom partitioners.
     *
     * @param forceUnalignedCheckpoints The flag to force unaligned checkpoints.
     */
    @PublicEvolving
    public void setForceUnalignedCheckpoints(boolean forceUnalignedCheckpoints) {
        final boolean forced = forceUnalignedCheckpoints;
        this.forceUnalignedCheckpoints = forced;
    }

    /**
     * Reports the behaviour when checkpoint errors are met.
     *
     * <p>{@code true} is equivalent to a tolerableCheckpointFailureNumber of zero: the job manager
     * fails the whole job once it receives a decline checkpoint message. {@code false} is
     * equivalent to a tolerableCheckpointFailureNumber of the maximum integer (unlimited): the job
     * manager does not fail the job no matter how many declined checkpoints it receives.
     *
     * @return the value of the deprecated fail-on-checkpointing-errors flag.
     * @deprecated Use {@link #getTolerableCheckpointFailureNumber()}.
     */
    @Deprecated
    public boolean isFailOnCheckpointingErrors() {
        return this.failOnCheckpointingErrors;
    }

    /**
     * Sets the expected behaviour for tasks that encounter an error while checkpointing.
     *
     * <p>{@code true} is equivalent to setting tolerableCheckpointFailureNumber to zero: the job
     * manager fails the whole job once it receives a decline checkpoint message. {@code false} is
     * equivalent to setting tolerableCheckpointFailureNumber to the maximum integer (unlimited):
     * the job manager does not fail the job no matter how many declined checkpoints it receives.
     *
     * <p>{@link #setTolerableCheckpointFailureNumber(int)} always overrules this deprecated
     * method: once a tolerable failure number has been configured, this call only logs a warning
     * and changes nothing. Because this method itself stores a tolerable failure number, a later
     * call to it on the same config is ignored in the same way.
     *
     * @param failOnCheckpointingErrors whether a checkpointing error should fail the job.
     * @deprecated Use {@link #setTolerableCheckpointFailureNumber(int)}.
     */
    @Deprecated
    public void setFailOnCheckpointingErrors(boolean failOnCheckpointingErrors) {
        final boolean numberNotYetConfigured =
                tolerableCheckpointFailureNumber == UNDEFINED_TOLERABLE_CHECKPOINT_NUMBER;
        if (numberNotYetConfigured) {
            this.failOnCheckpointingErrors = failOnCheckpointingErrors;
            // Translate the legacy flag into the failure number that actually drives behaviour.
            this.tolerableCheckpointFailureNumber =
                    failOnCheckpointingErrors ? 0 : UNLIMITED_TOLERABLE_FAILURE_NUMBER;
            return;
        }
        LOG.warn(
                "Since tolerableCheckpointFailureNumber has been configured as {}, deprecated #setFailOnCheckpointingErrors(boolean) "
                        + "method would not take any effect and please use #setTolerableCheckpointFailureNumber(int) method to "
                        + "determine your expected behaviour when checkpoint errors on task side.",
                tolerableCheckpointFailureNumber);
    }

    /**
     * Returns the tolerable checkpoint failure number, which the checkpoint failure manager uses
     * to determine when the job needs to be failed.
     *
     * <p>If {@link #tolerableCheckpointFailureNumber} has not been configured, this method returns
     * 0, meaning the checkpoint failure manager does not tolerate any declined checkpoint failure.
     *
     * @return the configured tolerable failure number, or 0 if none has been configured.
     */
    public int getTolerableCheckpointFailureNumber() {
        final boolean undefined =
                tolerableCheckpointFailureNumber == UNDEFINED_TOLERABLE_CHECKPOINT_NUMBER;
        return undefined ? 0 : tolerableCheckpointFailureNumber;
    }

    /**
     * Sets the tolerable checkpoint failure number. The default is 0, which means no checkpoint
     * failure is tolerated.
     *
     * @param tolerableCheckpointFailureNumber the number of tolerable checkpoint failures.
     * @throws IllegalArgumentException if the number is negative.
     */
    public void setTolerableCheckpointFailureNumber(int tolerableCheckpointFailureNumber) {
        final boolean nonNegative = tolerableCheckpointFailureNumber >= 0;
        if (!nonNegative) {
            throw new IllegalArgumentException(
                    "The tolerable failure checkpoint number must be non-negative.");
        }
        this.tolerableCheckpointFailureNumber = tolerableCheckpointFailureNumber;
    }

    /**
     * Enables checkpoints to be persisted externally.
     *
     * <p>Externalized checkpoints write their meta data out to persistent storage and are
     * <strong>not</strong> automatically cleaned up when the owning job fails or is suspended
     * (terminating with job status {@link JobStatus#FAILED} or {@link JobStatus#SUSPENDED}). In
     * this case, you have to manually clean up the checkpoint state, both the meta data and actual
     * program state.
     *
     * <p>The {@link ExternalizedCheckpointCleanup} mode defines how an externalized checkpoint
     * should be cleaned up on job cancellation. If you choose to retain externalized checkpoints
     * on cancellation, you have to handle checkpoint clean up manually when you cancel the job as
     * well (terminating with job status {@link JobStatus#CANCELED}).
     *
     * <p>The target directory for externalized checkpoints is configured via {@link
     * org.apache.flink.configuration.CheckpointingOptions#CHECKPOINTS_DIRECTORY}.
     *
     * @param cleanupMode Externalized checkpoint cleanup behaviour; must not be {@code null}.
     * @throws NullPointerException if {@code cleanupMode} is {@code null}.
     */
    @PublicEvolving
    public void enableExternalizedCheckpoints(ExternalizedCheckpointCleanup cleanupMode) {
        // Validate first so a null argument leaves the current setting untouched.
        checkNotNull(cleanupMode);
        this.externalizedCheckpointCleanup = cleanupMode;
    }

    /**
     * Tells whether checkpoints should be persisted externally, i.e. whether a cleanup mode has
     * been configured.
     *
     * @return <code>true</code> if checkpoints should be externalized.
     */
    @PublicEvolving
    public boolean isExternalizedCheckpointsEnabled() {
        final boolean cleanupConfigured = this.externalizedCheckpointCleanup != null;
        return cleanupConfigured;
    }

    /**
     * Tells whether a job recovery should fallback to checkpoint when there is a more recent
     * savepoint.
     *
     * @return <code>true</code> if a job recovery should fallback to checkpoint.
     * @deprecated Don't activate prefer checkpoints for recovery because it can lead to data loss
     *     and duplicate output. This option will soon be removed. See <a
     *     href="https://issues.apache.org/jira/browse/FLINK-20427">FLINK-20427</a> for more
     *     information.
     */
    @PublicEvolving
    @Deprecated
    public boolean isPreferCheckpointForRecovery() {
        return this.preferCheckpointForRecovery;
    }

    /**
     * Sets whether a job recovery should fallback to checkpoint when there is a more recent
     * savepoint.
     *
     * @param preferCheckpointForRecovery {@code true} to fallback to the checkpoint.
     * @deprecated Don't activate prefer checkpoints for recovery because it can lead to data loss
     *     and duplicate output. This option will soon be removed. See <a
     *     href="https://issues.apache.org/jira/browse/FLINK-20427">FLINK-20427</a> for more
     *     information.
     */
    @PublicEvolving
    @Deprecated
    public void setPreferCheckpointForRecovery(boolean preferCheckpointForRecovery) {
        final boolean prefer = preferCheckpointForRecovery;
        this.preferCheckpointForRecovery = prefer;
    }

    /**
     * Enables or disables unaligned checkpoints, which greatly reduce checkpointing times under
     * backpressure.
     *
     * <p>Unaligned checkpoints contain data stored in buffers as part of the checkpoint state,
     * which allows checkpoint barriers to overtake these buffers. Thus, the checkpoint duration
     * becomes independent of the current throughput as checkpoint barriers are effectively not
     * embedded into the stream of data anymore.
     *
     * <p>Unaligned checkpoints can only be enabled if {@link #checkpointingMode} is {@link
     * CheckpointingMode#EXACTLY_ONCE}. This method only stores the flag; it does not check the
     * mode itself.
     *
     * @param enabled Flag to indicate whether unaligned checkpoints are enabled.
     */
    @PublicEvolving
    public void enableUnalignedCheckpoints(boolean enabled) {
        this.unalignedCheckpointsEnabled = enabled;
    }

    /**
     * Enables unaligned checkpoints, which greatly reduce checkpointing times under backpressure.
     * Shorthand for {@link #enableUnalignedCheckpoints(boolean) enableUnalignedCheckpoints(true)}.
     *
     * <p>Unaligned checkpoints contain data stored in buffers as part of the checkpoint state,
     * which allows checkpoint barriers to overtake these buffers. Thus, the checkpoint duration
     * becomes independent of the current throughput as checkpoint barriers are effectively not
     * embedded into the stream of data anymore.
     *
     * <p>Unaligned checkpoints can only be enabled if {@link #checkpointingMode} is {@link
     * CheckpointingMode#EXACTLY_ONCE}.
     */
    @PublicEvolving
    public void enableUnalignedCheckpoints() {
        this.enableUnalignedCheckpoints(true);
    }

    /**
     * Tells whether unaligned checkpoints are enabled.
     *
     * @return <code>true</code> if unaligned checkpoints are enabled.
     */
    @PublicEvolving
    public boolean isUnalignedCheckpointsEnabled() {
        return this.unalignedCheckpointsEnabled;
    }

    /**
     * Sets the aligned checkpoint timeout. Only relevant if {@link #unalignedCheckpointsEnabled}
     * is enabled.
     *
     * <p>If {@link #alignedCheckpointTimeout} has value equal to <code>0</code>, checkpoints will
     * always start unaligned.
     *
     * <p>If {@link #alignedCheckpointTimeout} has value greater than <code>0</code>, checkpoints
     * will start aligned. If during checkpointing, checkpoint start delay exceeds this {@link
     * #alignedCheckpointTimeout}, alignment will timeout and checkpoint will start working as
     * unaligned checkpoint.
     *
     * @param alignmentTimeout the timeout to store; it is stored as given, without validation.
     * @deprecated Use {@link #setAlignedCheckpointTimeout(Duration)} instead.
     */
    @Deprecated
    @PublicEvolving
    public void setAlignmentTimeout(Duration alignmentTimeout) {
        final Duration timeout = alignmentTimeout;
        this.alignedCheckpointTimeout = timeout;
    }

    /**
     * Returns the alignment timeout.
     *
     * @return value of alignment timeout, as configured via {@link #setAlignmentTimeout(Duration)}
     *     or {@link ExecutionCheckpointingOptions#ALIGNMENT_TIMEOUT}.
     * @deprecated Use {@link #getAlignedCheckpointTimeout()} instead.
     */
    @Deprecated
    @PublicEvolving
    public Duration getAlignmentTimeout() {
        return this.alignedCheckpointTimeout;
    }

    /**
     * Returns the aligned checkpoint timeout.
     *
     * @return value of alignment timeout, as configured via {@link
     *     #setAlignedCheckpointTimeout(Duration)} or {@link
     *     ExecutionCheckpointingOptions#ALIGNED_CHECKPOINT_TIMEOUT}.
     */
    @PublicEvolving
    public Duration getAlignedCheckpointTimeout() {
        return this.alignedCheckpointTimeout;
    }

    /**
     * Sets the aligned checkpoint timeout. Only relevant if {@link #unalignedCheckpointsEnabled}
     * is enabled.
     *
     * <p>If {@link #alignedCheckpointTimeout} has value equal to <code>0</code>, checkpoints will
     * always start unaligned.
     *
     * <p>If {@link #alignedCheckpointTimeout} has value greater than <code>0</code>, checkpoints
     * will start aligned. If during checkpointing, checkpoint start delay exceeds this {@link
     * #alignedCheckpointTimeout}, alignment will timeout and checkpoint will start working as
     * unaligned checkpoint.
     *
     * @param alignedCheckpointTimeout the timeout to store; it is stored as given, without
     *     validation.
     */
    @PublicEvolving
    public void setAlignedCheckpointTimeout(Duration alignedCheckpointTimeout) {
        final Duration timeout = alignedCheckpointTimeout;
        this.alignedCheckpointTimeout = timeout;
    }

    /**
     * Tells whether approximate local recovery is enabled.
     *
     * @return <code>true</code> if approximate local recovery is enabled.
     */
    @Experimental
    public boolean isApproximateLocalRecoveryEnabled() {
        return this.approximateLocalRecovery;
    }

    /**
     * Enables or disables the approximate local recovery mode.
     *
     * <p>In this recovery mode, when a task fails, the entire downstream of the tasks (including
     * the failed task) restart.
     *
     * <p>Notice that:
     *
     * <ol>
     *   <li>Approximate recovery may lead to data loss. The amount of data which leads the failed
     *       task from the state of the last completed checkpoint to the state when the task fails
     *       is lost.
     *   <li>In the next version, restarting only the set of failed tasks will be supported. In
     *       this version, only downstream restarts are supported when a task fails.
     *   <li>It is only an internal feature for now.
     * </ol>
     *
     * @param enabled Flag to indicate whether approximate local recovery is enabled.
     */
    @Experimental
    public void enableApproximateLocalRecovery(boolean enabled) {
        this.approximateLocalRecovery = enabled;
    }

    /**
     * Returns the cleanup behaviour for externalized checkpoints.
     *
     * @return The cleanup behaviour for externalized checkpoints or <code>null</code> if none is
     *     configured.
     */
    @PublicEvolving
    public ExternalizedCheckpointCleanup getExternalizedCheckpointCleanup() {
        return this.externalizedCheckpointCleanup;
    }

    /**
     * Sets the checkpoint storage. CheckpointStorage defines how {@link StateBackend}'s checkpoint
     * their state for fault tolerance in streaming applications. Various implementations store
     * their checkpoints in different fashions and have different requirements and availability
     * guarantees.
     *
     * <p>For example, {@link org.apache.flink.runtime.state.storage.JobManagerCheckpointStorage
     * JobManagerCheckpointStorage} stores checkpoints in the memory of the JobManager. It is
     * lightweight and without additional dependencies but is not highly available and only supports
     * small state sizes. This checkpoint storage policy is convenient for local testing and
     * development.
     *
     * <p>{@link org.apache.flink.runtime.state.storage.FileSystemCheckpointStorage
     * FileSystemCheckpointStorage} stores checkpoints in a filesystem. For systems like HDFS, NFS
     * Drives, S3, and GCS, this storage policy supports large state size, in the magnitude of many
     * terabytes while providing a highly available foundation for stateful applications. This
     * checkpoint storage policy is recommended for most production deployments.
     *
     * @param storage The checkpoint storage policy; must not be {@code null}.
     * @throws NullPointerException if {@code storage} is {@code null}.
     */
    @PublicEvolving
    public void setCheckpointStorage(CheckpointStorage storage) {
        this.storage = Preconditions.checkNotNull(storage, "Checkpoint storage must not be null");
    }

    /**
     * Configures the application to write out checkpoint snapshots to the configured directory.
     * See {@link FileSystemCheckpointStorage} for more details on checkpointing to a file system.
     *
     * @param checkpointDirectory The path to write checkpoint metadata to; must not be {@code
     *     null}.
     * @throws NullPointerException if {@code checkpointDirectory} is {@code null}.
     * @see #setCheckpointStorage(CheckpointStorage)
     */
    @PublicEvolving
    public void setCheckpointStorage(String checkpointDirectory) {
        final String directory =
                Preconditions.checkNotNull(
                        checkpointDirectory, "Checkpoint directory must not be null");
        this.storage = new FileSystemCheckpointStorage(directory);
    }

    /**
     * Configures the application to write out checkpoint snapshots to the configured directory.
     * See {@link FileSystemCheckpointStorage} for more details on checkpointing to a file system.
     *
     * @param checkpointDirectory The path to write checkpoint metadata to; must not be {@code
     *     null}.
     * @throws NullPointerException if {@code checkpointDirectory} is {@code null}.
     * @see #setCheckpointStorage(CheckpointStorage)
     */
    @PublicEvolving
    public void setCheckpointStorage(URI checkpointDirectory) {
        final URI directory =
                Preconditions.checkNotNull(
                        checkpointDirectory, "Checkpoint directory must not be null");
        this.storage = new FileSystemCheckpointStorage(directory);
    }

    /**
     * Configures the application to write out checkpoint snapshots to the configured directory.
     * See {@link FileSystemCheckpointStorage} for more details on checkpointing to a file system.
     *
     * @param checkpointDirectory The path to write checkpoint metadata to; must not be {@code
     *     null}.
     * @throws NullPointerException if {@code checkpointDirectory} is {@code null}.
     * @see #setCheckpointStorage(String)
     */
    @PublicEvolving
    public void setCheckpointStorage(Path checkpointDirectory) {
        final Path directory =
                Preconditions.checkNotNull(
                        checkpointDirectory, "Checkpoint directory must not be null");
        this.storage = new FileSystemCheckpointStorage(directory);
    }

    /**
     * Returns the checkpoint storage configured for the job.
     *
     * @return The {@link CheckpointStorage} that has been configured for the job. Or {@code null}
     *     if none has been set.
     * @see #setCheckpointStorage(CheckpointStorage)
     */
    @Nullable
    @PublicEvolving
    public CheckpointStorage getCheckpointStorage() {
        final CheckpointStorage configuredStorage = this.storage;
        return configuredStorage;
    }

    /**
     * Setup the checkpoint id for which the in-flight data will be ignored for all operators in
     * case of the recovery from this checkpoint.
     *
     * @param checkpointIdOfIgnoredInFlightData Checkpoint id for which in-flight data should be
     *     ignored.
     * @see #getCheckpointIdOfIgnoredInFlightData
     */
    @PublicEvolving
    public void setCheckpointIdOfIgnoredInFlightData(long checkpointIdOfIgnoredInFlightData) {
        this.checkpointIdOfIgnoredInFlightData = checkpointIdOfIgnoredInFlightData;
    }

    /**
     * Returns the checkpoint id that was configured via {@link
     * #setCheckpointIdOfIgnoredInFlightData}.
     *
     * @return Checkpoint id for which in-flight data should be ignored.
     * @see #setCheckpointIdOfIgnoredInFlightData
     */
    @PublicEvolving
    public long getCheckpointIdOfIgnoredInFlightData() {
        return checkpointIdOfIgnoredInFlightData;
    }

    /** Cleanup behaviour for externalized checkpoints when the job is cancelled. */
    @PublicEvolving
    public enum ExternalizedCheckpointCleanup {

        /**
         * Delete externalized checkpoints on job cancellation.
         *
         * <p>Cancelling the owning job removes all checkpoint state, i.e. both the meta data and
         * the actual program state. As a consequence, the job cannot be resumed from an
         * externalized checkpoint once it has been cancelled.
         *
         * <p>Note that checkpoint state is always kept if the job terminates with state {@link
         * JobStatus#FAILED}.
         */
        DELETE_ON_CANCELLATION(true),

        /**
         * Retain externalized checkpoints on job cancellation.
         *
         * <p>Cancelling the owning job keeps all checkpoint state. Both the checkpoint meta data
         * and the actual program state have to be deleted manually after the job was cancelled.
         *
         * <p>Note that checkpoint state is always kept if the job terminates with state {@link
         * JobStatus#FAILED}.
         */
        RETAIN_ON_CANCELLATION(false);

        /** Flag carried by each constant: {@code true} means "discard on cancellation". */
        private final boolean discardOnCancel;

        ExternalizedCheckpointCleanup(boolean discardOnCancel) {
            this.discardOnCancel = discardOnCancel;
        }

        /**
         * Returns whether persistent checkpoints shall be discarded on cancellation of the job.
         *
         * @return <code>true</code> if persistent checkpoints shall be discarded on cancellation of
         *     the job.
         */
        public boolean deleteOnCancellation() {
            return discardOnCancel;
        }
    }

    /**
     * Applies every checkpointing-related option found in the given {@link ReadableConfig}, such
     * as e.g. {@link ExecutionCheckpointingOptions#CHECKPOINTING_MODE}, to this object.
     *
     * <p>A setting is only changed if the corresponding option is present in the {@code
     * configuration}. For keys that are absent, the current value of the field is left untouched.
     *
     * @param configuration a configuration to read the values from
     */
    public void configure(ReadableConfig configuration) {
        // Mode and timing. Duration-valued options are converted to milliseconds before they are
        // passed on to the corresponding setters.
        configuration
                .getOptional(ExecutionCheckpointingOptions.CHECKPOINTING_MODE)
                .ifPresent(this::setCheckpointingMode);
        configuration
                .getOptional(ExecutionCheckpointingOptions.CHECKPOINTING_INTERVAL)
                .map(Duration::toMillis)
                .ifPresent(this::setCheckpointInterval);
        configuration
                .getOptional(ExecutionCheckpointingOptions.CHECKPOINTING_TIMEOUT)
                .map(Duration::toMillis)
                .ifPresent(this::setCheckpointTimeout);
        configuration
                .getOptional(ExecutionCheckpointingOptions.MAX_CONCURRENT_CHECKPOINTS)
                .ifPresent(this::setMaxConcurrentCheckpoints);
        configuration
                .getOptional(ExecutionCheckpointingOptions.MIN_PAUSE_BETWEEN_CHECKPOINTS)
                .map(Duration::toMillis)
                .ifPresent(this::setMinPauseBetweenCheckpoints);

        // Recovery preference, failure tolerance and externalized checkpoints.
        configuration
                .getOptional(ExecutionCheckpointingOptions.PREFER_CHECKPOINT_FOR_RECOVERY)
                .ifPresent(this::setPreferCheckpointForRecovery);
        configuration
                .getOptional(ExecutionCheckpointingOptions.TOLERABLE_FAILURE_NUMBER)
                .ifPresent(this::setTolerableCheckpointFailureNumber);
        configuration
                .getOptional(ExecutionCheckpointingOptions.EXTERNALIZED_CHECKPOINT)
                .ifPresent(this::enableExternalizedCheckpoints);

        // Unaligned checkpoints and in-flight data handling.
        configuration
                .getOptional(ExecutionCheckpointingOptions.ENABLE_UNALIGNED)
                .ifPresent(this::enableUnalignedCheckpoints);
        configuration
                .getOptional(ExecutionCheckpointingOptions.CHECKPOINT_ID_OF_IGNORED_IN_FLIGHT_DATA)
                .ifPresent(this::setCheckpointIdOfIgnoredInFlightData);
        configuration
                .getOptional(ExecutionCheckpointingOptions.ALIGNED_CHECKPOINT_TIMEOUT)
                .ifPresent(this::setAlignedCheckpointTimeout);
        configuration
                .getOptional(ExecutionCheckpointingOptions.FORCE_UNALIGNED)
                .ifPresent(this::setForceUnalignedCheckpoints);
    }
}


总结

因为这部分的很多配置前面已经讲过了,所以这里就不再一一拿出来单独说明了,大家浏览一遍源码,加深一下印象即可。

  • 1
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值