When a job finishes, rdd.doCheckpoint() is called to checkpoint the RDD.
// Hand the job to the DAG scheduler, passing the RDD, the cleaned function, the target
// partitions, the call site, the result handler and the current local properties.
dagScheduler.runJob(rdd, cleanedFunc, partitions, callSite, resultHandler, localProperties.get)
// If a progress bar is present, tell it to finish everything it is showing.
progressBar.foreach(_.finishAll())
// Once the job has run, trigger checkpointing for this RDD.
rdd.doCheckpoint()
RDD.checkpoint()
/**
 * Request that this RDD be checkpointed. Its data will be written to a file under the
 * directory configured through `SparkContext#setCheckpointDir`, after which every
 * reference to its parent RDDs is dropped. Call this before any job has been run on
 * the RDD. Persisting the RDD in memory first is strongly recommended; otherwise
 * writing the checkpoint file requires recomputing it.
 */
def checkpoint(): Unit = RDDCheckpointData.synchronized {
  // NOTE: a global lock is taken here because of downstream complexity in making sure
  // child RDD partitions point at the correct parent partitions. Worth revisiting later.
  if (context.checkpointDir.isEmpty) {
    throw new SparkException("Checkpoint directory has not been set in the SparkContext")
  }
  // Install checkpoint data only if none has been set yet; otherwise leave it untouched.
  if (checkpointData.isEmpty) {
    checkpointData = Some(new ReliableRDDCheckpointData(this))
  }
}
CheckpointState
/**
 * The states an RDD passes through while being checkpointed.
 *
 * Values are declared in lifecycle order:
 * Initialized, then CheckpointingInProgress, then Checkpointed.
 */
private[spark] object CheckpointState extends Enumeration {
  type CheckpointState = Value

  val Initialized = Value
  val CheckpointingInProgress = Value
  val Checkpointed = Value
}
RDDCheckpointData Definition
/**
* This class contains all the information related to RDD checkpointing. Each instance of this
* class is associated with an RDD. It manages the process of checkpointing the associated RDD,
* and also manages the post-checkpoint state by providing the updated partitions,
* iterator and preferred locations of the checkpointed RDD.
*/
private[spark] abstract class RDDCheckpointData[T: ClassTag](@transient private val rdd: RDD[T])
extends Serializable {
import CheckpointState._
import CheckpointState._
// The checkpoint state of the associated RDD. Starts as Initialized; checkpoint() moves it
// to CheckpointingInProgress and then to Checkpointed.
protected var cpState = Initialized
// The RDD that contains our checkpointed data. Stays None until checkpoint() stores the
// RDD returned by doCheckpoint().
private var cpRDD: Option[CheckpointRDD[T]] = None
isCheckpointed
// TODO: are we sure we need to use a global lock in the following methods?
/**
 * Tell whether the checkpoint data for this RDD has already been persisted,
 * i.e. whether the state has reached `Checkpointed`.
 */
def isCheckpointed: Boolean = {
  // Read the state under the global lock, then compare the snapshot.
  val current = RDDCheckpointData.synchronized { cpState }
  current == Checkpointed
}
checkpoint
/**
 * Materialize this RDD and persist its content.
 * Invoked right after the first action on this RDD has completed.
 */
final def checkpoint(): Unit = {
  // Only one caller may move the state out of Initialized; the check and the flip happen
  // atomically under the lock so concurrent threads cannot both start checkpointing.
  val claimed = RDDCheckpointData.synchronized {
    val wasInitialized = cpState == Initialized
    if (wasInitialized) {
      cpState = CheckpointingInProgress
    }
    wasInitialized
  }

  if (claimed) {
    // The actual checkpointing work runs outside the lock.
    val materialized = doCheckpoint()

    // Publish the result, mark the state as done and truncate the RDD lineage.
    RDDCheckpointData.synchronized {
      cpRDD = Some(materialized)
      cpState = Checkpointed
      rdd.markCheckpointed()
    }
  }
}
doCheckpoint() is an abstract method.
/**
 * Materialize this RDD and persist its content.
 *
 * Subclasses should override this method to define custom checkpointing behavior.
 * `checkpoint()` calls it outside the `RDDCheckpointData` lock, after the state has been
 * moved to `CheckpointingInProgress`, and stores the returned RDD as the checkpoint RDD.
 *
 * @return the checkpoint RDD created in the process.
 */
protected def doCheckpoint(): CheckpointRDD[T]
checkpointRDD
/**
 * The RDD holding our checkpointed data, if any.
 * It is only defined once the checkpoint state is `Checkpointed`.
 */
def checkpointRDD: Option[CheckpointRDD[T]] = {
  RDDCheckpointData.synchronized {
    cpRDD
  }
}
getPartitions
/**
 * The partitions of the resulting checkpoint RDD, or an empty array when there is
 * no checkpoint RDD yet.
 * For tests only.
 */
def getPartitions: Array[Partition] = RDDCheckpointData.synchronized {
  cpRDD match {
    case Some(checkpointed) => checkpointed.partitions
    case None => Array.empty[Partition]
  }
}
ReliableRDDCheckpointData
/**
* An implementation of checkpointing that writes the RDD data to reliable storage.
* This allows drivers to be restarted on failure with previously computed state.
*/
private[spark] class ReliableRDDCheckpointData[T: ClassTag](@transient private val rdd: RDD[T])
extends RDDCheckpointData[T](rdd) with Logging {
checkpoint dir
// Directory the associated RDD is checkpointed to. It is assumed to be a non-local
// path pointing at reliable storage. Construction fails if no checkpoint path exists.
private val cpDir: String =
  ReliableRDDCheckpointData.checkpointPath(rdd.context, rdd.id) match {
    case Some(location) => location.toString
    case None => throw new SparkException("Checkpoint dir must be specified.")
  }
getCheckpointDir
/**
 * The directory this RDD was checkpointed to, or None while the RDD has not been
 * checkpointed yet.
 */
def getCheckpointDir: Option[String] = RDDCheckpointData.synchronized {
  if (!isCheckpointed) None else Some(cpDir.toString)
}
object ReliableRDDCheckpointData
private[spark] object ReliableRDDCheckpointData extends Logging {

  /**
   * Build the path of the directory this RDD's checkpoint data is written to:
   * an `rdd-<id>` entry under the context's checkpoint directory, or None when the
   * context has no checkpoint directory.
   */
  def checkpointPath(sc: SparkContext, rddId: Int): Option[Path] =
    for (baseDir <- sc.checkpointDir) yield new Path(baseDir, s"rdd-$rddId")

  /**
   * Remove the files associated with this RDD's checkpoint data. Does nothing when
   * no checkpoint path can be derived for the RDD.
   */
  def cleanCheckpoint(sc: SparkContext, rddId: Int): Unit = {
    for (target <- checkpointPath(sc, rddId)) {
      val fs = target.getFileSystem(sc.hadoopConfiguration)
      // Recursive delete of the whole checkpoint directory.
      fs.delete(target, true)
    }
  }
}