/**
 * A stage is a set of parallel tasks all computing the same function that need to run as part
 * of a Spark job, where all the tasks have the same shuffle dependencies. Each DAG of tasks run
 * by the scheduler is split up into stages at the boundaries where shuffle occurs, and then the
 * DAGScheduler runs these stages in topological order.
 *
 * Each Stage can either be a shuffle map stage, in which case its tasks' results are input for
 * other stage(s), or a result stage, in which case its tasks directly compute a Spark action
 * (e.g. count(), save(), etc) by running a function on an RDD. For shuffle map stages, we also
 * track the nodes that each output partition is on.
 *
 * Each Stage also has a firstJobId, identifying the job that first submitted the stage. When FIFO
 * scheduling is used, this allows Stages from earlier jobs to be computed first or recovered
 * faster on failure.
 *
 * Finally, a single stage can be re-executed in multiple attempts due to fault recovery. In that
 * case, the Stage object will track multiple StageInfo objects to pass to listeners or the web UI.
 * The latest one will be accessible through latestInfo.
 *
 * Two stages are considered equal when they have the same `id`; `hashCode` is the `id` itself.
 *
 * @param id Unique stage ID
 * @param rdd RDD that this stage runs on: for a shuffle map stage, it's the RDD we run map tasks
 *   on, while for a result stage, it's the target RDD that we ran an action on
 * @param numTasks Total number of tasks in stage; result stages in particular may not need to
 *   compute all partitions, e.g. for first(), lookup(), and take().
 * @param parents List of stages that this stage depends on (through shuffle dependencies).
 * @param firstJobId ID of the first job this stage was part of, for FIFO scheduling.
 * @param callSite Location in the user program associated with this stage: either where the target
 *   RDD was created, for a shuffle map stage, or where the action for a result stage was called.
 */
private[scheduler] abstract class Stage(
    val id: Int,
    val rdd: RDD[_],
    val numTasks: Int,
    val parents: List[Stage],
    val firstJobId: Int,
    val callSite: CallSite)
  extends Logging {

  /** Number of partitions of the RDD this stage runs on. */
  val numPartitions: Int = rdd.partitions.length

  /** Set of jobs that this stage belongs to. */
  val jobIds: HashSet[Int] = new HashSet[Int]

  /**
   * Mutable set of partition ids. It is never written to inside this class.
   * NOTE(review): presumably maintained by the scheduler to track partitions whose tasks are
   * still outstanding -- confirm against the callers.
   */
  val pendingPartitions: HashSet[Int] = new HashSet[Int]

  /** The ID to use for the next new attempt for this stage. */
  private var nextAttemptId: Int = 0

  /** Short form of the call site, used as the stage name. */
  val name: String = callSite.shortForm

  /** Long form of the call site. */
  val details: String = callSite.longForm

  /**
   * Pointer to the [[StageInfo]] object for the most recent attempt. This needs to be initialized
   * here, before any attempts have actually been created, because the DAGScheduler uses this
   * StageInfo to tell SparkListeners when a job starts (which happens before any stage attempts
   * have been created).
   *
   * `this` is handed to `StageInfo.fromStage` while the constructor is still running, so this
   * field must stay declared after the fields above: only fields declared earlier are guaranteed
   * to be initialized at this point.
   */
  private var _latestInfo: StageInfo = StageInfo.fromStage(this, nextAttemptId)

  /**
   * Set of stage attempt IDs that have failed with a FetchFailure. We keep track of these
   * failures in order to avoid endless retries if a stage keeps failing with a FetchFailure.
   * We keep track of each attempt ID that has failed to avoid recording duplicate failures if
   * multiple tasks from the same stage attempt fail (SPARK-5945).
   */
  private val fetchFailedAttemptIds = new HashSet[Int]

  /** Forgets every fetch-failed attempt recorded so far. */
  private[scheduler] def clearFailures(): Unit = {
    fetchFailedAttemptIds.clear()
  }

  /**
   * Check whether we should abort the failedStage due to multiple consecutive fetch failures.
   *
   * This method records `stageAttemptId` in the running set of failed stage attempts (recording
   * the same attempt twice has no additional effect) and then compares the size of that set with
   * the limit.
   *
   * @param stageAttemptId ID of the stage attempt that hit a fetch failure
   * @return true once the number of distinct failed attempts recorded since the last
   *   `clearFailures()` has reached `Stage.MAX_CONSECUTIVE_FETCH_FAILURES` (the check is `>=`)
   */
  private[scheduler] def failedOnFetchAndShouldAbort(stageAttemptId: Int): Boolean = {
    fetchFailedAttemptIds.add(stageAttemptId)
    fetchFailedAttemptIds.size >= Stage.MAX_CONSECUTIVE_FETCH_FAILURES
  }

  /**
   * Creates a new attempt for this stage by creating a new StageInfo with a new attempt ID.
   *
   * A fresh TaskMetrics is created and registered with the RDD's SparkContext for every attempt,
   * and the attempt counter is advanced afterwards.
   *
   * @param numPartitionsToCompute number of partitions this attempt will compute
   * @param taskLocalityPreferences locality preferences forwarded to the new StageInfo
   */
  def makeNewStageAttempt(
      numPartitionsToCompute: Int,
      taskLocalityPreferences: Seq[Seq[TaskLocation]] = Seq.empty): Unit = {
    val metrics = new TaskMetrics
    metrics.register(rdd.sparkContext)
    _latestInfo = StageInfo.fromStage(
      this, nextAttemptId, Some(numPartitionsToCompute), metrics, taskLocalityPreferences)
    nextAttemptId += 1
  }

  /** Returns the StageInfo for the most recent attempt for this stage. */
  def latestInfo: StageInfo = _latestInfo

  /** The stage ID doubles as the hash code, consistent with `equals`. */
  override final def hashCode(): Int = id

  /** Stages are equal exactly when the other object is a Stage with the same ID. */
  override final def equals(other: Any): Boolean = other match {
    // A typed pattern never matches null, so no explicit null check is needed.
    case stage: Stage => stage.id == id
    case _ => false
  }

  /** Returns the sequence of partition ids that are missing (i.e. need to be computed). */
  def findMissingPartitions(): Seq[Int]
}
private[scheduler] object Stage {
  // The number of consecutive failures allowed before a stage is aborted.
  // Stage.failedOnFetchAndShouldAbort compares against this with `>=`, so the abort is signalled
  // as soon as this many distinct attempts have been recorded as failed.
  val MAX_CONSECUTIVE_FETCH_FAILURES = 4
}
ShuffleMapStage
/**
 * ShuffleMapStages are intermediate stages in the execution DAG that produce data for a shuffle.
 * They occur right before each shuffle operation, and might contain multiple pipelined operations
 * before that (e.g. map and filter). When executed, they save map output files that can later be
 * fetched by reduce tasks. The `shuffleDep` field describes the shuffle each stage is part of,
 * and variables like `outputLocs` and `numAvailableOutputs` track how many map outputs are ready.
 *
 * ShuffleMapStages can also be submitted independently as jobs with DAGScheduler.submitMapStage.
 * For such stages, the ActiveJobs that submitted them are tracked in `mapStageJobs`. Note that
 * there can be multiple ActiveJobs trying to compute the same shuffle map stage.
 */
private[spark] class ShuffleMapStage(
    id: Int,
    rdd: RDD[_],
    numTasks: Int,
    parents: List[Stage],
    firstJobId: Int,
    callSite: CallSite,
    val shuffleDep: ShuffleDependency[_, _, _])
  extends Stage(id, rdd, numTasks, parents, firstJobId, callSite) {

  private[this] var _mapStageJobs: List[ActiveJob] = Nil

  private[this] var _numAvailableOutputs: Int = 0

  /**
   * List of [[MapStatus]] for each partition. The index of the array is the map partition id,
   * and each value in the array is the list of possible [[MapStatus]] for a partition
   * (a single task might run multiple times).
   */
  private[this] val outputLocs = Array.fill[List[MapStatus]](numPartitions)(Nil)

  override def toString: String = "ShuffleMapStage " + id

  /**
   * Returns the list of active jobs,
   * i.e. map-stage jobs that were submitted to execute this stage independently (if any).
   */
  def mapStageJobs: Seq[ActiveJob] = _mapStageJobs

  /** Adds the job to the active job list. */
  def addActiveJob(job: ActiveJob): Unit = {
    _mapStageJobs = job :: _mapStageJobs
  }

  /** Removes the job from the active job list. */
  def removeActiveJob(job: ActiveJob): Unit = {
    _mapStageJobs = _mapStageJobs.filter(_ != job)
  }

  /**
   * Number of partitions that have shuffle outputs.
   * When this reaches [[numPartitions]], this map stage is ready.
   * This should be kept consistent as `outputLocs.count(_.nonEmpty)`.
   */
  def numAvailableOutputs: Int = _numAvailableOutputs

  /**
   * Returns true if the map stage is ready, i.e. all partitions have shuffle outputs.
   * This should be the same as `!outputLocs.contains(Nil)`.
   */
  def isAvailable: Boolean = _numAvailableOutputs == numPartitions

  /**
   * Returns the sequence of partition ids that are missing (i.e. need to be computed).
   *
   * Asserts that the number of partitions without any recorded output agrees with the
   * `_numAvailableOutputs` counter.
   */
  override def findMissingPartitions(): Seq[Int] = {
    val missing = (0 until numPartitions).filter(id => outputLocs(id).isEmpty)
    assert(missing.size == numPartitions - _numAvailableOutputs,
      s"${missing.size} missing, expected ${numPartitions - _numAvailableOutputs}")
    missing
  }

  /**
   * Records `status` as an output for `partition` by prepending it to that partition's list.
   * The available-output counter is incremented only when the partition had no output before.
   */
  def addOutputLoc(partition: Int, status: MapStatus): Unit = {
    val prevList = outputLocs(partition)
    outputLocs(partition) = status :: prevList
    if (prevList.isEmpty) {
      _numAvailableOutputs += 1
    }
  }

  /**
   * Removes every output of `partition` whose location equals `bmAddress`, decrementing the
   * available-output counter if the partition is left without any output.
   */
  def removeOutputLoc(partition: Int, bmAddress: BlockManagerId): Unit = {
    pruneOutputLocs(partition)(_.location == bmAddress)
  }

  /**
   * Returns an array of [[MapStatus]] (index by partition id). For each partition, the returned
   * value contains only one (i.e. the first) [[MapStatus]]. If there is no entry for the partition,
   * that position is filled with null.
   */
  def outputLocInMapOutputTrackerFormat(): Array[MapStatus] = {
    outputLocs.map(_.headOption.orNull)
  }

  /**
   * Removes all shuffle outputs associated with this executor. Note that this will also remove
   * outputs which are served by an external shuffle server (if one exists), as they are still
   * registered with this execId.
   *
   * Logs a single message if at least one partition lost its last output.
   */
  def removeOutputsOnExecutor(execId: String): Unit = {
    // `count` visits every partition (no short-circuit), so all of them are pruned.
    val lostPartitions = (0 until numPartitions).count { partition =>
      pruneOutputLocs(partition)(_.location.executorId == execId)
    }
    if (lostPartitions > 0) {
      logInfo("%s is now unavailable on executor %s (%d/%d, %s)".format(
        this, execId, _numAvailableOutputs, numPartitions, isAvailable))
    }
  }

  /**
   * Drops the statuses of `partition` that satisfy `shouldRemove` and keeps the
   * available-output counter in sync.
   *
   * @return true if the partition had at least one output before and has none now
   */
  private[this] def pruneOutputLocs(partition: Int)(shouldRemove: MapStatus => Boolean): Boolean = {
    val prevList = outputLocs(partition)
    val newList = prevList.filterNot(shouldRemove)
    outputLocs(partition) = newList
    val becameUnavailable = prevList.nonEmpty && newList.isEmpty
    if (becameUnavailable) {
      _numAvailableOutputs -= 1
    }
    becameUnavailable
  }
}
Spark 2.1: Stage, ResultStage, and ShuffleMapStage
Stage/** * A stage is a set of parallel tasks all computing the same function that need to run as part * of a Spark job, where all the tasks have the same shuffle dependencies. Each DAG of tasks run