- Number of stages = 1 + number of shuffle dependencies
- Number of tasks in a stage = number of partitions of the last RDD in that stage
- Number of jobs = number of times an action operator is called in a Spark application
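A minimal sketch to make the three rules concrete (assuming a local SparkContext; the input path and the partition count are placeholders):
    import org.apache.spark.{SparkConf, SparkContext}

    val sc = new SparkContext(new SparkConf().setAppName("stage-count-demo").setMaster("local[*]"))
    val words = sc.textFile("input", 4)  // assume this yields 4 partitions
    val counts = words.flatMap(_.split(" ")).map((_, 1)).reduceByKey(_ + _)  // 1 shuffle dependency
    counts.collect()  // action #1 --> Job 0: 1 + 1 = 2 stages (ShuffleMapStage + ResultStage)
    counts.count()    // action #2 --> Job 1
    // the ShuffleMapStage runs 4 tasks: one per partition of the last RDD in that stage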
Submitting a Spark job: an action operator (a method on RDD) is called --> it calls SparkContext's runJob --> which calls dagScheduler.runJob
--> DAGScheduler.handleJobSubmitted
    generates the finalStage:
    finalStage = createResultStage()
    submitStage(finalStage) // this is where "one partition = one task" materializes; the core code is partitionsToCompute.map
--> inside createResultStage(), getOrCreateParentStages() finds every ShuffleDependency (i.e. every wide dependency) and maps over them, creating one stage per shuffle dependency. In other words, stages are cut at shuffle boundaries.
(1) rdd.foreach()
(2) RDD // the RDD class
    def foreach(f: T => Unit): Unit = withScope { // the action method
      val cleanF = sc.clean(f) // clean the closure so it can be serialized to executors
      sc.runJob(this, (iter: Iterator[T]) => iter.foreach(cleanF))
    }
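A quick hedged check that this path runs the function once per partition (reusing the sc from the sketch above; the numbers are illustrative):
    val nums = sc.parallelize(1 to 8, 4)  // 4 partitions
    nums.foreach(println)                 // one job, one ResultStage, 4 ResultTasks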
(3) SparkContext // the SparkContext class: the runJob overloads delegate down this chain
    runJob(rdd, func, 0 until rdd.partitions.length)
    runJob(rdd, (ctx: TaskContext, it: Iterator[T]) => cleanedFunc(it), partitions)
    runJob[T, U](rdd, func, partitions, (index, res) => results(index) = res)
    // the innermost overload finally hands the job to the DAGScheduler
    dagScheduler.runJob(rdd, cleanedFunc, partitions, callSite, resultHandler, localProperties.get)
(4) DAGScheduler
    val waiter = submitJob(rdd, func, partitions, callSite, resultHandler, properties)
    def submitJob(...) = {
      val maxPartitions = rdd.partitions.length
      val jobId = nextJobId.getAndIncrement()
      // asynchronous hand-off: post a JobSubmitted event to the scheduler's event loop
      eventProcessLoop.post(JobSubmitted(
        jobId, rdd, func2, partitions.toArray, callSite, waiter,
        SerializationUtils.clone(properties)))
    }
    // in DAGSchedulerEventProcessLoop: the event-loop thread dispatches each event back into the DAGScheduler
    private def doOnReceive(event: DAGSchedulerEvent): Unit = event match {
      case JobSubmitted(jobId, rdd, func, partitions, callSite, listener, properties) =>
        dagScheduler.handleJobSubmitted(jobId, rdd, func, partitions, callSite, listener, properties)
    }
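For intuition, a minimal self-contained sketch of the post/receive pattern behind eventProcessLoop (toy types, not Spark's real EventLoop, which also handles errors and lifecycle):
    import java.util.concurrent.LinkedBlockingDeque

    sealed trait Event
    final case class JobSubmittedEvent(jobId: Int) extends Event

    // callers post() events onto a queue; one dedicated thread drains the queue and
    // dispatches each event to the handler, so submitJob returns without blocking
    class SimpleEventLoop(handle: Event => Unit) {
      private val queue = new LinkedBlockingDeque[Event]()
      private val thread = new Thread("simple-event-loop") {
        override def run(): Unit =
          try while (true) handle(queue.take())
          catch { case _: InterruptedException => () } // stop() interrupts take()
      }
      def start(): Unit = thread.start()
      def post(event: Event): Unit = queue.put(event)
      def stop(): Unit = thread.interrupt()
    }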
private[scheduler] def handleJobSubmitted(...) {
  var finalStage: ResultStage = null
  try {
    // New stage creation may throw an exception if, for example, jobs are run on a
    // HadoopRDD whose underlying HDFS files have been deleted.
    finalStage = createResultStage(finalRDD, func, partitions, jobId, callSite)
  }
  // create the job
  val job = new ActiveJob(jobId, finalStage, callSite, listener, properties)
  // submit the final stage (its missing parents are submitted recursively inside)
  submitStage(finalStage)
}
private def createResultStage(...): ResultStage = {
  // check for shuffles: if there are any, create the parent stages first
  val parents = getOrCreateParentStages(rdd, jobId)
  val id = nextStageId.getAndIncrement()
  // shuffle or not, there is always exactly one ResultStage
  val stage = new ResultStage(id, rdd, func, partitions, parents, jobId, callSite)
  stageIdToStage(id) = stage
  updateJobIdStageIdMaps(jobId, stage)
  stage
}
private def getOrCreateParentStages(rdd: RDD[_], firstJobId: Int): List[Stage] = {
  // map over every shuffle dependency: one ShuffleMapStage is created per shuffle
  getShuffleDependencies(rdd).map { shuffleDep =>
    getOrCreateShuffleMapStage(shuffleDep, firstJobId)
  }.toList
}
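Hedged example (reusing the sc from earlier): when the final RDD has two direct shuffle dependencies, the ResultStage gets two parent ShuffleMapStages:
    val a = sc.parallelize(Seq(1 -> "x", 2 -> "y"), 2).reduceByKey(_ + _)  // shuffle #1
    val b = sc.parallelize(Seq(1 -> "u", 2 -> "v"), 2).reduceByKey(_ + _)  // shuffle #2
    a.join(b).collect()  // typically 1 + 2 = 3 stages: the join itself adds no shuffle here
                         // because both sides are already hash-partitioned the same way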
private[scheduler] def getShuffleDependencies(
    rdd: RDD[_]): HashSet[ShuffleDependency[_, _, _]] = {
  val parents = new HashSet[ShuffleDependency[_, _, _]]
  val visited = new HashSet[RDD[_]]
  val waitingForVisit = new Stack[RDD[_]]
  // push the starting rdd onto the stack
  waitingForVisit.push(rdd)
  // drain the stack: a depth-first walk up the lineage
  while (waitingForVisit.nonEmpty) {
    val toVisit = waitingForVisit.pop()
    // process each RDD at most once
    if (!visited(toVisit)) {
      visited += toVisit
      // examine every dependency of this rdd
      toVisit.dependencies.foreach {
        case shuffleDep: ShuffleDependency[_, _, _] =>
          // a shuffle (wide) dependency marks a stage boundary: record it, don't go further up
          parents += shuffleDep
        case dependency =>
          // a narrow dependency: keep walking up the lineage
          waitingForVisit.push(dependency.rdd)
      }
    }
  }
  // returns only the *direct* (nearest) shuffle dependencies of the given rdd
  parents
}
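A hedged, self-contained model of this traversal (Node, ShuffleEdge, and NarrowEdge are toy stand-ins for RDD and Dependency) shows why only the nearest shuffle dependencies come back: a shuffle edge is recorded and not descended, while narrow edges are walked through:
    import scala.collection.mutable

    case class Node(name: String, deps: Seq[Edge] = Nil)
    sealed trait Edge { def parent: Node }
    final case class ShuffleEdge(parent: Node) extends Edge
    final case class NarrowEdge(parent: Node) extends Edge

    def directShuffleParents(root: Node): Set[Node] = {
      val parents = mutable.Set[Node]()
      val visited = mutable.Set[Node]()
      val stack = mutable.Stack[Node](root)
      while (stack.nonEmpty) {
        val n = stack.pop()
        if (visited.add(n)) {
          n.deps.foreach {
            case ShuffleEdge(p) => parents += p   // stage boundary: record it, stop descending
            case NarrowEdge(p)  => stack.push(p)  // narrow dependency: keep climbing the lineage
          }
        }
      }
      parents.toSet
    }

    // ancestor <-shuffle- base <-shuffle- shuffled <-narrow- mapped
    val ancestor = Node("ancestor")
    val base     = Node("base",     Seq(ShuffleEdge(ancestor)))
    val shuffled = Node("shuffled", Seq(ShuffleEdge(base)))
    val mapped   = Node("mapped",   Seq(NarrowEdge(shuffled)))
    assert(directShuffleParents(mapped) == Set(base))  // ancestor is NOT returned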
private def submitStage(stage: Stage) {
  // submit any missing parent stages first; once every parent is available,
  // hand this stage's own tasks to submitMissingTasks
  ......
  submitMissingTasks(stage, jobId.get)
}
private def submitMissingTasks(stage: Stage, jobId: Int) {
  ......
  val tasks: Seq[Task[_]] = try {
    stage match {
      case stage: ShuffleMapStage =>
        partitionsToCompute.map { id => // one new ShuffleMapTask per partition: a partition IS a task
          val locs = taskIdToLocations(id)
          val part = stage.rdd.partitions(id)
          new ShuffleMapTask(stage.id, stage.latestInfo.attemptId,
            taskBinary, part, locs, stage.latestInfo.taskMetrics, properties, Option(jobId),
            Option(sc.applicationId), sc.applicationAttemptId)
        }
      case stage: ResultStage =>
        partitionsToCompute.map { id => // likewise, one ResultTask per partition of the final RDD
          val p: Int = stage.partitions(id)
          val part = stage.rdd.partitions(p)
          val locs = taskIdToLocations(id)
          new ResultTask(stage.id, stage.latestInfo.attemptId,
            taskBinary, part, locs, id, properties, stage.latestInfo.taskMetrics,
            Option(jobId), Option(sc.applicationId), sc.applicationAttemptId)
        }
    }
  }
  ......
  // submit the whole batch of tasks to the TaskScheduler as one TaskSet
  taskScheduler.submitTasks(new TaskSet(tasks.toArray, stage.id, stage.latestInfo.attemptId, jobId, properties))
}