要理解DAGScheduler,首先就得了解RDD的生命周期。RDD是什么?且看它的全称 Resilient Distributed Datasets,弹性分布式数据集。没错,RDD是一种数据结构,这种数据结构自带了很多方法,这些方法可分为两种:transformation 和 action。在这两种操作中,只有action操作会触发job。且看常用的action操作有哪些:collect()、count()、reduce()、take()、foreach()、saveAsTextFile() 等。
所有直接实现的action操作都会触发job(注:有些算子是调用其它算子实现的,如first()算子是调用take()算子实现的),具体代码为:
sc.runJob(this, func,....)
runJob()是SparkContext中的方法,而这个方法最终调用了DAGScheduler中的runJob方法,具体代码为:
dagScheduler.runJob(rdd, cleanedFunc, partitions, callSite, resultHandler, localProperties.get)
DAGScheduler中的runJob()会调用submitJob():
submitJob(rdd, func, partitions, callSite, resultHandler, properties)
而submitJob() 会把job提交到eventProcessLoop线程:
eventProcessLoop.post(JobSubmitted(jobId, rdd, func2, partitions.toArray, callSite, waiter,SerializationUtils.clone(properties)))
类DAGSchedulerEventProcessLoop有一个监听方法onReceive,这个方法会调用doOnReceive方法处理各种case,其代码如下:
// Dispatches a DAGSchedulerEvent to the matching handler (abridged excerpt).
def doOnReceive(event: DAGSchedulerEvent): Unit = event match {
// A JobSubmitted event is forwarded to dagScheduler.handleJobSubmitted;
// the arguments are elided in this excerpt.
case JobSubmitted(...) =>
dagScheduler.handleJobSubmitted(...)
......// other, unrelated cases are omitted here
}
而handleJobSubmitted()方法,首先创建finalStage,最终调用submitStage方法提交创建好的finalStage,其主要代码如下:
// Handles a submitted job (abridged excerpt; parameters and some code are elided):
// builds the final stage, wraps it in an ActiveJob, and submits the stage.
def handleJobSubmitted(...):{
  // Create the final stage for this job.
  finalStage = newResultStage(
    finalRDD, func, partitions, jobId,callSite)
  // Create the job that owns the final stage.
  val job = new ActiveJob(jobId, finalStage, callSite, listener, properties)
  // Attach the newly created job to the final stage.
  finalStage.setActiveJob(job)
  // Finally submit the final stage.
  submitStage(finalStage)
  ......// remaining code is omitted here
}
接下来看看submitStage方法做了哪些事情:
/**
 * Submits a stage, first recursively submitting any missing parent stages.
 *
 * If no active job is found for the stage, the stage is aborted instead.
 *
 * @param stage the stage to submit
 */
private def submitStage(stage: Stage) {
  val jobId = activeJobForStage(stage)
  if (jobId.isDefined) {
    logDebug("submitStage(" + stage + ")")
    // Nothing to do for a stage that is already waiting, running, or failed.
    if (!waitingStages(stage) && !runningStages(stage) && !failedStages(stage)) {
      // Parent stages that still have to run, ordered by stage id.
      val missing = getMissingParentStages(stage).sortBy(_.id)
      logDebug("missing: " + missing)
      if (missing.isEmpty) {
        // No missing parents: the tasks of this stage can be submitted now.
        logInfo("Submitting " + stage + " (" + stage.rdd + "), which has no missing parents")
        submitMissingTasks(stage, jobId.get)
      } else {
        // Submit the missing parents first (recursively), then park this stage
        // in waitingStages.
        for (parent <- missing) {
          submitStage(parent)
        }
        waitingStages += stage
      }
    }
  } else {
    // Abort only when no active job was found for this stage.
    abortStage(stage, "No active job for stage " + stage.id, None)
  }
}
需要注意的是在getMissingParentStages()方法中,会根据rdd的dependency做不同的处理,代码如下:
/**
 * Collects the parent stages of `stage` that are not yet available.
 *
 * Starting from `stage.rdd`, the RDD dependencies are walked with an explicit
 * stack. A shuffle dependency yields a parent map stage, which is reported when
 * it is not available; a narrow dependency creates no new stage, so its parent
 * RDD is simply queued for inspection.
 *
 * @param stage the stage whose missing parents are wanted
 * @return the missing parent stages, as a list
 */
private def getMissingParentStages(stage: Stage): List[Stage] = {
  val missingStages = new HashSet[Stage]
  val seenRdds = new HashSet[RDD[_]]
  // Explicit stack of RDDs still to inspect; the walk is iterative, not recursive.
  val pendingRdds = new Stack[RDD[_]]

  def visit(rdd: RDD[_]) {
    val firstVisit = !seenRdds.contains(rdd)
    if (firstVisit) {
      seenRdds += rdd
      // Only look at the dependencies when getCacheLocs reports a partition
      // with no cached location.
      if (getCacheLocs(rdd).contains(Nil)) {
        rdd.dependencies.foreach {
          case shufDep: ShuffleDependency[_, _, _] =>
            // Shuffle dependency: look up the parent map stage for it.
            val parentMapStage = getShuffleMapStage(shufDep, stage.firstJobId)
            if (!parentMapStage.isAvailable) {
              missingStages += parentMapStage
            }
          case narrowDep: NarrowDependency[_] =>
            // Narrow dependency: no new stage, keep walking through the parent RDD.
            pendingRdds.push(narrowDep.rdd)
        }
      }
    }
  }

  pendingRdds.push(stage.rdd)
  while (!pendingRdds.isEmpty) {
    visit(pendingRdds.pop())
  }
  missingStages.toList
}
接下来看看submitMissingTasks做了哪些事情,其主要代码如下(省略了大部分代码,只列出主要代码):
/**
 * Builds the tasks of a stage and hands them to the task scheduler (abridged excerpt).
 *
 * NOTE: `partitionsToCompute` and `properties` are used below but are defined in
 * code that this excerpt omits; every `......` likewise stands for omitted code.
 *
 * @param stage the stage whose tasks are created and submitted
 * @param jobId id of the job, passed on to the submitted TaskSet
 */
private def submitMissingTasks(stage: Stage, jobId: Int) {
// Record the stage as running.
runningStages += stage
// Tell the output commit coordinator that the stage starts, together with its
// highest partition id (taken from a different field depending on the stage type).
stage match {
case s: ShuffleMapStage =>
outputCommitCoordinator.stageStart(stage = s.id, maxPartitionId = s.numPartitions - 1)
case s: ResultStage =>
outputCommitCoordinator.stageStart(
stage = s.id, maxPartitionId = s.rdd.partitions.length - 1)
}
// Map every id in partitionsToCompute to the result of getPreferredLocs.
val taskIdToLocations: Map[Int, Seq[TaskLocation]] = try {
stage match {
case s: ShuffleMapStage =>
partitionsToCompute.map { id => (id, getPreferredLocs(stage.rdd, id))}.toMap
case s: ResultStage =>
val job = s.activeJob.get
partitionsToCompute.map { id =>
// For a result stage, id is first translated through s.partitions
// before the location lookup.
val p = s.partitions(id)
(id, getPreferredLocs(stage.rdd, p))
}.toMap
}
} catch {
// Error handling omitted in this excerpt.
......
}
// TODO: Maybe we can keep the taskBinary in Stage to avoid serializing it multiple times.
// Broadcasted binary for the task, used to dispatch tasks to executors. Note that we broadcast
// the serialized copy of the RDD and for each task we will deserialize it, which means each
// task gets a different copy of the RDD. This provides stronger isolation between tasks that
// might modify state of objects referenced in their closures. This is necessary in Hadoop
// where the JobConf/Configuration object is not thread-safe.
var taskBinary: Broadcast[Array[Byte]] = null
try {
// For ShuffleMapTask, serialize and broadcast (rdd, shuffleDep).
// For ResultTask, serialize and broadcast (rdd, func).
val taskBinaryBytes: Array[Byte] = stage match {
case stage: ShuffleMapStage =>
closureSerializer.serialize((stage.rdd, stage.shuffleDep): AnyRef).array()
case stage: ResultStage =>
closureSerializer.serialize((stage.rdd, stage.func): AnyRef).array()
}
taskBinary = sc.broadcast(taskBinaryBytes)
} catch {
// Error handling omitted in this excerpt.
......
}
// Create one task per id in partitionsToCompute; the task class follows the stage type.
val tasks: Seq[Task[_]] = try {
stage match {
case stage: ShuffleMapStage =>
partitionsToCompute.map { id =>
val locs = taskIdToLocations(id)
val part = stage.rdd.partitions(id)
new ShuffleMapTask(stage.id, stage.latestInfo.attemptId,
taskBinary, part, locs, stage.internalAccumulators)
}
case stage: ResultStage =>
val job = stage.activeJob.get
partitionsToCompute.map { id =>
// id is translated through stage.partitions to pick the RDD partition;
// locations are still looked up by id.
val p: Int = stage.partitions(id)
val part = stage.rdd.partitions(p)
val locs = taskIdToLocations(id)
new ResultTask(stage.id, stage.latestInfo.attemptId,
taskBinary, part, locs, id, stage.internalAccumulators)
}
}
} catch {
// Error handling omitted in this excerpt.
......
}
if (tasks.size > 0) {
// Remember the partitions that are now pending, submit the tasks as one TaskSet,
// and record the submission time.
stage.pendingPartitions ++= tasks.map(_.partitionId)
taskScheduler.submitTasks(new TaskSet(
tasks.toArray, stage.id, stage.latestInfo.attemptId, jobId, properties))
stage.latestInfo.submissionTime = Some(clock.getTimeMillis())
} else {
// Because we posted SparkListenerStageSubmitted earlier, we should mark
// the stage as completed here in case there are no tasks to run
markStageAsFinished(stage, None)
}
}
- 首先把运行的stage加入到runningStages中
- 根据stage的类型,启动stage
- 计算每个待计算分区(即每个task)对应的首选位置(preferred locations),得到taskIdToLocations
- 生成序列化的二进制task,然后广播到各个节点。每一个task执行时都会各自反序列化这份binary task,得到一份独立的RDD副本,这样保证了各个task之间的隔离性
- 为每一个task取出它对应的rdd分区(partition)以及前面得到的位置信息
- 根据stage的类型创建对应的task,如果是shuffleMapStage,则创建ShuffleMapTask;如果是resultStage,则创建ResultTask
- 最后调用taskScheduler(TaskSchedulerImpl)的submitTasks方法。这样job经过DAGScheduler处理之后,就交给taskScheduler了。