When an action is invoked, execution enters dagScheduler.runJob, then submitJob, which calls eventProcessLoop.post(JobSubmitted(...)); when the JobSubmitted event is handled, dagScheduler.handleJobSubmitted is called.
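For context, a minimal driver-side sketch (assuming an existing SparkContext named sc; the RDD and numbers are illustrative, not from the source):
// collect() is an action: it calls sc.runJob -> dagScheduler.runJob -> submitJob,
// which posts a JobSubmitted event that is handled by handleJobSubmitted below.
val result = sc.parallelize(1 to 100, numSlices = 4)
  .map(_ * 2)
  .collect()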
handleJobSubmitted
// The core entry point of DAGScheduler's job scheduling
private[scheduler] def handleJobSubmitted(jobId: Int,
finalRDD: RDD[_],
func: (TaskContext, Iterator[_]) => _,
partitions: Array[Int],
callSite: CallSite,
listener: JobListener,
properties: Properties) {
// Create the finalStage using the last RDD of the job (the one the action was called on)
var finalStage: ResultStage = null
try {
// New stage creation may throw an exception if, for example, jobs are run on a
// HadoopRDD whose underlying HDFS files have been deleted.
// This used to be newStage and is now createResultStage: **create a Stage object** and register it in DAGScheduler's internal in-memory caches
finalStage = createResultStage(finalRDD, func, partitions, jobId, callSite)
} catch {
case e: Exception =>
logWarning("Creating new stage failed due to exception - job: " + jobId, e)
listener.jobFailed(e)
return
}
// **Create a Job with the finalStage**; in other words, the job's last stage is, of course, our finalStage
val job = new ActiveJob(jobId, finalStage, callSite, listener, properties)
clearCacheLocs()
logInfo("Got job %s (%s) with %d output partitions".format(
job.jobId, callSite.shortForm, partitions.length))
logInfo("Final stage: " + finalStage + " (" + finalStage.name + ")")
logInfo("Parents of final stage: " + finalStage.parents)
logInfo("Missing parents: " + getMissingParentStages(finalStage))
// **Register the job in the in-memory caches**
val jobSubmissionTime = clock.getTimeMillis()
jobIdToActiveJob(jobId) = job
activeJobs += job
finalStage.setActiveJob(job)
val stageIds = jobIdToStageIds(jobId).toArray
val stageInfos = stageIds.flatMap(id => stageIdToStage.get(id).map(_.latestInfo))
listenerBus.post(
SparkListenerJobStart(job.jobId, jobSubmissionTime, stageInfos, properties))
// **submitStage submits the finalStage, which causes the first stage to be submitted and all the others to be placed in the waiting queue**
submitStage(finalStage)
}
submitStage
The stage-splitting algorithm: 1. work backwards from the finalStage; 2. create a new stage at each wide (shuffle) dependency; 3. use recursion so that parent stages are submitted first.
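For instance, in the hypothetical word-count job below (not from the source; the input path is a placeholder), reduceByKey introduces a ShuffleDependency, so the lineage splits into exactly two stages:
// Stage 0 (ShuffleMapStage): textFile -> flatMap -> map (narrow dependencies only)
// Stage 1 (ResultStage): reduceByKey -> collect (begins at the shuffle boundary)
val counts = sc.textFile("hdfs://...")   // placeholder path
  .flatMap(_.split(" "))
  .map(word => (word, 1))
  .reduceByKey(_ + _)                    // ShuffleDependency: stage boundary here
  .collect()                             // action: triggers handleJobSubmitted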
// This is the entry point of the stage-splitting algorithm, but the algorithm itself is actually implemented jointly by submitStage() and getMissingParentStages()
private def submitStage(stage: Stage) {
val jobId = activeJobForStage(stage)
if (jobId.isDefined) {
logDebug("submitStage(" + stage + ")")
if (!waitingStages(stage) && !runningStages(stage) && !failedStages(stage)) {
// Call getMissingParentStages() to get the parent stages of the current stage
val missing = getMissingParentStages(stage).sortBy(_.id)
logDebug("missing: " + missing)
// Recurse repeatedly until we reach the very first stage, which has no parents; that stage (stage 0) is then submitted directly, while all the remaining stages sit in waitingStages
if (missing.isEmpty) {
logInfo("Submitting " + stage + " (" + stage.rdd + "), which has no missing parents")
submitMissingTasks(stage, jobId.get)
} else {
// Recursively call submitStage() to submit the parent stages (important)
for (parent <- missing) {
submitStage(parent)
}
// Put the current stage into waitingStages, the queue of stages waiting to execute
waitingStages += stage
}
}
} else {
abortStage(stage, "No active job for stage " + stage.id, None)
}
}
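The parent-first recursion can be modeled with a toy, self-contained sketch (ToyStage, waiting and submit are stand-ins invented here, not Spark's classes):
// Toy model of submitStage's recursion: parents are submitted first,
// children accumulate in a waiting set until their parents have run.
case class ToyStage(id: Int, parents: List[ToyStage])
val waiting = scala.collection.mutable.Set[ToyStage]()
def submit(stage: ToyStage): Unit = {
  if (stage.parents.isEmpty) println(s"run stage ${stage.id}") // stage 0 runs first
  else {
    stage.parents.foreach(submit) // recurse into parents before the child
    waiting += stage              // the child waits for its parents
  }
}
submit(ToyStage(1, List(ToyStage(0, Nil)))) // prints "run stage 0"; stage 1 waits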
getMissingParentStages
// Get the parent stages of a given stage, working backwards from the final stage
// For a stage, if every dependency reachable from its last RDD is narrow, no new stage is created; but as soon as the traversal finds a wide dependency on some RDD, a new stage is created from that RDD, and the new stage is returned
private def getMissingParentStages(stage: Stage): List[Stage] = {
val missing = new HashSet[Stage]
val visited = new HashSet[RDD[_]]
// We are manually maintaining a stack here to prevent StackOverflowError
// caused by recursively visiting
val waitingForVisit = new ArrayStack[RDD[_]]
def visit(rdd: RDD[_]) {
if (!visited(rdd)) {
visited += rdd
// Nil is scala.collection.immutable.Nil, i.e. List[Nothing], the empty List; a partition with no cache locations is recorded as Nil
val rddHasUncachedPartitions = getCacheLocs(rdd).contains(Nil)
// First check whether any partition is uncached; if all partitions are cached, there is no need to compute the parent stages
if (rddHasUncachedPartitions) {
// Iterate over the RDD's dependencies
for (dep <- rdd.dependencies) {
dep match {
// Wide (shuffle) dependency
case shufDep: ShuffleDependency[_, _, _] =>
// Create a ShuffleMapStage from the RDD on the wide-dependency side (in older versions this also set isShuffleMap to true)
// By default the last stage is not a shuffle map stage, but every stage before the finalStage is a ShuffleMapStage
val mapStage = getOrCreateShuffleMapStage(shufDep, stage.firstJobId)
if (!mapStage.isAvailable) {
missing += mapStage
}
// Narrow dependency: push the dependent RDD onto the stack
case narrowDep: NarrowDependency[_] =>
waitingForVisit.push(narrowDep.rdd)
}
}
}
}
}
// First, push the stage's last RDD onto the stack
waitingForVisit.push(stage.rdd)
// Then loop
while (waitingForVisit.nonEmpty) {
// Call the locally defined visit() method, starting with the stage's last RDD
visit(waitingForVisit.pop())
}
missing.toList
}
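The manual-stack pattern above (a stack instead of recursion, to avoid StackOverflowError on very long lineages) reduces to the following self-contained sketch; Node, Narrow and Wide are toy stand-ins for RDDs and their dependency types:
// Toy DFS over a lineage: narrow deps are walked through, wide deps mark
// where a parent (shuffle map) stage would begin.
sealed trait Dep { def parent: Node }
case class Narrow(parent: Node) extends Dep // stand-in for NarrowDependency
case class Wide(parent: Node) extends Dep   // stand-in for ShuffleDependency
case class Node(name: String, deps: List[Dep])
def findShuffleParents(last: Node): List[Node] = {
  val visited = scala.collection.mutable.Set[Node]()
  val stack = scala.collection.mutable.Stack[Node](last)
  val parents = scala.collection.mutable.ListBuffer[Node]()
  while (stack.nonEmpty) {
    val node = stack.pop()
    if (visited.add(node)) node.deps.foreach {
      case Wide(p)   => parents += p  // a new stage would start at p
      case Narrow(p) => stack.push(p) // keep walking within the same stage
    }
  }
  parents.toList
}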
getOrCreateShuffleMapStage
Looks up the existing stage for the shuffle and, if none exists yet, calls createShuffleMapStage(dep, firstJobId)
createShuffleMapStage
val stage = new ShuffleMapStage(
id, rdd, numTasks, parents, jobId, rdd.creationSite, shuffleDep, mapOutputTracker)
ShuffleMapStage
// The shuffle dependency: this marks the stage as a wide-dependency (shuffle map) stage
val shuffleDep: ShuffleDependency[_, _, _],
submitMissingTasks (handles the actual stage submission)
Creates a batch of tasks for the stage; the number of tasks equals the number of partitions.
private def submitMissingTasks(stage: Stage, jobId: Int) {
logDebug("submitMissingTasks(" + stage + ")")
// First figure out the indexes of partition ids to compute.
// Determine the partitions to compute, i.e. how many tasks to create
val partitionsToCompute: Seq[Int] = stage.findMissingPartitions()
// Use the scheduling pool, job group, description, etc. from an ActiveJob associated
// with this Stage
val properties = jobIdToActiveJob(jobId).properties
// Add the stage to the set of running stages
runningStages += stage
stage match {
case s: ShuffleMapStage =>
outputCommitCoordinator.stageStart(stage = s.id, maxPartitionId = s.numPartitions - 1)
case s: ResultStage =>
outputCommitCoordinator.stageStart(
stage = s.id, maxPartitionId = s.rdd.partitions.length - 1)
}
val taskIdToLocations: Map[Int, Seq[TaskLocation]] = try {
stage match {
case s: ShuffleMapStage =>
partitionsToCompute.map { id => (id, getPreferredLocs(stage.rdd, id))}.toMap
case s: ResultStage =>
partitionsToCompute.map { id =>
val p = s.partitions(id)
(id, getPreferredLocs(stage.rdd, p))
}.toMap
}
} catch {
case NonFatal(e) =>
stage.makeNewStageAttempt(partitionsToCompute.size)
listenerBus.post(SparkListenerStageSubmitted(stage.latestInfo, properties))
abortStage(stage, s"Task creation failed: $e\n${Utils.exceptionString(e)}", Some(e))
runningStages -= stage
return
}
// Create the required number of tasks for the stage; the most important part is the algorithm that computes each task's preferred location
val tasks: Seq[Task[_]] = try {
val serializedTaskMetrics = closureSerializer.serialize(stage.latestInfo.taskMetrics).array()
stage match {
// Match ShuffleMapStage
case stage: ShuffleMapStage =>
stage.pendingPartitions.clear()
partitionsToCompute.map { id =>
// Create one task per partition, computing the preferred location for each task
val locs = taskIdToLocations(id)
val part = partitions(id)
stage.pendingPartitions += id
// Every stage other than the ResultStage (isShuffleMap was true for these in older versions) produces ShuffleMapTasks
new ShuffleMapTask(stage.id, stage.latestInfo.attemptNumber,
taskBinary, part, locs, properties, serializedTaskMetrics, Option(jobId),
Option(sc.applicationId), sc.applicationAttemptId)
}
// Match ResultStage
case stage: ResultStage =>
partitionsToCompute.map { id =>
val p: Int = stage.partitions(id)
val part = partitions(p)
val locs = taskIdToLocations(id)
new ResultTask(stage.id, stage.latestInfo.attemptNumber,
taskBinary, part, locs, id, properties, serializedTaskMetrics,
Option(jobId), Option(sc.applicationId), sc.applicationAttemptId)
}
}
} catch {
case NonFatal(e) =>
abortStage(stage, s"Task creation failed: $e\n${Utils.exceptionString(e)}", Some(e))
runningStages -= stage
return
}
if (tasks.size > 0) {
logInfo(s"Submitting ${tasks.size} missing tasks from $stage (${stage.rdd}) (first 15 " +
s"tasks are for partitions ${tasks.take(15).map(_.partitionId)})")
// Finally, wrap the stage's tasks in a TaskSet and submit it via TaskScheduler.submitTasks()
taskScheduler.submitTasks(new TaskSet(
tasks.toArray, stage.id, stage.latestInfo.attemptNumber, jobId, properties))
} else {
// Because we posted SparkListenerStageSubmitted earlier, we should mark
// the stage as completed here in case there are no tasks to run
markStageAsFinished(stage, None)
val debugString = stage match {
case stage: ShuffleMapStage =>
s"Stage ${stage} is actually done; " +
s"(available: ${stage.isAvailable}," +
s"available outputs: ${stage.numAvailableOutputs}," +
s"partitions: ${stage.numPartitions})"
case stage : ResultStage =>
s"Stage ${stage} is actually done; (partitions: ${stage.numPartitions})"
}
logDebug(debugString)
submitWaitingChildStages(stage)
}
}
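Because one task is created per missing partition, a stage's task count simply follows its RDD's partitioning. A quick illustration (hypothetical, assuming an existing SparkContext sc):
// 8 partitions -> the ResultStage for count() submits a TaskSet of 8 ResultTasks.
val rdd = sc.parallelize(1 to 1000, numSlices = 8)
println(rdd.getNumPartitions) // 8
rdd.map(_ + 1).count()        // submitMissingTasks creates 8 tasks for this stage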
taskIdToLocations is computed by calling getPreferredLocsInternal, which determines each task's preferred location.
Starting from the stage's last RDD, it looks for an RDD whose partition has been cached or checkpointed; the task's preferred location is then wherever that cached/checkpointed partition lives, because the task can run on that node without recomputing the earlier RDDs.
private def getPreferredLocsInternal(
rdd: RDD[_],
partition: Int,
visited: HashSet[(RDD[_], Int)]): Seq[TaskLocation] = {
// If the partition has already been visited, no need to re-visit.
// This avoids exponential path exploration. SPARK-695
if (!visited.add((rdd, partition))) {
// Nil has already been returned for previously visited partitions.
return Nil
}
// If the partition is cached, return the cache locations
// Check whether the current RDD's partition is cached
val cached = getCacheLocs(rdd)(partition)
if (cached.nonEmpty) {
return cached
}
// If the RDD has some placement preferences (as is the case for input RDDs), get those
// Otherwise, check whether the RDD itself declares placement preferences (as input RDDs do, e.g. HDFS block locations)
val rddPrefs = rdd.preferredLocations(rdd.partitions(partition)).toList
if (rddPrefs.nonEmpty) {
return rddPrefs.map(TaskLocation(_))
}
// If the RDD has narrow dependencies, pick the first partition of the first narrow dependency
// that has any placement preferences. Ideally we would choose based on transfer sizes,
// but this will do for now.
// Recurse into the RDD's parents (through narrow dependencies) to see whether the corresponding partitions are cached or checkpointed
rdd.dependencies.foreach {
case n: NarrowDependency[_] =>
for (inPart <- n.getParents(partition)) {
val locs = getPreferredLocsInternal(n.rdd, inPart, visited)
if (locs != Nil) {
return locs
}
}
case _ =>
}
// If, from the stage's last RDD all the way back to the first, no partition is cached or checkpointed, the task has no preferred location (Nil); placement is then left to the TaskScheduler
Nil
}
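The fallback chain above (cache locations, then the RDD's own preferences, then narrow parents, then Nil) can be observed from the driver; a hedged illustration, assuming an existing SparkContext sc and a placeholder input path:
// After the cached RDD is materialized, tasks on downstream narrow
// dependencies prefer the executors holding the cached blocks, so the
// earlier RDDs in the lineage need not be recomputed.
val base = sc.textFile("hdfs://...").cache() // placeholder path
base.count()                                 // materializes the cache
base.map(_.length).count()                   // prefers cached block locations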