Stage的创建

1.简介:任务提交调用:

private def doOnReceive(event: DAGSchedulerEvent): Unit = event match {
    case JobSubmitted(jobId, rdd, func, partitions, callSite, listener, properties) =>
      dagScheduler.handleJobSubmitted(jobId, rdd, func, partitions, callSite, listener, properties)

handleJobSubmitted方法。

 

2.createResultStage

 private[scheduler] def handleJobSubmitted(jobId: Int,
      finalRDD: RDD[_],
      func: (TaskContext, Iterator[_]) => _,
      partitions: Array[Int],
      callSite: CallSite,
      listener: JobListener,
      properties: Properties) {
    var finalStage: ResultStage = null
    try {
      //例如,如果在已删除其基础HDFS文件的HadoopRDD上运行作业,则新阶段创建可能会引发异常
      finalStage = createResultStage(finalRDD, func, partitions, jobId, callSite)
    }

点进createResultStage

  /**
   * Create the final ResultStage for a job, attached to the provided jobId.
   * Parent ShuffleMapStages are resolved (or created) first, then the new
   * stage is registered in the scheduler's bookkeeping maps.
   */
  private def createResultStage(
      rdd: RDD[_],
      func: (TaskContext, Iterator[_]) => _,
      partitions: Array[Int],
      jobId: Int,
      callSite: CallSite): ResultStage = {
    // Barrier-stage sanity checks run before any stage bookkeeping is touched.
    checkBarrierStageWithDynamicAllocation(rdd)
    checkBarrierStageWithNumSlots(rdd)
    checkBarrierStageWithRDDChainPattern(rdd, partitions.toSet.size)
    // Parent stages: one ShuffleMapStage per upstream shuffle boundary.
    val parentStages = getOrCreateParentStages(rdd, jobId)
    val stageId = nextStageId.getAndIncrement()
    val resultStage =
      new ResultStage(stageId, rdd, func, partitions, parentStages, jobId, callSite)
    // Register the new stage so later lookups by id / job find it.
    stageIdToStage(stageId) = resultStage
    updateJobIdStageIdMaps(jobId, resultStage)
    resultStage
  }

点进getOrCreateParentStages

  /**
   * Get or create the list of parent stages for the given RDD: every direct
   * shuffle dependency of `rdd` marks a stage boundary, so each one resolves
   * to exactly one ShuffleMapStage.
   */
  private def getOrCreateParentStages(rdd: RDD[_], firstJobId: Int): List[Stage] = {
    val shuffleDeps = getShuffleDependencies(rdd)
    shuffleDeps.map(getOrCreateShuffleMapStage(_, firstJobId)).toList
  }

点进getShuffleDependencies

    // Walks backwards from `rdd` and returns only its *immediate* shuffle
    // dependencies: the walk records each ShuffleDependency it meets and does
    // not traverse past it. It terminates at source RDDs such as HadoopRDD,
    // whose dependency list is Nil, which eventually drains waitingForVisit.
    private[scheduler] def getShuffleDependencies(
      rdd: RDD[_]): HashSet[ShuffleDependency[_, _, _]] = {
    val parents = new HashSet[ShuffleDependency[_, _, _]]
    val visited = new HashSet[RDD[_]]
    // Worklist of RDDs still to examine, seeded with the starting RDD.
    val waitingForVisit = new ListBuffer[RDD[_]]
    waitingForVisit += rdd
    // Loop until the worklist is drained.
    while (waitingForVisit.nonEmpty) {
      // Take the first element — a pop, stack/queue style.
      val toVisit = waitingForVisit.remove(0)
      if (!visited(toVisit)) {
        visited += toVisit
        // `dependencies` yields this RDD's Dependency objects.
        toVisit.dependencies.foreach {
          // Pattern match on the dependency type.
          case shuffleDep: ShuffleDependency[_, _, _] =>
            // Shuffle boundary: record it and do not look past it.
            parents += shuffleDep
          case dependency => 
            // Any other (narrow) dependency: keep walking up its parent RDD.
            waitingForVisit.prepend(dependency.rdd)
        }
      }
    }
    // Return the collected immediate shuffle dependencies.
    parents
  }

获得ShuffleDependency集合后,用map对其中每一个ShuffleDependency调用getOrCreateShuffleMapStage方法(集合为空时parents即为空列表):

  private def getOrCreateShuffleMapStage(
      shuffleDep: ShuffleDependency[_, _, _],
      firstJobId: Int): ShuffleMapStage = {
    // Reuse an existing stage if one is already registered for this shuffle id —
    // createShuffleMapStage (below) is what puts entries into shuffleIdToMapStage.
    shuffleIdToMapStage.get(shuffleDep.shuffleId) match {
      case Some(stage) =>
        stage

      case None =>
        // Create stages for all missing *ancestor* shuffle dependencies first.
        getMissingAncestorShuffleDependencies(shuffleDep.rdd).foreach { dep =>
          // Even though getMissingAncestorShuffleDependencies only returns shuffle
          // dependencies that were not in shuffleIdToMapStage at call time, by the
          // time we reach a particular dependency in this loop it may already have
          // been added by the stage-creation process for an earlier dependency,
          // hence the re-check. See SPARK-13902 for more information.
          if (!shuffleIdToMapStage.contains(dep.shuffleId)) {
            createShuffleMapStage(dep, firstJobId)
          }
        }

        // Finally, create a stage for the given shuffle dependency; this is the
        // value returned. The foreach above is executed purely for its side
        // effect: registering the ancestor stages that this stage links to as
        // parents (via getOrCreateParentStages inside createShuffleMapStage).
        createShuffleMapStage(shuffleDep, firstJobId)
    }
  }

接着点进createShuffleMapStage

 // Creates a ShuffleMapStage that produces the map output for `shuffleDep`,
 // registers it in the scheduler's maps, and registers the shuffle with the
 // MapOutputTracker if not yet known.
 def createShuffleMapStage[K, V, C](
      shuffleDep: ShuffleDependency[K, V, C], jobId: Int): ShuffleMapStage = {
    val rdd = shuffleDep.rdd
    checkBarrierStageWithDynamicAllocation(rdd)
    checkBarrierStageWithNumSlots(rdd)
    checkBarrierStageWithRDDChainPattern(rdd, rdd.getNumPartitions)
    // One task per partition of the map-side RDD.
    val numTasks = rdd.partitions.length
    // Recurse: the map-side RDD may itself sit downstream of earlier shuffles,
    // so its parent stages are created (or looked up) the same way.
    val parents = getOrCreateParentStages(rdd, jobId)
    val id = nextStageId.getAndIncrement()
    // Create the stage itself.
    val stage = new ShuffleMapStage(
      id, rdd, numTasks, parents, jobId, rdd.creationSite, shuffleDep, mapOutputTracker)

    // Register the stage in the scheduler's bookkeeping maps.
    stageIdToStage(id) = stage
    shuffleIdToMapStage(shuffleDep.shuffleId) = stage
    updateJobIdStageIdMaps(jobId, stage)

    if (!mapOutputTracker.containsShuffle(shuffleDep.shuffleId)) {
      // Kind of ugly: need to register RDDs with the cache and map output
      // tracker here, since we can't do it in the RDD constructor because the
      // number of partitions is unknown there.
      logInfo("Registering RDD " + rdd.id + " (" + rdd.getCreationSite + ")")
      mapOutputTracker.registerShuffle(shuffleDep.shuffleId, rdd.partitions.length)
    }
    // Return the newly created stage.
    stage
  }

点进ShuffleMapStage

// A ShuffleMapStage is the stage that produces map output for a shuffle: it
// carries the ShuffleDependency it writes data for, plus the
// MapOutputTrackerMaster used to register that output. All remaining
// constructor parameters are forwarded to the base Stage.
// (The class body is elided in this excerpt.)
private[spark] class ShuffleMapStage(
    id: Int,
    rdd: RDD[_],
    numTasks: Int,
    parents: List[Stage],
    firstJobId: Int,
    callSite: CallSite,
    val shuffleDep: ShuffleDependency[_, _, _],
    mapOutputTrackerMaster: MapOutputTrackerMaster)
  extends Stage(id, rdd, numTasks, parents, firstJobId, callSite) 

点进Stage

private var _latestInfo: StageInfo = StageInfo.fromStage(this, nextAttemptId)

点进fromStage

  /**
   * Builds a StageInfo snapshot of `stage` for the given attempt.
   * The RDDInfo for the stage's own RDD comes first, followed by the infos of
   * all RDDs reachable through narrow dependencies; a shuffle id is recorded
   * only for ShuffleMapStages.
   */
  def fromStage(
      stage: Stage,
      attemptId: Int,
      numTasks: Option[Int] = None,
      taskMetrics: TaskMetrics = null,
      taskLocalityPreferences: Seq[Seq[TaskLocation]] = Seq.empty
    ): StageInfo = {
    // The stage's own RDD, then every narrow ancestor of it.
    val rddInfos =
      RDDInfo.fromRdd(stage.rdd) +: stage.rdd.getNarrowAncestors.map(RDDInfo.fromRdd)
    // Only a shuffle map stage carries a shuffle dependency (guarded against null).
    val shuffleDepId = stage match {
      case sms: ShuffleMapStage => Option(sms.shuffleDep).map(_.shuffleId)
      case _ => None
    }
    new StageInfo(
      stage.id,
      attemptId,
      stage.name,
      numTasks.getOrElse(stage.numTasks),
      rddInfos,
      stage.parents.map(_.id),
      stage.details,
      taskMetrics,
      taskLocalityPreferences,
      shuffleDepId)
  }

点进getNarrowAncestors

 // Returns all ancestor RDDs reachable from this RDD through narrow
 // dependencies only; traversal does not continue past non-narrow (shuffle)
 // dependencies. `ancestors` doubles as the visited set, so shared ancestors
 // and cycles are each handled once.
 private[spark] def getNarrowAncestors: Seq[RDD[_]] = {
    val ancestors = new mutable.HashSet[RDD[_]]

    def visit(rdd: RDD[_]): Unit = {
      // Keep only this RDD's narrow dependencies.
      val narrowDependencies = rdd.dependencies.filter(_.isInstanceOf[NarrowDependency[_]])
      // Note: each dependency is mapped to its parent RDD here — so the RDD at
      // the boundary between a NarrowDependency and a ShuffleDependency IS
      // still added; only its own dependencies are not followed further.
      val narrowParents = narrowDependencies.map(_.rdd)
      val narrowParentsNotVisited = narrowParents.filterNot(ancestors.contains)
      narrowParentsNotVisited.foreach { parent =>
        ancestors.add(parent)
        // Recurse; bottoms out when an RDD has no unvisited narrow parents.
        visit(parent)
      }
    }

    visit(this)

    // In case there is a cycle, do not include the root itself.
    ancestors.filterNot(_ == this).toSeq
  }

到此为止Stage创建基本结束。

 

3.总结:ShuffleDependency越多,划分出的Stage就越多,任务调度也越复杂。NarrowDependency链路上的RDD信息被记录在StageInfo中;由于窄依赖的计算可以在同一个Stage内以流水线方式连续执行,把NarrowDependency的任务合并到同一个Stage中即可减少Stage的数量。

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值