Spark 1.2.0 Source Code Analysis: Stage Division for ShuffledRDD

Calling operations such as reduceByKey() or groupByKey() produces a ShuffledRDD. The relevant source code is as follows:

  def reduceByKey(partitioner: Partitioner, func: (V, V) => V): RDD[(K, V)] = {
    combineByKey[V]((v: V) => v, func, func, partitioner)
  }
Continuing into combineByKey():

 def combineByKey[C](createCombiner: V => C,
      mergeValue: (C, V) => C,
      mergeCombiners: (C, C) => C,
      partitioner: Partitioner,
      mapSideCombine: Boolean = true,  // by default, aggregate each partition by key on the map side before the shuffle, which improves performance
      serializer: Serializer = null): RDD[(K, C)] = {
    require(mergeCombiners != null, "mergeCombiners must be defined") // required as of Spark 0.9.0
    if (keyClass.isArray) {   // when the key is an array, map-side combining is not supported, and a HashPartitioner cannot partition array keys either
      if (mapSideCombine) {
        throw new SparkException("Cannot use map-side combining with array keys.")
      }
      if (partitioner.isInstanceOf[HashPartitioner]) {
        throw new SparkException("Default partitioner cannot partition array keys.")
      }
    }
    val aggregator = new Aggregator[K, V, C](createCombiner, mergeValue, mergeCombiners)
    if (self.partitioner == Some(partitioner)) {   // if the specified partitioner equals the existing one (a shuffle has already happened), the data is already partitioned by key
      self.mapPartitions(iter => {
        val context = TaskContext.get()
        new InterruptibleIterator(context, aggregator.combineValuesByKey(iter, context)) // in that case just combine the values directly; iter is the computed result of each partition
      }, preservesPartitioning = true)
    } else {
      new ShuffledRDD[K, V, C](self, partitioner)  // otherwise create a new ShuffledRDD with the specified partitioner
        .setSerializer(serializer)
        .setAggregator(aggregator)
        .setMapSideCombine(mapSideCombine)  // map-side combining is enabled by default
    }
  }
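
To make the two branches above concrete, here is a minimal hedged sketch (it assumes an existing SparkContext named sc, e.g. in local mode; sc and the variable names are illustrative only). The first reduceByKey() call should go down the ShuffledRDD branch, while a second call with the same partitioner should take the mapPartitions branch, since the data is already partitioned by key:

  // assumption: `sc` is an existing SparkContext (local mode is fine)
  import org.apache.spark.HashPartitioner
  import org.apache.spark.SparkContext._   // pair-RDD implicits in Spark 1.2

  val pairs = sc.parallelize(Seq(("a", 1), ("b", 2), ("a", 3)))

  // `pairs` has no partitioner, so combineByKey() builds a ShuffledRDD
  val reduced = pairs.reduceByKey(new HashPartitioner(4), _ + _)
  println(reduced.getClass.getSimpleName)   // expected: ShuffledRDD

  // `reduced` is already partitioned by HashPartitioner(4), so the same call
  // now takes the mapPartitions branch and no new shuffle is introduced
  val again = reduced.reduceByKey(new HashPartitioner(4), _ + _)
  println(again.getClass.getSimpleName)     // expected: MapPartitionsRDD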

The key code in ShuffledRDD is:

 override def getDependencies: Seq[Dependency[_]] = {
    List(new ShuffleDependency(prev, part, serializer, keyOrdering, aggregator, mapSideCombine))  // the RDD's dependency is a ShuffleDependency
  }

 override def compute(split: Partition, context: TaskContext): Iterator[(K, C)] = {
    val dep = dependencies.head.asInstanceOf[ShuffleDependency[K, V, C]]
    SparkEnv.get.shuffleManager.getReader(dep.shuffleHandle, split.index, split.index + 1, context)
      .read()     // each reducer reads one partition
      .asInstanceOf[Iterator[(K, C)]]
  }
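
This dependency can also be observed from user code; a small hedged check, reusing the hypothetical reduced RDD from the sketch above:

  // the single dependency of the ShuffledRDD should be a ShuffleDependency
  reduced.dependencies.foreach(dep => println(dep.getClass.getSimpleName))  // expected: ShuffleDependency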

When an action triggers runJob, the runJob in DAGScheduler is invoked. The call chain is: runJob() -> submitJob() -> eventProcessActor ! JobSubmitted() -> dagScheduler.handleJobSubmitted(). The most critical line (focusing only on the operations related to ShuffledRDD) is:

finalStage = newStage(finalRDD, partitions.size, None, jobId, callSite)  // divide the job into stages
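
In user code the entry point of this chain is simply an action; a minimal hedged example, again reusing the hypothetical reduced RDD:

  // collect() calls SparkContext.runJob(), which forwards to DAGScheduler.runJob()
  // and eventually posts a JobSubmitted event handled by handleJobSubmitted()
  val result: Array[(String, Int)] = reduced.collect()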

Let's look at the implementation of newStage():

  private def newStage(
      rdd: RDD[_],
      numTasks: Int,
      shuffleDep: Option[ShuffleDependency[_, _, _]],
      jobId: Int,
      callSite: CallSite)
    : Stage =
  {
    val parentStages = getParentStages(rdd, jobId)  // get the parent stages
    val id = nextStageId.getAndIncrement()  // allocate the stage id
    val stage = new Stage(id, rdd, numTasks, shuffleDep, parentStages, jobId, callSite)  // this is the stage of the finalRDD; its shuffleDep is None
    stageIdToStage(id) = stage
    updateJobIdStageIdMaps(jobId, stage)
    stage
  }
Now look at the implementation of getParentStages():

  private def getParentStages(rdd: RDD[_], jobId: Int): List[Stage] = {
    val parents = new HashSet[Stage]
    val visited = new HashSet[RDD[_]]
    // We are manually maintaining a stack here to prevent StackOverflowError
    // caused by recursively visiting
    val waitingForVisit = new Stack[RDD[_]]
    def visit(r: RDD[_]) {
      if (!visited(r)) {    // check whether this RDD has been visited before
        visited += r
        // Kind of ugly: need to register RDDs with the cache here since
        // we can't do it in its constructor because # of partitions is unknown
        for (dep <- r.dependencies) {
          dep match {
            case shufDep: ShuffleDependency[_, _, _] =>
              parents += getShuffleMapStage(shufDep, jobId)  // a wide (shuffle) dependency marks a stage boundary
            case _ =>
              waitingForVisit.push(dep.rdd)   // for a narrow dependency, push the parent RDD onto the stack and do not split a new stage
          }
        }
      }
    }
    waitingForVisit.push(rdd)   // push the RDD onto the stack
    while (!waitingForVisit.isEmpty) {
      visit(waitingForVisit.pop())  // pop it and process it in visit()
    }
    parents.toList
  }

From this we can see that a ShuffleDependency is exactly where a stage boundary is drawn.
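
As a concrete illustration (a hedged sketch, again assuming the SparkContext sc from earlier): flatMap/map only create narrow dependencies and stay in one stage, while reduceByKey introduces a ShuffleDependency and therefore a new stage; the printed lineage should show the ShuffledRDD on its own level:

  import org.apache.spark.SparkContext._   // pair-RDD implicits in Spark 1.2

  val lines  = sc.parallelize(Seq("a b", "b c"))
  val counts = lines.flatMap(_.split(" "))
                    .map(word => (word, 1))   // narrow dependencies: same stage
                    .reduceByKey(_ + _)       // ShuffleDependency: stage boundary
  println(counts.toDebugString)               // lineage; the ShuffledRDD appears as a separate level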

Next, the implementation of getShuffleMapStage():

  private def getShuffleMapStage(shuffleDep: ShuffleDependency[_, _, _], jobId: Int): Stage = {
    shuffleToMapStage.get(shuffleDep.shuffleId) match {
      case Some(stage) => stage
      case None =>
        // We are going to register ancestor shuffle dependencies
        registerShuffleDependencies(shuffleDep, jobId)   // on the first visit, enter this method to register the ancestor shuffle dependencies of this shuffle dependency
        // Then register current shuffleDep
        val stage =
          newOrUsedStage(
            shuffleDep.rdd, shuffleDep.rdd.partitions.size, shuffleDep, jobId,
            shuffleDep.rdd.creationSite)   // this is the parent stage of the finalRDD's stage
        shuffleToMapStage(shuffleDep.shuffleId) = stage  // add it to the HashMap
 
        stage
    }
  }

Next, the registerShuffleDependencies() method:

  private def registerShuffleDependencies(shuffleDep: ShuffleDependency[_, _, _], jobId: Int) = {
    val parentsWithNoMapStage = getAncestorShuffleDependencies(shuffleDep.rdd)  // get the ancestor shuffle dependencies of this shuffle dependency
    while (!parentsWithNoMapStage.isEmpty) {
      val currentShufDep = parentsWithNoMapStage.pop()    // pop the shuffle dependencies off the stack one by one
      val stage =
        newOrUsedStage(     
          currentShufDep.rdd, currentShufDep.rdd.partitions.size, currentShufDep, jobId,
          currentShufDep.rdd.creationSite)
      shuffleToMapStage(currentShufDep.shuffleId) = stage   // add their stages to the HashMap
    }
  }

Next, the getAncestorShuffleDependencies() method:

 private def getAncestorShuffleDependencies(rdd: RDD[_]): Stack[ShuffleDependency[_, _, _]] = {
    val parents = new Stack[ShuffleDependency[_, _, _]]
    val visited = new HashSet[RDD[_]]
    // We are manually maintaining a stack here to prevent StackOverflowError
    // caused by recursively visiting
    val waitingForVisit = new Stack[RDD[_]]
    def visit(r: RDD[_]) {
      if (!visited(r)) {
        visited += r
        for (dep <- r.dependencies) {
          dep match {
            case shufDep: ShuffleDependency[_, _, _] =>
              if (!shuffleToMapStage.contains(shufDep.shuffleId)) {
                parents.push(shufDep)   // push every shuffle dependency found onto the stack, however many there are
              }

              waitingForVisit.push(shufDep.rdd)  // keep walking backwards through the lineage
            case _ =>
              waitingForVisit.push(dep.rdd)
          }
        }
      }
    }

    waitingForVisit.push(rdd)
    while (!waitingForVisit.isEmpty) {
      visit(waitingForVisit.pop())
    }
    parents
  }

At this point, all ancestor shuffle map stages have been registered in shuffleToMapStage.
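
To make the ancestor walk concrete, here is a hedged sketch of a pipeline with two chained shuffles (variable names are illustrative): when the stage for the second shuffle is built, registerShuffleDependencies() first registers the map stage of the first shuffle, so the job should end up with three stages in total:

  // assumption: the same SparkContext `sc` as before
  import org.apache.spark.SparkContext._   // pair-RDD implicits in Spark 1.2

  val words   = sc.parallelize(Seq("a", "b", "a", "c"))
  val counts  = words.map(w => (w, 1)).reduceByKey(_ + _)   // shuffle #1
  val byCount = counts.map(_.swap).groupByKey()              // map() drops the partitioner, so shuffle #2
  byCount.collect()    // action: expect 2 shuffle map stages + 1 result stage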

Now back to where finalStage is constructed; the full source of handleJobSubmitted() is:


  private[scheduler] def handleJobSubmitted(jobId: Int,
      finalRDD: RDD[_],
      func: (TaskContext, Iterator[_]) => _,
      partitions: Array[Int],
      allowLocal: Boolean,
      callSite: CallSite,
      listener: JobListener,
      properties: Properties = null)
  {
    var finalStage: Stage = null
    try {
      // New stage creation may throw an exception if, for example, jobs are run on a
      // HadoopRDD whose underlying HDFS files have been deleted.
      finalStage = newStage(finalRDD, partitions.size, None, jobId, callSite)
    } catch {
      case e: Exception =>
        logWarning("Creating new stage failed due to exception - job: " + jobId, e)
        listener.jobFailed(e)
        return
    }
    if (finalStage != null) {
      val job = new ActiveJob(jobId, finalStage, func, partitions, callSite, listener, properties)
      clearCacheLocs()
      logInfo("Got job %s (%s) with %d output partitions (allowLocal=%s)".format(
        job.jobId, callSite.shortForm, partitions.length, allowLocal))
      logInfo("Final stage: " + finalStage + "(" + finalStage.name + ")")
      logInfo("Parents of final stage: " + finalStage.parents)
      logInfo("Missing parents: " + getMissingParentStages(finalStage))
      val shouldRunLocally =
        localExecutionEnabled && allowLocal && finalStage.parents.isEmpty && partitions.length == 1
      if (shouldRunLocally) {   // whether to run the job locally
        // Compute very short actions like first() or take() with no parent stages locally.
        listenerBus.post(SparkListenerJobStart(job.jobId, Seq.empty, properties))
        runLocally(job)
      } else {
        jobIdToActiveJob(jobId) = job
        activeJobs += job
        finalStage.resultOfJob = Some(job)
        val stageIds = jobIdToStageIds(jobId).toArray
        val stageInfos = stageIds.flatMap(id => stageIdToStage.get(id).map(_.latestInfo))
        listenerBus.post(SparkListenerJobStart(job.jobId, stageInfos, properties))
        submitStage(finalStage)   // submit the finalStage
      }
    }
    submitWaitingStages()
  }
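
For reference, the local-execution shortcut above only fires for very small jobs. A hedged example follows, assuming spark.localExecution.enabled has been turned on in the SparkConf (the setting behind localExecutionEnabled) and reusing the earlier hypothetical reduced RDD:

  // first()/take() request a single partition with allowLocal = true; with no
  // shuffle ancestry, shouldRunLocally holds and runLocally(job) is used
  sc.parallelize(1 to 100).first()

  // `reduced` has a parent shuffle stage (finalStage.parents is non-empty),
  // so even first() on it goes through submitStage(finalStage)
  reduced.first()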

The rest of the flow is fairly clear. Submitting the finalStage calls:

  private def submitStage(stage: Stage) {
    val jobId = activeJobForStage(stage) 
    if (jobId.isDefined) {
      logDebug("submitStage(" + stage + ")")
      if (!waitingStages(stage) && !runningStages(stage) && !failedStages(stage)) {
        val missing = getMissingParentStages(stage).sortBy(_.id)  // check whether any parent stages are still missing
        logDebug("missing: " + missing)
        if (missing == Nil) { // no parent stages left to compute, so submit this stage (the most ancestral one) first
          logInfo("Submitting " + stage + " (" + stage.rdd + "), which has no missing parents")
          submitMissingTasks(stage, jobId.get)
        } else {
          for (parent <- missing) { // if there are still parent stages, recurse into them
            submitStage(parent)
          }
          waitingStages += stage
        }
      }
    } else {
      abortStage(stage, "No active job for stage " + stage.id)
    }
  }
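
Applied to the hypothetical three-stage pipeline sketched earlier, the recursion could unfold roughly as follows (a hedged trace, not actual log output):

  // submitStage(finalStage)        -> missing = List(stage for shuffle #2) -> recurse
  //   submitStage(stage #2)        -> missing = List(stage for shuffle #1) -> recurse
  //     submitStage(stage #1)      -> missing == Nil -> submitMissingTasks(stage #1)
  //   stage #2 and finalStage wait in waitingStages until their parents finish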

Finally, the following code in submitMissingTasks() is reached:

      taskScheduler.submitTasks(
        new TaskSet(tasks.toArray, stage.id, stage.newAttemptId(), stage.jobId, properties)) 

The tasks are wrapped in a TaskSet and handed to the lower-level scheduler, TaskScheduler, for processing.


  *************  The End   *************




