The source code for turning a stage into a TaskSet and computing each task's preferred locations lives in DAGScheduler.submitMissingTasks().
Here is the source:
/** Called when stage's parents are available and we can now do its task. */
private def submitMissingTasks(stage: Stage, jobId: Int) {
logDebug("submitMissingTasks(" + stage + ")")
// Get our pending tasks and remember them in our pendingTasks entry
stage.pendingTasks.clear()
// First figure out the indexes of partition ids to compute.
// Take, from all of the stage's partitions, the ones that have not been computed yet and put them in partitionsToCompute
val (allPartitions: Seq[Int], partitionsToCompute: Seq[Int]) = {
stage match {
case stage: ShuffleMapStage =>
val allPartitions = 0 until stage.numPartitions
val filteredPartitions = allPartitions.filter { id => stage.outputLocs(id).isEmpty }
(allPartitions, filteredPartitions)
case stage: ResultStage =>
val job = stage.resultOfJob.get
val allPartitions = 0 until job.numPartitions
val filteredPartitions = allPartitions.filter { id => !job.finished(id) }
(allPartitions, filteredPartitions)
}
}
// Create internal accumulators if the stage has no accumulators initialized.
// Reset internal accumulators only if this stage is not partially submitted
// Otherwise, we may override existing accumulator values from some tasks
if (stage.internalAccumulators.isEmpty || allPartitions == partitionsToCompute) {
stage.resetInternalAccumulators()
}
val properties = jobIdToActiveJob.get(stage.firstJobId).map(_.properties).orNull
runningStages += stage
// SparkListenerStageSubmitted should be posted before testing whether tasks are
// serializable. If tasks are not serializable, a SparkListenerStageCompleted event
// will be posted, which should always come after a corresponding SparkListenerStageSubmitted
// event.
outputCommitCoordinator.stageStart(stage.id)
// Compute the preferred locations for the tasks
val taskIdToLocations = try {
stage match {
case s: ShuffleMapStage =>
partitionsToCompute.map { id => (id, getPreferredLocs(stage.rdd, id))}.toMap
case s: ResultStage =>
val job = s.resultOfJob.get
partitionsToCompute.map { id =>
val p = job.partitions(id)
(id, getPreferredLocs(stage.rdd, p))
}.toMap
}
} catch {
case NonFatal(e) =>
stage.makeNewStageAttempt(partitionsToCompute.size)
listenerBus.post(SparkListenerStageSubmitted(stage.latestInfo, properties))
abortStage(stage, s"Task creation failed: $e\n${e.getStackTraceString}", Some(e))
runningStages -= stage
return
}
stage.makeNewStageAttempt(partitionsToCompute.size, taskIdToLocations.values.toSeq)
listenerBus.post(SparkListenerStageSubmitted(stage.latestInfo, properties))
// TODO: Maybe we can keep the taskBinary in Stage to avoid serializing it multiple times.
// Broadcasted binary for the task, used to dispatch tasks to executors. Note that we broadcast
// the serialized copy of the RDD and for each task we will deserialize it, which means each
// task gets a different copy of the RDD. This provides stronger isolation between tasks that
// might modify state of objects referenced in their closures. This is necessary in Hadoop
// where the JobConf/Configuration object is not thread-safe.
var taskBinary: Broadcast[Array[Byte]] = null
try {
// For ShuffleMapTask, serialize and broadcast (rdd, shuffleDep).
// For ResultTask, serialize and broadcast (rdd, func).
val taskBinaryBytes: Array[Byte] = stage match {
case stage: ShuffleMapStage =>
closureSerializer.serialize((stage.rdd, stage.shuffleDep): AnyRef).array()
case stage: ResultStage =>
closureSerializer.serialize((stage.rdd, stage.resultOfJob.get.func): AnyRef).array()
}
taskBinary = sc.broadcast(taskBinaryBytes)
} catch {
// In the case of a failure during serialization, abort the stage.
case e: NotSerializableException =>
abortStage(stage, "Task not serializable: " + e.toString, Some(e))
runningStages -= stage
// Abort execution
return
case NonFatal(e) =>
abortStage(stage, s"Task serialization failed: $e\n${e.getStackTraceString}", Some(e))
runningStages -= stage
return
}
// Wrap the partitions into tasks
val tasks: Seq[Task[_]] = try {
stage match {
case stage: ShuffleMapStage =>
partitionsToCompute.map { id =>
val locs = taskIdToLocations(id) // the preferred locations for this partition
val part = stage.rdd.partitions(id) // the partition itself, of type Partition
new ShuffleMapTask(stage.id, stage.latestInfo.attemptId, // wrap it as a ShuffleMapTask
taskBinary, part, locs, stage.internalAccumulators)
}
case stage: ResultStage =>
val job = stage.resultOfJob.get
partitionsToCompute.map { id =>
val p: Int = job.partitions(id)
val part = stage.rdd.partitions(p)
val locs = taskIdToLocations(id)
new ResultTask(stage.id, stage.latestInfo.attemptId,
taskBinary, part, locs, id, stage.internalAccumulators)
}
}
} catch {
case NonFatal(e) =>
abortStage(stage, s"Task creation failed: $e\n${e.getStackTraceString}", Some(e))
runningStages -= stage
return
}
if (tasks.size > 0) {
logInfo("Submitting " + tasks.size + " missing tasks from " + stage + " (" + stage.rdd + ")")
stage.pendingTasks ++= tasks
logDebug("New pending tasks: " + stage.pendingTasks)
taskScheduler.submitTasks(new TaskSet(
tasks.toArray, stage.id, stage.latestInfo.attemptId, stage.firstJobId, properties))
stage.latestInfo.submissionTime = Some(clock.getTimeMillis())
} else {
// Because we posted SparkListenerStageSubmitted earlier, we should mark
// the stage as completed here in case there are no tasks to run
markStageAsFinished(stage, None)
val debugString = stage match {
case stage: ShuffleMapStage =>
s"Stage ${stage} is actually done; " +
s"(available: ${stage.isAvailable}," +
s"available outputs: ${stage.numAvailableOutputs}," +
s"partitions: ${stage.numPartitions})"
case stage : ResultStage =>
s"Stage ${stage} is actually done; (partitions: ${stage.numPartitions})"
}
logDebug(debugString)
}
}
Since stages are involved, here is a link describing the Stage structure: http://blog.csdn.net/liben2007/article/details/53208016
Converting a stage into a TaskSet
The conversion from a stage to tasks happens in the block that builds val tasks: Seq[Task[_]] in the source above. It branches on the stage type, producing ShuffleMapTasks for a ShuffleMapStage and ResultTasks for a ResultStage. partitionsToCompute is the set of partitions still to be computed: the stage produces exactly one task per uncomputed partition, so tasks and partitions correspond one-to-one.
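As a quick illustration (the numbers below are made up, not from the Spark source), the TaskSet ends up with exactly one task per uncomputed partition, mirroring the outputLocs(id).isEmpty filter in the code above:
// Hypothetical example: a ShuffleMapStage with 4 partitions where partition 2
// already has shuffle output, so only 3 ShuffleMapTasks would be created.
val allPartitions = 0 until 4
val alreadyComputed = Set(2) // partitions whose output already exists
val partitionsToCompute = allPartitions.filterNot(alreadyComputed)
println(partitionsToCompute) // Vector(0, 1, 3) -> 3 tasks in the TaskSet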
How a stage's preferred locations are obtained
This too is split between ShuffleMapStage and ResultStage; the relevant code is the val taskIdToLocations = try { ... } block in the source above. taskIdToLocations is of type Map[Int, Seq[TaskLocation]], keyed by partition (task) id.
val taskIdToLocations = try {
stage match {
case s: ShuffleMapStage =>
partitionsToCompute.map { id => (id, getPreferredLocs(stage.rdd, id))}.toMap // calls getPreferredLocs(rdd: RDD[_], partition: Int)
case s: ResultStage =>
val job = s.resultOfJob.get
partitionsToCompute.map { id =>
val p = job.partitions(id)
(id, getPreferredLocs(stage.rdd, p))
}.toMap
}
As you can see, the work is done mainly by getPreferredLocs(rdd: RDD[_], partition: Int). The important parts of that code are shown below.
DAGScheduler
// Returns a Seq[TaskLocation]; a TaskLocation essentially carries one piece of information: the host
def getPreferredLocs(rdd: RDD[_], partition: Int): Seq[TaskLocation] = {
getPreferredLocsInternal(rdd, partition, new HashSet)
}
private def getPreferredLocsInternal(rdd: RDD[_], partition: Int, visited: HashSet[(RDD[_], Int)]): Seq[TaskLocation] = {
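// (Omitted in this excerpt: the real implementation first checks the visited set
// and returns Nil if this (rdd, partition) pair has already been visited.)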
// If the rdd is cached, return the cached block locations directly
val cached = getCacheLocs(rdd)(partition)
if (cached.nonEmpty) {
return cached
}
// Call rdd.preferredLocations => getPreferredLocations(). For an RDD built directly on an external data source (e.g. HDFS), this already returns the addresses where the data lives.
// However, the MapPartitionsRDDs that make up most of a lineage do not implement getPreferredLocations(), so the recursion below is the key part.
val rddPrefs = rdd.preferredLocations(rdd.partitions(partition)).toList
if (rddPrefs.nonEmpty) {
return rddPrefs.map(TaskLocation(_))
}
// ***** For narrow dependencies, recurse into the parent partitions along the narrow-dependency chain and use the first non-empty location found as locs
rdd.dependencies.foreach {
case n: NarrowDependency[_] =>
for (inPart <- n.getParents(partition)) {
val locs = getPreferredLocsInternal(n.rdd, inPart, visited)
if (locs != Nil) {
return locs
}
}
case _ =>
}
Nil
}
As shown above, when nothing is cached, the preferred locations come mainly from the RDD's own implementation of getPreferredLocations(). So how does an RDD typically implement getPreferredLocations()?
Take KafkaRDD as an example:
KafkaRDD
{
override def getPreferredLocations(thePart: Partition): Seq[String] = {
// Return the host of the Kafka partition, i.e. where the source data lives
val part = thePart.asInstanceOf[KafkaRDDPartition]
Seq(part.host)
}
}
It simply returns the location (host) of the Kafka topic partition. The HadoopRDD returned by the commonly used sc.textFile() also implements getPreferredLocations(); as you would expect, it returns the locations of the underlying HDFS blocks.
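A simplified sketch of that idea (not the actual HadoopRDD source, which additionally handles cached HDFS blocks and some reflection details): the preferred hosts come straight from the Hadoop InputSplit.
HadoopRDD (simplified sketch)
{
override def getPreferredLocations(split: Partition): Seq[String] = {
// Ask the Hadoop InputSplit which hosts hold the underlying HDFS blocks
val hadoopSplit = split.asInstanceOf[HadoopPartition].inputSplit.value
hadoopSplit.getLocations.filter(_ != "localhost")
}
}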
The MapPartitionsRDDs produced by the usual transformation operators do not implement getPreferredLocations(). When getPreferredLocsInternal() is handed such an RDD, it therefore recurses up the lineage until it reaches either an ancestor RDD that has been cached, or a source RDD (such as KafkaRDD or HadoopRDD) that does implement getPreferredLocations(), and uses that RDD's partition locations as the preferred locations. (The cached case deserves a closer look later.)
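To make the recursion concrete, here is a small self-contained toy model (not Spark code; all names here are invented) that mimics how getPreferredLocsInternal walks narrow dependencies until some ancestor reports locations:
// Toy model of the location lookup, not the Spark implementation.
case class ToyRDD(name: String, parents: Seq[ToyRDD], ownLocs: Seq[String] = Nil)

// Mirrors getPreferredLocsInternal: use the RDD's own locations if it has any,
// otherwise recurse into its (narrow) parents and return the first non-empty result.
def preferredLocs(rdd: ToyRDD): Seq[String] = {
  if (rdd.ownLocs.nonEmpty) rdd.ownLocs
  else rdd.parents.iterator.map(preferredLocs).find(_.nonEmpty).getOrElse(Nil)
}

// sc.textFile(...).map(...).filter(...) style lineage: only the source knows its hosts.
val source   = ToyRDD("HadoopRDD", Nil, ownLocs = Seq("host-1", "host-2"))
val mapped   = ToyRDD("MapPartitionsRDD(map)", Seq(source))
val filtered = ToyRDD("MapPartitionsRDD(filter)", Seq(mapped))

println(preferredLocs(filtered)) // List(host-1, host-2): traced back to the source RDD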
Summary:
- getPreferredLocsInternal recursively traces back to the nearest ancestor RDD that implements getPreferredLocations() and uses that ancestor's preferred locations as the task's preferred locations.
- So in the usual case (barring caching somewhere in the middle), a task's preferred locations are simply the preferred locations of the source RDD's partition, i.e. where the data itself lives (source RDDs such as KafkaRDD and HadoopRDD report the location of their data).
That is how a task's preferred locations are computed: keep tracing back up the lineage until you reach the preferred locations of the original source. How taskScheduler.submitTasks() then makes use of these preferred locations remains to be analyzed.