The source code for turning a stage into a TaskSet and computing each task's preferred locations lives in DAGScheduler.submitMissingTasks().
Here is the source:
/** Called when stage's parents are available and we can now do its task. */
private def submitMissingTasks(stage: Stage, jobId: Int) {
logDebug("submitMissingTasks(" + stage + ")")
// Get our pending tasks and remember them in our pendingTasks entry
stage.pendingTasks.clear()
// First figure out the indexes of partition ids to compute.
// Take, from all of the stage's partitions, the ones that have not been computed yet and put them in partitionsToCompute
val (allPartitions: Seq[Int], partitionsToCompute: Seq[Int]) = {
stage match {
case stage: ShuffleMapStage =>
val allPartitions = 0 until stage.numPartitions
val filteredPartitions = allPartitions.filter { id => stage.outputLocs(id).isEmpty }
(allPartitions, filteredPartitions)
case stage: ResultStage =>
val job = stage.resultOfJob.get
val allPartitions = 0 until job.numPartitions
val filteredPartitions = allPartitions.filter { id => !job.finished(id) }
(allPartitions, filteredPartitions)
}
}
// Create internal accumulators if the stage has no accumulators initialized.
// Reset internal accumulators only if this stage is not partially submitted
// Otherwise, we may override existing accumulator values from some tasks
if (stage.internalAccumulators.isEmpty || allPartitions == partitionsToCompute) {
stage.resetInternalAccumulators()
}
val properties = jobIdToActiveJob.get(stage.firstJobId).map(_.properties).orNull
runningStages += stage
// SparkListenerStageSubmitted should be posted before testing whether tasks are
// serializable. If tasks are not serializable, a SparkListenerStageCompleted event
// will be posted, which should always come after a corresponding SparkListenerStageSubmitted
// event.
outputCommitCoordinator.stageStart(stage.id)
// Compute the preferred locations for the tasks
val taskIdToLocations = try {
stage match {
case s: ShuffleMapStage =>
partitionsToCompute.map { id => (id, getPreferredLocs(stage.rdd, id))}.toMap
case s: ResultStage =>
val job = s.resultOfJob.get
partitionsToCompute.map { id =>
val p = job.partitions(id)
(id, getPreferredLocs(stage.rdd, p))
}.toMap
}
} catch {
case NonFatal(e) =>
stage.makeNewStageAttempt(partitionsToCompute.size)
listenerBus.post(SparkListenerStageSubmitted(stage.latestInfo, properties))
abortStage(stage, s"Task creation failed: $e\n${e.getStackTraceString}", Some(e))
runningStages -= stage
return
}
stage.makeNewStageAttempt(partitionsToCompute.size, taskIdToLocations.values.toSeq)
listenerBus.post(SparkListenerStageSubmitted(stage.latestInfo, properties))
// TODO: Maybe we can keep the taskBinary in Stage to avoid serializing it multiple times.
// Broadcasted binary for the task, used to dispatch tasks to executors. Note that we broadcast
// the serialized copy of the RDD and for each task we will deserialize it, which means each
// task gets a different copy of the RDD. This provides stronger isolation between tasks that
// might modify state of objects referenced in their closures. This is necessary in Hadoop
// where the JobConf/Configuration object is not thread-safe.
var taskBinary: Broadcast[Array[Byte]] = null
try {
// For ShuffleMapTask, serialize and broadcast (rdd, shuffleDep).
// For ResultTask, serialize and broadcast (rdd, func).
val taskBinaryBytes: Array[Byte] = stage match {
case stage: ShuffleMapStage =>
closureSerializer.serialize((stage.rdd, stage.shuffleDep): AnyRef).array()
case stage: ResultStage =>
closureSerializer.serialize((stage.rdd, stage.resultOfJob.get.func): AnyRef).array()
}
taskBinary = sc.broadcast(taskBinaryBytes)
} catch {
// In the case of a failure during serialization, abort the stage.
case e: NotSerializableException =>
abortStage(stage, "Task not serializable: " + e.toString, Some(e))
runningStages -= stage
// Abort execution
return
case NonFatal(e) =>
abortStage(stage, s"Task serialization failed: $e\n${e.getStackTraceString}", Some(e))
runningStages -= stage
return
}
// Wrap the partitions into tasks
val tasks: Seq[Task[_]] = try {
stage match {
case stage: ShuffleMapStage =>
partitionsToCompute.map { id =>
val locs = taskIdToLocations(id) // the preferred locations for this partition
val part = stage.rdd.partitions(id) // the partition itself, of type Partition
new ShuffleMapTask(stage.id, stage.latestInfo.attemptId, // wrap it as a ShuffleMapTask
taskBinary, part, locs, stage.internalAccumulators)
}
case stage: ResultStage =>
val job = stage.resultOfJob.get
partitionsToCompute.map { id =>
val p: Int = job.partitions(id)
val part = stage.rdd.partitions(p)
val locs = taskIdToLocations(id)
new ResultTask(stage.id, stage.latestInfo.attemptId,
taskBinary, part, locs, id, stage.internalAccumulators)
}
}
} catch {
case NonFatal(e) =>
abortStage(stage, s"Task creation failed: $e\n${e.getStackTraceString}", Some(e))
runningStages -= stage
return
}
if (tasks.size > 0) {
logInfo("Submitting " + tasks.size + " missing tasks from " + stage + " (" + stage.rdd + ")")
stage.pendingTasks ++= tasks
logDebug("New pending tasks: " + stage.pendingTasks)
taskScheduler.submitTasks(new TaskSet(
tasks.toArray, stage.id, stage.latestInfo.attemptId, stage.firstJobId, properties))
stage.latestInfo.submissionTime = Some(clock.getTimeMillis())
} else {
// Because we posted SparkListenerStageSubmitted earlier, we should mark
// the stage as completed here in case there are no tasks to run
markStageAsFinished(stage, None)
val debugString = stage match {
case stage: ShuffleMapStage =>
s"Stage ${stage} is actually done; " +
s"(available: ${stage.isAvailable}," +
s"available outputs: ${stage.numAvailableOutputs}," +
s"partitions: ${stage.numPartitions})"
case stage : ResultStage =>
s"Stage ${stage} is actually done; (partitions: ${stage.numPartitions})"
}
logDebug(debugString)
}
}
Since stages are involved, here is a link describing the Stage structure: http://blog.csdn.net/liben2007/article/details/53208016
Converting a stage into a TaskSet
The conversion from a stage to tasks happens in the block that builds val tasks: Seq[Task[_]] in the source above. It branches on the stage type, producing ShuffleMapTasks for a ShuffleMapStage and ResultTasks for a ResultStage. partitionsToCompute is the set of partitions still to be computed: the stage produces exactly one task per uncomputed partition, so tasks and partitions correspond one-to-one.
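As a quick illustration (the numbers below are made up, not from the Spark source), the TaskSet ends up with exactly one task per uncomputed partition, mirroring the outputLocs(id).isEmpty filter in the code above:
// Hypothetical example: a ShuffleMapStage with 4 partitions where partition 2
// already has shuffle output, so only 3 ShuffleMapTasks would be created.
val allPartitions = 0 until 4
val alreadyComputed = Set(2) // partitions whose output already exists
val partitionsToCompute = allPartitions.filterNot(alreadyComputed)
println(partitionsToCompute) // Vector(0, 1, 3) -> 3 tasks in the TaskSet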
How a stage's preferred locations are obtained
This too is split between ShuffleMapStage and ResultStage; the relevant code is the val taskIdToLocations = try { ... } block in the source above. taskIdToLocations is of type Map[Int, Seq[TaskLocation]], keyed by partition (task) id.
val taskIdToLocations = try {
stage match {
case s: ShuffleMapStage =>
partitionsToCompute.map { id => (id, getPreferredLocs(stage.rdd, id))}.toMap // calls getPreferredLocs(rdd: RDD[_], partition: Int)
case s: ResultStage =>
val job = s.resultOfJob.get
partitionsToCompute.map { id =>
val p = job.partitions(id)
(id, getPreferredLocs(stage.rdd, p))
}.toMap
}
As you can see, the work is done mainly by getPreferredLocs(rdd: RDD[_], partition: Int). The important parts of that code are shown below.
DAGScheduler
// Returns a Seq[TaskLocation]; a TaskLocation essentially carries one piece of information: the host
def getPreferredLocs(rdd: RDD[_], partition: Int): Seq[TaskLocation] = {
getPreferredLocsInternal(rdd, partition, new HashSet)
}
private def getPreferredLocsInternal(rdd: RDD[_], partition: Int, visited: HashSet[(RDD[_], Int)]): Seq[TaskLocation] = {
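// (Omitted in this excerpt: the real implementation first checks the visited set
// and returns Nil if this (rdd, partition) pair has already been visited.)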
// If the rdd is cached, return the cached block locations directly
val cached = getCacheLocs(rdd)(partition)
if (cached.nonEmpty) {
return cached
}
// Call rdd.preferredLocations => getPreferredLocations(). For an RDD built directly on an external data source (e.g. HDFS), this already returns the addresses where the data lives.
// However, the MapPartitionsRDDs that make up most of a lineage do not implement getPreferredLocations(), so the recursion below is the key part.
val rddPrefs = rdd.preferredLocations(rdd.partitions(partition)).toList
if (rddPrefs.nonEmpty) {
return rddPrefs.map(TaskLocation(_))
}
// ***** For narrow dependencies, recurse into the parent partitions along the narrow-dependency chain and use the first non-empty location found as locs
rdd.dependencies.foreach {
case n: NarrowDependency[_] =>
for (inPart <- n.getParents(partition)) {
val locs = getPreferredLocsInternal(n.rdd, inPart, visited)
if (locs != Nil) {
return locs
}
}
case _ =>
}
Nil
}
As shown above, when nothing is cached, the preferred locations come mainly from the RDD's own implementation of getPreferredLocations(). So how does an RDD typically implement getPreferredLocations()?
Take KafkaRDD as an example:
KafkaRDD
{
override def getPreferredLocations(thePart: Partition): Seq[String] = {
// Return the host of the Kafka partition, i.e. where the source data lives
val part = thePart.asInstanceOf[KafkaRDDPartition]
Seq(part.host)
}
}
It simply returns the location (host) of the Kafka topic partition. The HadoopRDD returned by the commonly used sc.textFile() also implements getPreferredLocations(); as you would expect, it returns the locations of the underlying HDFS blocks.
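A simplified sketch of that idea (not the actual HadoopRDD source, which additionally handles cached HDFS blocks and some reflection details): the preferred hosts come straight from the Hadoop InputSplit.
HadoopRDD (simplified sketch)
{
override def getPreferredLocations(split: Partition): Seq[String] = {
// Ask the Hadoop InputSplit which hosts hold the underlying HDFS blocks
val hadoopSplit = split.asInstanceOf[HadoopPartition].inputSplit.value
hadoopSplit.getLocations.filter(_ != "localhost")
}
}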
The MapPartitionsRDDs produced by the usual transformation operators do not implement getPreferredLocations(). When getPreferredLocsInternal() is handed such an RDD, it therefore recurses up the lineage until it reaches either an ancestor RDD that has been cached, or a source RDD (such as KafkaRDD or HadoopRDD) that does implement getPreferredLocations(), and uses that RDD's partition locations as the preferred locations. (The cached case deserves a closer look later.)
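To make the recursion concrete, here is a small self-contained toy model (not Spark code; all names here are invented) that mimics how getPreferredLocsInternal walks narrow dependencies until some ancestor reports locations:
// Toy model of the location lookup, not the Spark implementation.
case class ToyRDD(name: String, parents: Seq[ToyRDD], ownLocs: Seq[String] = Nil)

// Mirrors getPreferredLocsInternal: use the RDD's own locations if it has any,
// otherwise recurse into its (narrow) parents and return the first non-empty result.
def preferredLocs(rdd: ToyRDD): Seq[String] = {
  if (rdd.ownLocs.nonEmpty) rdd.ownLocs
  else rdd.parents.iterator.map(preferredLocs).find(_.nonEmpty).getOrElse(Nil)
}

// sc.textFile(...).map(...).filter(...) style lineage: only the source knows its hosts.
val source   = ToyRDD("HadoopRDD", Nil, ownLocs = Seq("host-1", "host-2"))
val mapped   = ToyRDD("MapPartitionsRDD(map)", Seq(source))
val filtered = ToyRDD("MapPartitionsRDD(filter)", Seq(mapped))

println(preferredLocs(filtered)) // List(host-1, host-2): traced back to the source RDD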
Summary:
- getPreferredLocsInternal recursively traces back to the nearest ancestor RDD that implements getPreferredLocations() and uses that ancestor's preferred locations as the task's preferred locations.
- So in the usual case (barring caching somewhere in the middle), a task's preferred locations are simply the preferred locations of the source RDD's partition, i.e. where the data itself lives (source RDDs such as KafkaRDD and HadoopRDD report the location of their data).
That is how a task's preferred locations are computed: keep tracing back up the lineage until you reach the preferred locations of the original source. How taskScheduler.submitTasks() then makes use of these preferred locations remains to be analyzed.