Overall flow of job submission (the original flow chart is not reproduced here). In short:
RDD action → SparkContext.runJob → DAGScheduler.runJob/submitJob → JobSubmitted event → handleJobSubmitted → submitStage → submitMissingTasks → TaskScheduler.submitTasks → backend.reviveOffers → DriverEndpoint.makeOffers → launchTasks → CoarseGrainedExecutorBackend → Executor.launchTask
SparkContext: scheduler creation
- // Create and start the scheduler
- val (sched, ts) = SparkContext.createTaskScheduler(this, master)
- _schedulerBackend = sched
- _taskScheduler = ts
- _dagScheduler = new DAGScheduler(this)
- _heartbeatReceiver.ask[Boolean](TaskSchedulerIsSet)
- private def createTaskScheduler(
- sc: SparkContext,
- master: String): (SchedulerBackend, TaskScheduler) = {
- // Regular expression used for local[N] and local[*] master formats
- val LOCAL_N_REGEX = """local\[([0-9]+|\*)\]""".r
- // Regular expression for local[N, maxRetries], used in tests with failing tasks
- val LOCAL_N_FAILURES_REGEX = """local\[([0-9]+|\*)\s*,\s*([0-9]+)\]""".r
- // Regular expression for simulating a Spark cluster of [N, cores, memory] locally
- val LOCAL_CLUSTER_REGEX = """local-cluster\[\s*([0-9]+)\s*,\s*([0-9]+)\s*,\s*([0-9]+)\s*]""".r
- // Regular expression for connecting to Spark deploy clusters
- val SPARK_REGEX = """spark://(.*)""".r
- // Regular expression for connection to Mesos cluster by mesos:// or zk:// url
- val MESOS_REGEX = """(mesos|zk)://.*""".r
- // Regular expression for connection to Simr cluster
- val SIMR_REGEX = """simr://(.*)""".r
-
- // When running locally, don't try to re-execute tasks on failure.
- val MAX_LOCAL_TASK_FAILURES = 1
-
- master match {
- case "local" =>
- val scheduler = new TaskSchedulerImpl(sc, MAX_LOCAL_TASK_FAILURES, isLocal = true)
- val backend = new LocalBackend(sc.getConf, scheduler, 1)
- scheduler.initialize(backend)
- (backend, scheduler)
-
- case LOCAL_N_REGEX(threads) =>
- def localCpuCount: Int = Runtime.getRuntime.availableProcessors()
- // local[*] estimates the number of cores on the machine; local[N] uses exactly N threads.
- val threadCount = if (threads == "*") localCpuCount else threads.toInt
- if (threadCount <= 0) {
- throw new SparkException(s"Asked to run locally with $threadCount threads")
- }
- val scheduler = new TaskSchedulerImpl(sc, MAX_LOCAL_TASK_FAILURES, isLocal = true)
- val backend = new LocalBackend(sc.getConf, scheduler, threadCount)
- scheduler.initialize(backend)
- (backend, scheduler)
-
- case LOCAL_N_FAILURES_REGEX(threads, maxFailures) =>
- def localCpuCount: Int = Runtime.getRuntime.availableProcessors()
- // local[*, M] means the number of cores on the computer with M failures
- // local[N, M] means exactly N threads with M failures
- val threadCount = if (threads == "*") localCpuCount else threads.toInt
- val scheduler = new TaskSchedulerImpl(sc, maxFailures.toInt, isLocal = true)
- val backend = new LocalBackend(sc.getConf, scheduler, threadCount)
- scheduler.initialize(backend)
- (backend, scheduler)
-
- case SPARK_REGEX(sparkUrl) =>
- val scheduler = new TaskSchedulerImpl(sc)
- val masterUrls = sparkUrl.split(",").map("spark://" + _)
- val backend = new SparkDeploySchedulerBackend(scheduler, sc, masterUrls)
- scheduler.initialize(backend)
- (backend, scheduler)
-
- case LOCAL_CLUSTER_REGEX(numSlaves, coresPerSlave, memoryPerSlave) =>
- // Check to make sure memory requested <= memoryPerSlave. Otherwise Spark will just hang.
- val memoryPerSlaveInt = memoryPerSlave.toInt
- if (sc.executorMemory > memoryPerSlaveInt) {
- throw new SparkException(
- "Asked to launch cluster with %d MB RAM / worker but requested %d MB/worker".format(
- memoryPerSlaveInt, sc.executorMemory))
- }
-
- val scheduler = new TaskSchedulerImpl(sc)
- val localCluster = new LocalSparkCluster(
- numSlaves.toInt, coresPerSlave.toInt, memoryPerSlaveInt, sc.conf)
- val masterUrls = localCluster.start()
- val backend = new SparkDeploySchedulerBackend(scheduler, sc, masterUrls)
- scheduler.initialize(backend)
- backend.shutdownCallback = (backend: SparkDeploySchedulerBackend) => {
- localCluster.stop()
- }
- (backend, scheduler)
-
- case "yarn-standalone" | "yarn-cluster" =>
- if (master == "yarn-standalone") {
- logWarning(
- "\"yarn-standalone\" is deprecated as of Spark 1.0. Use \"yarn-cluster\" instead.")
- }
- val scheduler = try {
- val clazz = Utils.classForName("org.apache.spark.scheduler.cluster.YarnClusterScheduler")
- val cons = clazz.getConstructor(classOf[SparkContext])
- cons.newInstance(sc).asInstanceOf[TaskSchedulerImpl]
- } catch {
- // TODO: Enumerate the exact reasons why it can fail
- // But irrespective of it, it means we cannot proceed !
- case e: Exception => {
- throw new SparkException("YARN mode not available ?", e)
- }
- }
- val backend = try {
- val clazz =
- Utils.classForName("org.apache.spark.scheduler.cluster.YarnClusterSchedulerBackend")
- val cons = clazz.getConstructor(classOf[TaskSchedulerImpl], classOf[SparkContext])
- cons.newInstance(scheduler, sc).asInstanceOf[CoarseGrainedSchedulerBackend]
- } catch {
- case e: Exception => {
- throw new SparkException("YARN mode not available ?", e)
- }
- }
- scheduler.initialize(backend)
- (backend, scheduler)
-
- case "yarn-client" =>
- val scheduler = try {
- val clazz = Utils.classForName("org.apache.spark.scheduler.cluster.YarnScheduler")
- val cons = clazz.getConstructor(classOf[SparkContext])
- cons.newInstance(sc).asInstanceOf[TaskSchedulerImpl]
-
- } catch {
- case e: Exception => {
- throw new SparkException("YARN mode not available ?", e)
- }
- }
-
- val backend = try {
- val clazz =
- Utils.classForName("org.apache.spark.scheduler.cluster.YarnClientSchedulerBackend")
- val cons = clazz.getConstructor(classOf[TaskSchedulerImpl], classOf[SparkContext])
- cons.newInstance(scheduler, sc).asInstanceOf[CoarseGrainedSchedulerBackend]
- } catch {
- case e: Exception => {
- throw new SparkException("YARN mode not available ?", e)
- }
- }
-
- scheduler.initialize(backend)
- (backend, scheduler)
-
- case mesosUrl @ MESOS_REGEX(_) =>
- MesosNativeLibrary.load()
- val scheduler = new TaskSchedulerImpl(sc)
- val coarseGrained = sc.conf.getBoolean("spark.mesos.coarse", false)
- val url = mesosUrl.stripPrefix("mesos://") // strip scheme from raw Mesos URLs
- val backend = if (coarseGrained) {
- new CoarseMesosSchedulerBackend(scheduler, sc, url, sc.env.securityManager)
- } else {
- new MesosSchedulerBackend(scheduler, sc, url)
- }
- scheduler.initialize(backend)
- (backend, scheduler)
-
- case SIMR_REGEX(simrUrl) =>
- val scheduler = new TaskSchedulerImpl(sc)
- val backend = new SimrSchedulerBackend(scheduler, sc, simrUrl)
- scheduler.initialize(backend)
- (backend, scheduler)
-
- case _ =>
- throw new SparkException("Could not parse Master URL: '" + master + "'")
- }
- }
- }
An RDD action ends up calling sc.runJob.
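For example, count() in the RDD class (also quoted in a comment further below) is just a thin wrapper around sc.runJob:

def count(): Long = sc.runJob(this, Utils.getIteratorSize _).sum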
1. SparkContext's runJob method:
- def runJob[T, U: ClassTag](
-     rdd: RDD[T], // the concrete RDD instance the action was called on
-     func: (TaskContext, Iterator[T]) => U, // the action's logic, applied to each partition's iterator
-     partitions: Seq[Int], // the partition ids to run on, ranging from 0 to partitions.size - 1
-     resultHandler: (Int, U) => Unit): Unit = { // handler applied to every task's result
- if (stopped.get()) {
- throw new IllegalStateException("SparkContext has been shutdown")
- }
- val callSite = getCallSite
- val cleanedFunc = clean(func)
- logInfo("Starting job: " + callSite.shortForm)
- if (conf.getBoolean("spark.logLineage", false)) {
- logInfo("RDD's recursive dependencies:\n" + rdd.toDebugString)
- }
- dagScheduler.runJob(rdd, cleanedFunc, partitions, callSite, resultHandler, localProperties.get)
- progressBar.foreach(_.finishAll())
- rdd.doCheckpoint()
- }
2. DAGScheduler's runJob method:
- def runJob[T, U](
- rdd: RDD[T],
- func: (TaskContext, Iterator[T]) => U,
- partitions: Seq[Int],
- callSite: CallSite,
- resultHandler: (Int, U) => Unit,
- properties: Properties): Unit = {
- val start = System.nanoTime
- val waiter = submitJob(rdd, func, partitions, callSite, resultHandler, properties)
-     // Wait for the job to finish.
- waiter.awaitResult() match {
- case JobSucceeded =>
- logInfo("Job %d finished: %s, took %f s".format
- (waiter.jobId, callSite.shortForm, (System.nanoTime - start) / 1e9))
- case JobFailed(exception: Exception) =>
- logInfo("Job %d failed: %s, took %f s".format
- (waiter.jobId, callSite.shortForm, (System.nanoTime - start) / 1e9))
- // SPARK-8644: Include user stack trace in exceptions coming from DAGScheduler.
-
- val callerStackTrace = Thread.currentThread().getStackTrace.tail
- exception.setStackTrace(exception.getStackTrace ++ callerStackTrace)
- throw exception
- }
- }
- def submitJob[T, U](
- rdd: RDD[T],
- func: (TaskContext, Iterator[T]) => U,
- partitions: Seq[Int],
- callSite: CallSite,
- resultHandler: (Int, U) => Unit,
- properties: Properties): JobWaiter[U] = {
- // Check to make sure we are not launching a task on a partition that does not exist.
-
- val maxPartitions = rdd.partitions.length
- partitions.find(p => p >= maxPartitions || p < 0).foreach { p =>
- throw new IllegalArgumentException(
- "Attempting to access a non-existent partition: " + p + ". " +
- "Total number of partitions: " + maxPartitions)
- }
-
- val jobId = nextJobId.getAndIncrement()
- if (partitions.size == 0) {
- // Return immediately if the job is running 0 tasks
-
- return new JobWaiter[U](this, jobId, 0, resultHandler)
- }
-
- assert(partitions.size > 0)
- val func2 = func.asInstanceOf[(TaskContext, Iterator[_]) => _]
- val waiter = new JobWaiter(this, jobId, partitions.size, resultHandler)
-     eventProcessLoop.post(JobSubmitted( // post a JobSubmitted event onto the event queue
- jobId, rdd, func2, partitions.toArray, callSite, waiter,
- SerializationUtils.clone(properties)))
- waiter
- }
The submitJob method posts a JobSubmitted event onto a queue (this path does not use the Akka actor model; since 1.4 it went back to the plain event-queue approach used before 1.0). So where is that queue consumed? See below.
At the end of the DAGScheduler class there is this snippet:
- // Start the event thread and register the metrics source at the end of the constructor
-
- env.metricsSystem.registerSource(metricsSource)
- eventProcessLoop.start()
DAGSchedulerEventProcessLoop extends EventLoop; here is EventLoop's start method:
- // A dedicated thread keeps taking events off the queue and calls onReceive() for each one
- private val eventThread = new Thread(name) {
- setDaemon(true)
-
- override def run(): Unit = {
- try {
- while (!stopped.get) {
- val event = eventQueue.take()
- try {
- onReceive(event)
- } catch {
- case NonFatal(e) => {
- try {
- onError(e)
- } catch {
- case NonFatal(e) => logError("Unexpected error in " + name, e)
- }
- }
- }
- }
- } catch {
- case ie: InterruptedException => // exit even if eventQueue is not empty
-
- case NonFatal(e) => logError("Unexpected error in " + name, e)
- }
- }
-
- }
-
- def start(): Unit = {
- if (stopped.get) {
- throw new IllegalStateException(name + " has already been stopped")
- }
- // Call onStart before starting the event thread to make sure it happens before onReceive
-
- onStart()
- eventThread.start()
- }
Next, the onReceive() method:
- /**
- * The main event loop of the DAG scheduler.
- */
- override def onReceive(event: DAGSchedulerEvent): Unit = {
- val timerContext = timer.time()
- try {
- doOnReceive(event)
- } finally {
- timerContext.stop()
- }
- }
- private def doOnReceive(event: DAGSchedulerEvent): Unit = event match {
- case JobSubmitted(jobId, rdd, func, partitions, callSite, listener, properties) =>
- dagScheduler.handleJobSubmitted(jobId, rdd, func, partitions, callSite, listener, properties)
-
- case MapStageSubmitted(jobId, dependency, callSite, listener, properties) =>
- dagScheduler.handleMapStageSubmitted(jobId, dependency, callSite, listener, properties)
-
- case StageCancelled(stageId) =>
- dagScheduler.handleStageCancellation(stageId)
-
- case JobCancelled(jobId) =>
- dagScheduler.handleJobCancellation(jobId)
-
- case JobGroupCancelled(groupId) =>
- dagScheduler.handleJobGroupCancelled(groupId)
-
- case AllJobsCancelled =>
- dagScheduler.doCancelAllJobs()
-
- case ExecutorAdded(execId, host) =>
- dagScheduler.handleExecutorAdded(execId, host)
-
- case ExecutorLost(execId) =>
- dagScheduler.handleExecutorLost(execId, fetchFailed = false)
-
- case BeginEvent(task, taskInfo) =>
- dagScheduler.handleBeginEvent(task, taskInfo)
-
- case GettingResultEvent(taskInfo) =>
- dagScheduler.handleGetTaskResult(taskInfo)
-
- case completion @ CompletionEvent(task, reason, _, _, taskInfo, taskMetrics) =>
- dagScheduler.handleTaskCompletion(completion)
-
- case TaskSetFailed(taskSet, reason, exception) =>
- dagScheduler.handleTaskSetFailed(taskSet, reason, exception)
-
- case ResubmitFailedStages =>
- dagScheduler.resubmitFailedStages()
- }
The JobSubmitted case above is handled by DAGScheduler.handleJobSubmitted:
private[scheduler] def handleJobSubmitted(jobId: Int,
finalRDD: RDD[_],
func: (TaskContext, Iterator[_]) => _,
partitions: Array[Int],
callSite: CallSite,
listener: JobListener,
properties: Properties) {
var finalStage: ResultStage = null
try {
// New stage creation may throw an exception if, for example, jobs are run on a
// HadoopRDD whose underlying HDFS files have been deleted.
      // Build the finalStage: every job has one, and the other stages are derived from it.
      // The RDD handed in here is the job's last RDD, so it is wrapped in a ResultStage.
      // (In RDD: def count(): Long = sc.runJob(this, Utils.getIteratorSize _).sum)
finalStage = newResultStage(finalRDD, func, partitions, jobId, callSite)
} catch {
case e: Exception =>
logWarning("Creating new stage failed due to exception - job: " + jobId, e)
listener.jobFailed(e)
return
}
val job = new ActiveJob(jobId, finalStage, callSite, listener, properties)
clearCacheLocs()
logInfo("Got job %s (%s) with %d output partitions".format(
job.jobId, callSite.shortForm, partitions.length))
logInfo("Final stage: " + finalStage + " (" + finalStage.name + ")")
logInfo("Parents of final stage: " + finalStage.parents)
logInfo("Missing parents: " + getMissingParentStages(finalStage))
val jobSubmissionTime = clock.getTimeMillis()
jobIdToActiveJob(jobId) = job
activeJobs += job
finalStage.resultOfJob = Some(job)
val stageIds = jobIdToStageIds(jobId).toArray
val stageInfos = stageIds.flatMap(id => stageIdToStage.get(id).map(_.latestInfo))
listenerBus.post(
SparkListenerJobStart(job.jobId, jobSubmissionTime, stageInfos, properties))
submitStage(finalStage)
submitWaitingStages()
}
Next, step into the submitStage method:
/** Submits stage, but first recursively submits any missing parents. */
  // The first call is made with the ResultStage; the ShuffleMapStages built from the parent
  // RDDs are then submitted recursively from within this method.
  private def submitStage(stage: Stage) {
val jobId = activeJobForStage(stage)
if (jobId.isDefined) {
logDebug("submitStage(" + stage + ")")
if (!waitingStages(stage) && !runningStages(stage) && !failedStages(stage)) {
        val missing = getMissingParentStages(stage).sortBy(_.id) // collect the missing parent stages
logDebug("missing: " + missing)
if (missing.isEmpty) {
logInfo("Submitting " + stage + " (" + stage.rdd + "), which has no missing parents")
submitMissingTasks(stage, jobId.get)
} else {
for (parent <- missing) {
submitStage(parent)
}
waitingStages += stage
}
}
} else {
abortStage(stage, "No active job for stage " + stage.id, None)
}
}
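The parent lookup used above, getMissingParentStages, is where stage boundaries come from: a ShuffleDependency introduces a parent ShuffleMapStage, while narrow dependencies stay inside the current stage. A sketch along the lines of the 1.5 source (helper names such as getShuffleMapStage and getCacheLocs are from that version):

private def getMissingParentStages(stage: Stage): List[Stage] = {
  val missing = new HashSet[Stage]
  val visited = new HashSet[RDD[_]]
  val waitingForVisit = new Stack[RDD[_]]
  def visit(rdd: RDD[_]) {
    if (!visited(rdd)) {
      visited += rdd
      val rddHasUncachedPartitions = getCacheLocs(rdd).contains(Nil)
      if (rddHasUncachedPartitions) {
        for (dep <- rdd.dependencies) {
          dep match {
            case shufDep: ShuffleDependency[_, _, _] =>
              // A shuffle dependency marks a stage boundary: its map side becomes a parent stage.
              val mapStage = getShuffleMapStage(shufDep, stage.firstJobId)
              if (!mapStage.isAvailable) {
                missing += mapStage
              }
            case narrowDep: NarrowDependency[_] =>
              // Narrow dependencies are traversed within the same stage.
              waitingForVisit.push(narrowDep.rdd)
          }
        }
      }
    }
  }
  waitingForVisit.push(stage.rdd)
  while (waitingForVisit.nonEmpty) {
    visit(waitingForVisit.pop())
  }
  missing.toList
}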
Continue into the submitMissingTasks method:
/** Called when stage's parents are available and we can now do its task. */
// ends by submitting the tasks through taskScheduler.submitTasks
private def submitMissingTasks(stage: Stage, jobId: Int) {
logDebug("submitMissingTasks(" + stage + ")")
// Get our pending tasks and remember them in our pendingTasks entry
stage.pendingTasks.clear()
// First figure out the indexes of partition ids to compute.
val (allPartitions: Seq[Int], partitionsToCompute: Seq[Int]) = {
    stage match { // a stage is either a ShuffleMapStage or a ResultStage
case stage: ShuffleMapStage =>
val allPartitions = 0 until stage.numPartitions
val filteredPartitions = allPartitions.filter { id => stage.outputLocs(id).isEmpty }
(allPartitions, filteredPartitions)
case stage: ResultStage =>
val job = stage.resultOfJob.get
val allPartitions = 0 until job.numPartitions
val filteredPartitions = allPartitions.filter { id => !job.finished(id) }
(allPartitions, filteredPartitions)
}
}
// Create internal accumulators if the stage has no accumulators initialized.
// Reset internal accumulators only if this stage is not partially submitted
// Otherwise, we may override existing accumulator values from some tasks
if (stage.internalAccumulators.isEmpty || allPartitions == partitionsToCompute) {
stage.resetInternalAccumulators()
}
val properties = jobIdToActiveJob.get(stage.firstJobId).map(_.properties).orNull
runningStages += stage
// SparkListenerStageSubmitted should be posted before testing whether tasks are
// serializable. If tasks are not serializable, a SparkListenerStageCompleted event
// will be posted, which should always come after a corresponding SparkListenerStageSubmitted
// event.
outputCommitCoordinator.stageStart(stage.id)
val taskIdToLocations = try {
stage match {
case s: ShuffleMapStage =>
partitionsToCompute.map { id => (id, getPreferredLocs(stage.rdd, id))}.toMap
case s: ResultStage =>
val job = s.resultOfJob.get
partitionsToCompute.map { id =>
val p = s.partitions(id)
(id, getPreferredLocs(stage.rdd, p))
}.toMap
}
} catch {
case NonFatal(e) =>
stage.makeNewStageAttempt(partitionsToCompute.size)
listenerBus.post(SparkListenerStageSubmitted(stage.latestInfo, properties))
abortStage(stage, s"Task creation failed: $e\n${e.getStackTraceString}", Some(e))
runningStages -= stage
return
}
stage.makeNewStageAttempt(partitionsToCompute.size, taskIdToLocations.values.toSeq)
listenerBus.post(SparkListenerStageSubmitted(stage.latestInfo, properties))
// TODO: Maybe we can keep the taskBinary in Stage to avoid serializing it multiple times.
// Broadcasted binary for the task, used to dispatch tasks to executors. Note that we broadcast
// the serialized copy of the RDD and for each task we will deserialize it, which means each
// task gets a different copy of the RDD. This provides stronger isolation between tasks that
// might modify state of objects referenced in their closures. This is necessary in Hadoop
// where the JobConf/Configuration object is not thread-safe.
var taskBinary: Broadcast[Array[Byte]] = null
try {
// For ShuffleMapTask, serialize and broadcast (rdd, shuffleDep).
// For ResultTask, serialize and broadcast (rdd, func).
val taskBinaryBytes: Array[Byte] = stage match {
case stage: ShuffleMapStage =>
closureSerializer.serialize((stage.rdd, stage.shuffleDep): AnyRef).array()
case stage: ResultStage =>
closureSerializer.serialize((stage.rdd, stage.func): AnyRef).array()
}
taskBinary = sc.broadcast(taskBinaryBytes)
} catch {
// In the case of a failure during serialization, abort the stage.
case e: NotSerializableException =>
abortStage(stage, "Task not serializable: " + e.toString, Some(e))
runningStages -= stage
// Abort execution
return
case NonFatal(e) =>
abortStage(stage, s"Task serialization failed: $e\n${e.getStackTraceString}", Some(e))
runningStages -= stage
return
}
val tasks: Seq[Task[_]] = try {
stage match {
case stage: ShuffleMapStage =>
partitionsToCompute.map { id =>
val locs = taskIdToLocations(id)
val part = stage.rdd.partitions(id)
new ShuffleMapTask(stage.id, stage.latestInfo.attemptId,
taskBinary, part, locs, stage.internalAccumulators)
}
case stage: ResultStage =>
val job = stage.resultOfJob.get
partitionsToCompute.map { id =>
val p: Int = stage.partitions(id)
val part = stage.rdd.partitions(p)
val locs = taskIdToLocations(id)
new ResultTask(stage.id, stage.latestInfo.attemptId,
taskBinary, part, locs, id, stage.internalAccumulators)
}
}
} catch {
case NonFatal(e) =>
abortStage(stage, s"Task creation failed: $e\n${e.getStackTraceString}", Some(e))
runningStages -= stage
return
}
if (tasks.size > 0) {
logInfo("Submitting " + tasks.size + " missing tasks from " + stage + " (" + stage.rdd + ")")
stage.pendingTasks ++= tasks
logDebug("New pending tasks: " + stage.pendingTasks)
    // Wrap the stage's tasks (one per partition) into a TaskSet and submit them via the TaskScheduler.
taskScheduler.submitTasks(new TaskSet(
tasks.toArray, stage.id, stage.latestInfo.attemptId, stage.firstJobId, properties))
stage.latestInfo.submissionTime = Some(clock.getTimeMillis())
} else {
// Because we posted SparkListenerStageSubmitted earlier, we should mark
// the stage as completed here in case there are no tasks to run
markStageAsFinished(stage, None)
val debugString = stage match {
case stage: ShuffleMapStage =>
s"Stage ${stage} is actually done; " +
s"(available: ${stage.isAvailable}," +
s"available outputs: ${stage.numAvailableOutputs}," +
s"partitions: ${stage.numPartitions})"
case stage : ResultStage =>
s"Stage ${stage} is actually done; (partitions: ${stage.numPartitions})"
}
logDebug(debugString)
}
}
We won't dig further into this method here; instead, let's look at taskScheduler.submitTasks:
override def submitTasks(taskSet: TaskSet) {
val tasks = taskSet.tasks
logInfo("Adding task set " + taskSet.id + " with " + tasks.length + " tasks")
this.synchronized {
    // Create a TaskSetManager. It tracks all tasks of this TaskSet; as tasks finish, the
    // taskScheduler removes them from the manager.
val manager = createTaskSetManager(taskSet, maxTaskFailures)
val stage = taskSet.stageId
val stageTaskSets =
taskSetsByStageIdAndAttempt.getOrElseUpdate(stage, new HashMap[Int, TaskSetManager])
stageTaskSets(taskSet.stageAttemptId) = manager
val conflictingTaskSet = stageTaskSets.exists { case (_, ts) =>
ts.taskSet != taskSet && !ts.isZombie
}
if (conflictingTaskSet) {
throw new IllegalStateException(s"more than one active taskSet for stage $stage:" +
s" ${stageTaskSets.toSeq.map{_._2.taskSet.id}.mkString(",")}")
}
    schedulableBuilder.addTaskSetManager(manager, manager.taskSet.properties) // add the manager to the rootPool; the pool (FIFO or FAIR) is attached when the schedulableBuilder is initialized
if (!isLocal && !hasReceivedTask) {
starvationTimer.scheduleAtFixedRate(new TimerTask() {
override def run() {
if (!hasLaunchedTask) {
logWarning("Initial job has not accepted any resources; " +
"check your cluster UI to ensure that workers are registered " +
"and have sufficient resources")
} else {
this.cancel()
}
}
}, STARVATION_TIMEOUT_MS, STARVATION_TIMEOUT_MS)
}
hasReceivedTask = true
}
  // Ask the backend to kick off execution. backend is the concrete SchedulerBackend
  // implementation, e.g. CoarseGrainedSchedulerBackend in yarn-cluster mode.
backend.reviveOffers()
}
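The rootPool and the schedulableBuilder (FIFO or FAIR) mentioned in the comment above are set up in TaskSchedulerImpl.initialize(), which createTaskScheduler invoked at the very beginning (scheduler.initialize(backend)). A sketch of the 1.x code:

def initialize(backend: SchedulerBackend) {
  this.backend = backend
  // The root pool holds all TaskSetManagers; its scheduling mode is FIFO or FAIR.
  rootPool = new Pool("", schedulingMode, 0, 0)
  schedulableBuilder = schedulingMode match {
    case SchedulingMode.FIFO => new FIFOSchedulableBuilder(rootPool)
    case SchedulingMode.FAIR => new FairSchedulableBuilder(rootPool, conf)
  }
  schedulableBuilder.buildPools()
}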
CoarseGrainedSchedulerBackend:
override def reviveOffers() {
driverEndpoint.send(ReviveOffers)
}
So reviveOffers simply sends a ReviveOffers message: the schedulerBackend works as an actor-style message endpoint. Its start() method is called when the TaskScheduler starts up:
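As a side note, the call into the backend happens in TaskSchedulerImpl.start(); roughly, as a sketch of the 1.x source (the speculation branch is trimmed):

override def start() {
  // Starting the task scheduler starts the scheduler backend, which in turn creates
  // the driver endpoint shown below.
  backend.start()

  if (!isLocal && conf.getBoolean("spark.speculation", false)) {
    // ... schedule periodic checkSpeculatableTasks() calls (omitted in this sketch) ...
  }
}

The backend's own start() then sets up the driver endpoint: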
override def start() {
val properties = new ArrayBuffer[(String, String)]
for ((key, value) <- scheduler.sc.conf.getAll) {
if (key.startsWith("spark.")) {
properties += ((key, value))
}
}
// TODO (prashant) send conf instead of properties
  // set up the driver endpoint used for communication with executors
driverEndpoint = rpcEnv.setupEndpoint(ENDPOINT_NAME, createDriverEndpoint(properties))
}
//create the driver endpoint (plays the actor role)
protected def createDriverEndpoint(properties: Seq[(String, String)]): DriverEndpoint = {
new DriverEndpoint(rpcEnv, properties)
}
Now take a look at DriverEndpoint's receive method:
override def receive: PartialFunction[Any, Unit] = {
case StatusUpdate(executorId, taskId, state, data) =>
scheduler.statusUpdate(taskId, state, data.value)
if (TaskState.isFinished(state)) {
executorDataMap.get(executorId) match {
case Some(executorInfo) =>
executorInfo.freeCores += scheduler.CPUS_PER_TASK
makeOffers(executorId)
case None =>
// Ignoring the update since we don't know about the executor.
logWarning(s"Ignored task status update ($taskId state $state) " +
s"from unknown executor with ID $executorId")
}
}
    case ReviveOffers => // besides explicit revives from submitTasks, the endpoint periodically (spark.scheduler.revive.interval, default 1s) sends ReviveOffers to itself, so makeOffers() runs regularly
makeOffers()
case KillTask(taskId, executorId, interruptThread) =>
executorDataMap.get(executorId) match {
case Some(executorInfo) =>
executorInfo.executorEndpoint.send(KillTask(taskId, executorId, interruptThread))
case None =>
// Ignoring the task kill since the executor is not registered.
logWarning(s"Attempted to kill task $taskId for unknown executor $executorId.")
}
}
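For reference, that periodic self-revive is set up in DriverEndpoint.onStart(); a sketch along the lines of the 1.5 source (field names such as reviveThread come from that version):

override def onStart() {
  // Periodically revive offers so that delay scheduling keeps making progress.
  val reviveIntervalMs = conf.getTimeAsMs("spark.scheduler.revive.interval", "1s")
  reviveThread.scheduleAtFixedRate(new Runnable {
    override def run(): Unit = Utils.tryLogNonFatalError {
      Option(self).foreach(_.send(ReviveOffers))
    }
  }, 0, reviveIntervalMs, TimeUnit.MILLISECONDS)
}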
The ReviveOffers case leads into makeOffers():
// Make fake resource offers on all executors
private def makeOffers() {
// Filter out executors under killing
    val activeExecutors = executorDataMap.filterKeys(!executorsPendingToRemove.contains(_)) // keep only executors that are not pending removal
val workOffers = activeExecutors.map { case (id, executorData) =>
      new WorkerOffer(id, executorData.executorHost, executorData.freeCores) // one WorkerOffer per executor, collected into a Seq below
}.toSeq
launchTasks(scheduler.resourceOffers(workOffers))
}
The key step is TaskScheduler's resourceOffers() method:
def resourceOffers(offers: Seq[WorkerOffer]): Seq[Seq[TaskDescription]] = synchronized { // called from the SchedulerBackend; builds the TaskDescriptions that will be sent to the executors
// Mark each slave as alive and remember its hostname
// Also track if new executor is added
var newExecAvail = false
  // Walk through the offered worker resources and update the executor-related mappings.
for (o <- offers) {
executorIdToHost(o.executorId) = o.host
activeExecutorIds += o.executorId
if (!executorsByHost.contains(o.host)) {
executorsByHost(o.host) = new HashSet[String]()
executorAdded(o.executorId, o.host)
newExecAvail = true
}
for (rack <- getRackForHost(o.host)) {
hostsByRack.getOrElseUpdate(rack, new HashSet[String]()) += o.host
}
}
  // Randomly shuffle offers to avoid always placing tasks on the same set of workers.
  val shuffledOffers = Random.shuffle(offers)
// Build a list of tasks to assign to each worker.
val tasks = shuffledOffers.map(o => new ArrayBuffer[TaskDescription](o.cores))
val availableCpus = shuffledOffers.map(o => o.cores).toArray
  // getSortedTaskSetQueue returns the TaskSetManagers in scheduling order (an ArrayBuffer[TaskSetManager]);
  // tasks will then be offered with locality preference.
  val sortedTaskSets = rootPool.getSortedTaskSetQueue
for (taskSet <- sortedTaskSets) {
logDebug("parentName: %s, name: %s, runningTasks: %s".format(
taskSet.parent.name, taskSet.name, taskSet.runningTasks))
if (newExecAvail) {
taskSet.executorAdded()
}
}
// Take each TaskSet in our scheduling order, and then offer it each node in increasing order
// of locality levels so that it gets a chance to launch local tasks on all of them.
// NOTE: the preferredLocality order: PROCESS_LOCAL, NODE_LOCAL, NO_PREF, RACK_LOCAL, ANY
var launchedTask = false
for (taskSet <- sortedTaskSets; maxLocality <- taskSet.myLocalityLevels) {
do {
launchedTask = resourceOfferSingleTaskSet(
taskSet, maxLocality, shuffledOffers, availableCpus, tasks)
} while (launchedTask)
}
if (tasks.size > 0) {
hasLaunchedTask = true
}
return tasks
}
Now look at resourceOfferSingleTaskSet():
private def resourceOfferSingleTaskSet(
taskSet: TaskSetManager,
maxLocality: TaskLocality,
shuffledOffers: Seq[WorkerOffer],
availableCpus: Array[Int],
tasks: Seq[ArrayBuffer[TaskDescription]]) : Boolean = {
var launchedTask = false
  for (i <- 0 until shuffledOffers.size) { // iterate over every executor's offer
val execId = shuffledOffers(i).executorId
val host = shuffledOffers(i).host
    if (availableCpus(i) >= CPUS_PER_TASK) { // the executor still has at least CPUS_PER_TASK free cores (default 1)
try {
        for (task <- taskSet.resourceOffer(execId, host, maxLocality)) { // pick the most suitable task for this executor
          // resourceOffer: given a single resource (host + executor) and a locality constraint,
          // return a suitable task, if any. Internally the TaskSetManager adjusts its locality
          // requirement based on how long ago it last launched a task: a long gap relaxes the
          // requirement (e.g. from PROCESS_LOCAL down to NODE_LOCAL), a short gap keeps it strict.
          // This delay-scheduling heuristic gives tasks a better chance to run at their best
          // locality, since resources tend to reach the TaskSetManager in small batches over time.
tasks(i) += task
val tid = task.taskId
taskIdToTaskSetManager(tid) = taskSet
taskIdToExecutorId(tid) = execId
executorsByHost(host) += execId
availableCpus(i) -= CPUS_PER_TASK
assert(availableCpus(i) >= 0)
launchedTask = true
}
} catch {
case e: TaskNotSerializableException =>
logError(s"Resource offer failed, task set ${taskSet.name} was not serializable")
// Do not offer resources for this task, but don't throw an error to allow other
// task sets to be submitted.
return launchedTask
}
}
}
return launchedTask
}
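The locality relaxation described in that comment lives in TaskSetManager. As a simplified sketch of the idea (the real getAllowedLocalityLevel in 1.x also prunes finished tasks and checks that pending tasks still exist at each level):

private def getAllowedLocalityLevel(curTime: Long): TaskLocality.TaskLocality = {
  // Move to a less strict locality level whenever no task has been launched at the
  // current level for longer than the configured wait (spark.locality.wait.*).
  while (curTime - lastLaunchTime >= localityWaits(currentLocalityIndex) &&
      currentLocalityIndex < myLocalityLevels.length - 1) {
    lastLaunchTime += localityWaits(currentLocalityIndex)
    currentLocalityIndex += 1
  }
  myLocalityLevels(currentLocalityIndex)
}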
The launchTasks() method:
// Launch tasks returned by a set of resource offers
private def launchTasks(tasks: Seq[Seq[TaskDescription]]) {
for (task <- tasks.flatten) {
val serializedTask = ser.serialize(task)
if (serializedTask.limit >= akkaFrameSize - AkkaUtils.reservedSizeBytes) {
scheduler.taskIdToTaskSetManager.get(task.taskId).foreach { taskSetMgr =>
try {
var msg = "Serialized task %s:%d was %d bytes, which exceeds max allowed: " +
"spark.akka.frameSize (%d bytes) - reserved (%d bytes). Consider increasing " +
"spark.akka.frameSize or using broadcast variables for large values."
msg = msg.format(task.taskId, task.index, serializedTask.limit, akkaFrameSize,
AkkaUtils.reservedSizeBytes)
taskSetMgr.abort(msg)
} catch {
case e: Exception => logError("Exception in error callback", e)
}
}
}
else {
val executorData = executorDataMap(task.executorId)
executorData.freeCores -= scheduler.CPUS_PER_TASK
      executorData.executorEndpoint.send(LaunchTask(new SerializableBuffer(serializedTask))) // look up the executor's endpoint and ship the serialized task to its ExecutorBackend
}
}
}
On the executor side, find CoarseGrainedExecutorBackend's receive method:
override def receive: PartialFunction[Any, Unit] = {
case RegisteredExecutor =>
logInfo("Successfully registered with driver")
val (hostname, _) = Utils.parseHostPort(hostPort)
executor = new Executor(executorId, hostname, env, userClassPath, isLocal = false)
case RegisterExecutorFailed(message) =>
logError("Slave registration failed: " + message)
System.exit(1)
  case LaunchTask(data) => // a task arrives from the driver
if (executor == null) {
logError("Received LaunchTask command but executor was null")
System.exit(1)
} else {
val taskDesc = ser.deserialize[TaskDescription](data.value)
logInfo("Got assigned task " + taskDesc.taskId)
      executor.launchTask(this, taskId = taskDesc.taskId, attemptNumber = taskDesc.attemptNumber, // hand the task off to the Executor
taskDesc.name, taskDesc.serializedTask)
}
case KillTask(taskId, _, interruptThread) =>
if (executor == null) {
logError("Received KillTask command but executor was null")
System.exit(1)
} else {
executor.killTask(taskId, interruptThread)
}
case StopExecutor =>
logInfo("Driver commanded a shutdown")
executor.stop()
stop()
rpcEnv.shutdown()
}
The Executor holds a thread pool; every task dispatched to it is run on its own thread.
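A sketch of Executor.launchTask along the lines of the 1.x source: the task is wrapped in a TaskRunner and handed to the thread pool.

def launchTask(
    context: ExecutorBackend,
    taskId: Long,
    attemptNumber: Int,
    taskName: String,
    serializedTask: ByteBuffer): Unit = {
  // Wrap the serialized task in a TaskRunner and let the cached thread pool run it.
  val tr = new TaskRunner(context, taskId = taskId, attemptNumber = attemptNumber, taskName,
    serializedTask)
  runningTasks.put(taskId, tr)
  threadPool.execute(tr)
}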
Brief summary:
DAGScheduler:
1. handleJobSubmitted(): wraps the last RDD of the submitted job into a ResultStage.
2. submitStage(): starting from that ResultStage, recursively submits the ShuffleMapStages built from its parent RDDs.
3. submitMissingTasks(): turns each stage into ShuffleMapTasks or ResultTasks, wraps them in a TaskSet, and hands the set to the TaskScheduler.
TaskScheduler:
1. submitTasks(): wraps the TaskSet in a TaskSetManager, adds it to the scheduling pool, and asks the backend to revive offers.
2. launchTasks(): once resourceOffers() has produced the TaskDescription collections, sends them on to the ExecutorBackends.
SchedulerBackend:
1. makeOffers(): calls TaskScheduler.resourceOffers(workOffers), where workOffers describe the executors registered by the ExecutorBackends.
2. resourceOffers(): shuffles the workOffers, fetches the ordered TaskSetManagers from the scheduling pool, and calls each TaskSetManager's resourceOffer() to pick the most suitable task for every node, wrapping the results as TaskDescriptions.
Executor:
1. A worker thread deserializes the Task in its run() method and invokes the task's run().
2. run() calls the runTask() of the concrete subclass (ShuffleMapTask or ResultTask).
3. runTask() calls the RDD's iterator() method, which computes the parent RDDs recursively.
4. In iterator(), if the RDD's storage level is not NONE, the RDD was cached and should be available from the BlockManager, so CacheManager.getOrCompute() is used to fetch or compute it (see the sketch below).
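For step 4, the cache check corresponds roughly to RDD.iterator() in the 1.x source; a sketch:

// Serve the partition from the cache layer when a storage level is set,
// otherwise compute it (or read it back from a checkpoint).
final def iterator(split: Partition, context: TaskContext): Iterator[T] = {
  if (storageLevel != StorageLevel.NONE) {
    SparkEnv.get.cacheManager.getOrCompute(this, split, context, storageLevel)
  } else {
    computeOrReadCheckpoint(split, context)
  }
}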
Source: http://blog.itpub.net/29754888/viewspace-1813916/