Spark source code reading, part 2: Spark job execution

This article explains how, once the driver process calls SparkContext.runJob, the job ends up being submitted and run in a distributed fashion on the executor machines. The whole process involves two kinds of processes: the driver and the executors.
1. Job submission and stage division
def runJob[T, U: ClassTag](
    rdd: RDD[T],
    func: (TaskContext, Iterator[T]) => U,
    partitions: Seq[Int],
    resultHandler: (Int, U) => Unit): Unit = {
  if (stopped.get()) {
    throw new IllegalStateException("SparkContext has been shutdown")
  }
  val callSite = getCallSite
  val cleanedFunc = clean(func)
  logInfo("Starting job: " + callSite.shortForm)
  if (conf.getBoolean("spark.logLineage", false)) {
    logInfo("RDD's recursive dependencies:\n" + rdd.toDebugString)
  }
  dagScheduler.runJob(rdd, cleanedFunc, partitions, callSite, resultHandler, localProperties.get)
  progressBar.foreach(_.finishAll())
  rdd.doCheckpoint()
}
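For orientation, here is a hedged, minimal driver-side sketch (standard Spark 2.x API, local mode assumed; the object name RunJobEntryPoint is made up) showing how an ordinary action reaches this runJob: count() simply runs one function per partition through sc.runJob and sums the per-partition results on the driver.
import org.apache.spark.{SparkConf, SparkContext}

object RunJobEntryPoint {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("runJob-demo").setMaster("local[2]"))
    val rdd = sc.parallelize(1 to 100, numSlices = 4)
    // count() is built on top of sc.runJob: one task per partition, results summed on the driver.
    val n = rdd.count()
    // Roughly the same thing expressed directly against the runJob overload that runs on all partitions.
    val sizes = sc.runJob(rdd, (iter: Iterator[Int]) => iter.size)
    assert(n == sizes.sum.toLong)
    sc.stop()
  }
}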
First comes dagScheduler.runJob:
def runJob[T, U](
    rdd: RDD[T],
    func: (TaskContext, Iterator[T]) => U,
    partitions: Seq[Int],
    callSite: CallSite,
    resultHandler: (Int, U) => Unit,
    properties: Properties): Unit = {
  val start = System.nanoTime
  val waiter = submitJob(rdd, func, partitions, callSite, resultHandler, properties)
  ThreadUtils.awaitReady(waiter.completionFuture, Duration.Inf)
  waiter.completionFuture.value.get match {
    case scala.util.Success(_) =>
      logInfo("Job %d finished: %s, took %f s".format
        (waiter.jobId, callSite.shortForm, (System.nanoTime - start) / 1e9))
    case scala.util.Failure(exception) =>
      logInfo("Job %d failed: %s, took %f s".format
        (waiter.jobId, callSite.shortForm, (System.nanoTime - start) / 1e9))
      // SPARK-8644: Include user stack trace in exceptions coming from DAGScheduler.
      val callerStackTrace = Thread.currentThread().getStackTrace.tail
      exception.setStackTrace(exception.getStackTrace ++ callerStackTrace)
      throw exception
  }
}
After calling submitJob, the DAGScheduler simply waits for the job to complete. Let's look at the submitJob implementation:
def submitJob[T, U](
    rdd: RDD[T],
    func: (TaskContext, Iterator[T]) => U,
    partitions: Seq[Int],
    callSite: CallSite,
    resultHandler: (Int, U) => Unit,
    properties: Properties): JobWaiter[U] = {
  // Check to make sure we are not launching a task on a partition that does not exist.
  val maxPartitions = rdd.partitions.length
  partitions.find(p => p >= maxPartitions || p < 0).foreach { p =>
    throw new IllegalArgumentException(
      "Attempting to access a non-existent partition: " + p + ". " +
        "Total number of partitions: " + maxPartitions)
  }

  val jobId = nextJobId.getAndIncrement()
  if (partitions.size == 0) {
    // Return immediately if the job is running 0 tasks
    return new JobWaiter[U](this, jobId, 0, resultHandler)
  }

  assert(partitions.size > 0)
  val func2 = func.asInstanceOf[(TaskContext, Iterator[_]) => _]
  val waiter = new JobWaiter(this, jobId, partitions.size, resultHandler)
  eventProcessLoop.post(JobSubmitted(
    jobId, rdd, func2, partitions.toArray, callSite, waiter,
    SerializationUtils.clone(properties)))
  waiter
}
submitJob first generates a jobId, then posts a JobSubmitted message to the DAGScheduler event queue via eventProcessLoop.post.
When the DAGScheduler event loop receives the JobSubmitted message, it dispatches it to the handler handleJobSubmitted(jobId, rdd, func, partitions, callSite, listener, properties):
private[scheduler] def handleJobSubmitted(jobId: Int,
    finalRDD: RDD[_],
    func: (TaskContext, Iterator[_]) => _,
    partitions: Array[Int],
    callSite: CallSite,
    listener: JobListener,
    properties: Properties) {
  var finalStage: ResultStage = null
  try {
    // New stage creation may throw an exception if, for example, jobs are run on a
    // HadoopRDD whose underlying HDFS files have been deleted.
    finalStage = createResultStage(finalRDD, func, partitions, jobId, callSite)
  } catch {
    case e: Exception =>
      logWarning("Creating new stage failed due to exception - job: " + jobId, e)
      listener.jobFailed(e)
      return
  }

  val job = new ActiveJob(jobId, finalStage, callSite, listener, properties)
  clearCacheLocs()
  logInfo("Got job %s (%s) with %d output partitions".format(
    job.jobId, callSite.shortForm, partitions.length))
  logInfo("Final stage: " + finalStage + " (" + finalStage.name + ")")
  logInfo("Parents of final stage: " + finalStage.parents)
  logInfo("Missing parents: " + getMissingParentStages(finalStage))

  val jobSubmissionTime = clock.getTimeMillis()
  jobIdToActiveJob(jobId) = job
  activeJobs += job
  finalStage.setActiveJob(job)
  val stageIds = jobIdToStageIds(jobId).toArray
  val stageInfos = stageIds.flatMap(id => stageIdToStage.get(id).map(_.latestInfo))
  listenerBus.post(
    SparkListenerJobStart(job.jobId, jobSubmissionTime, stageInfos, properties))
  submitStage(finalStage)
}
handleJobSubmitted does three things: it creates the final stage, creates an ActiveJob, and submits the final stage.
(1) Creating the final stage
private def createResultStage(
    rdd: RDD[_],
    func: (TaskContext, Iterator[_]) => _,
    partitions: Array[Int],
    jobId: Int,
    callSite: CallSite): ResultStage = {
  val parents = getOrCreateParentStages(rdd, jobId)
  val id = nextStageId.getAndIncrement()
  val stage = new ResultStage(id, rdd, func, partitions, parents, jobId, callSite)
  stageIdToStage(id) = stage
  updateJobIdStageIdMaps(jobId, stage)
  stage
}
This function is important: here the DAGScheduler analyzes the RDD DAG and generates stages. How are stages divided? By checking whether a ShuffleDependency exists; wherever one does, a new stage must be created. ShuffledRDD, CoGroupedRDD and SubtractedRDD declare a ShuffleDependency, and the related operators include sortByKey, reduceByKey, groupByKey, cogroup and join.
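As a quick, hedged illustration (assuming a SparkContext named sc; the file path and variable names are made up), a word-count style job contains exactly one ShuffleDependency, so the DAG is cut into one ShuffleMapStage feeding the final ResultStage:
// Everything up to reduceByKey is connected by narrow dependencies and stays in one stage;
// reduceByKey introduces a ShuffledRDD with a ShuffleDependency, which starts a new stage.
val lines  = sc.textFile("hdfs:///tmp/input.txt")   // hypothetical input path
val words  = lines.flatMap(_.split(" "))
val pairs  = words.map(word => (word, 1))
val counts = pairs.reduceByKey(_ + _)   // shuffle boundary => parent ShuffleMapStage
counts.collect()                        // action => ResultStage built by createResultStage
// toDebugString prints the lineage; the indentation step marks the stage boundary.
println(counts.toDebugString)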
private def getOrCreateParentStages(rdd: RDD[_], firstJobId: Int): List[Stage] = {
  getShuffleDependencies(rdd).map { shuffleDep =>
    getOrCreateShuffleMapStage(shuffleDep, firstJobId)
  }.toList
}
Note that this creates every stage upstream of the final stage in one pass: getOrCreateShuffleMapStage also registers any missing ancestor shuffle stages, not just the immediate parents.
(2) Creating the ActiveJob
new ActiveJob(jobId, finalStage, callSite, listener, properties)
(3) Submitting the final stage
submitStage(finalStage)
private def submitStage(stage: Stage) {
  val jobId = activeJobForStage(stage)
  if (jobId.isDefined) {
    logDebug("submitStage(" + stage + ")")
    if (!waitingStages(stage) && !runningStages(stage) && !failedStages(stage)) {
      val missing = getMissingParentStages(stage).sortBy(_.id)
      logDebug("missing: " + missing)
      if (missing.isEmpty) {
        logInfo("Submitting " + stage + " (" + stage.rdd + "), which has no missing parents")
        submitMissingTasks(stage, jobId.get)
      } else {
        for (parent <- missing) {
          submitStage(parent)
        }
        waitingStages += stage
      }
    }
  } else {
    abortStage(stage, "No active job for stage " + stage.id, None)
  }
}
Two important points follow from this function. First, a stage can be in one of three states (waiting to run, running, failed), each tracked in its own set. Second, the stage to run is found by starting from the result stage and recursively walking up through missing parent stages until a stage with no missing parents is reached (initially a leaf stage at the bottom of the DAG); that stage is submitted first, while the current stage is parked in waitingStages until its parents finish.
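To make the submission order concrete, here is a hedged, self-contained toy model of that parent-first walk (simplified names, not Spark's actual classes): for a chain stage0 <- stage1 <- stage2 (the result stage), stage0 runs first and stage1 and stage2 wait.
// Toy model of submitStage's parent-first walk (illustrative only, not Spark code).
case class ToyStage(id: Int, parents: List[ToyStage])

object StageWalk {
  def main(args: Array[String]): Unit = {
    val waiting = scala.collection.mutable.LinkedHashSet[ToyStage]()
    def submit(stage: ToyStage): Unit = {
      val missing = stage.parents.sortBy(_.id)   // stand-in for getMissingParentStages
      if (missing.isEmpty) println(s"run tasks of stage ${stage.id}")
      else {
        missing.foreach(submit)                  // recurse into parents first
        waiting += stage                         // park this stage until its parents finish
      }
    }
    val stage0 = ToyStage(0, Nil)
    val stage1 = ToyStage(1, List(stage0))
    val stage2 = ToyStage(2, List(stage1))       // the final/result stage
    submit(stage2)
    println("waiting: " + waiting.map(_.id).mkString(", "))   // prints: waiting: 1, 2
  }
}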

2. Task creation and dispatch
submitMissingTasks is responsible for creating tasks.
Following the stage division above, a stage is either a ShuffleMapStage or a ResultStage, and the tasks created for it are correspondingly ShuffleMapTasks or ResultTasks.
Consider this code snippet:
val tasks: Seq[Task[_]] = try {
  val serializedTaskMetrics = closureSerializer.serialize(stage.latestInfo.taskMetrics).array()
  stage match {
    case stage: ShuffleMapStage =>
      stage.pendingPartitions.clear()
      partitionsToCompute.map { id =>
        val locs = taskIdToLocations(id)
        val part = stage.rdd.partitions(id)
        stage.pendingPartitions += id
        new ShuffleMapTask(stage.id, stage.latestInfo.attemptId,
          taskBinary, part, locs, properties, serializedTaskMetrics, Option(jobId),
          Option(sc.applicationId), sc.applicationAttemptId)
      }

    case stage: ResultStage =>
      partitionsToCompute.map { id =>
        val p: Int = stage.partitions(id)
        val part = stage.rdd.partitions(p)
        val locs = taskIdToLocations(id)
        new ResultTask(stage.id, stage.latestInfo.attemptId,
          taskBinary, part, locs, id, properties, serializedTaskMetrics,
          Option(jobId), Option(sc.applicationId), sc.applicationAttemptId)
      }
  }
}
With narrow dependencies, a single stage can contain multiple RDDs, and each RDD has multiple partitions. As the code shows, a ShuffleMapStage generates one ShuffleMapTask per partition to compute, so tasks run in parallel at partition granularity.
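A hedged example of how partition counts drive task counts (standard RDD API, SparkContext sc assumed; numbers are illustrative): the map side of the shuffle gets one ShuffleMapTask per input partition, while the result stage gets one ResultTask per post-shuffle partition.
val input   = sc.parallelize(1 to 1000, numSlices = 8)       // 8 partitions
val pairs   = input.map(x => (x % 10, x))                    // narrow dependency: still 8 partitions
val reduced = pairs.reduceByKey(_ + _, numPartitions = 4)    // shuffle into 4 partitions
reduced.collect()
// The ShuffleMapStage runs 8 ShuffleMapTasks (one per input partition);
// the ResultStage runs 4 ResultTasks (one per post-shuffle partition).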
taskScheduler.submitTasks(new TaskSet(tasks.toArray, stage.id, stage.latestInfo.attemptId, jobId, properties))
Now it is the TaskScheduler's turn: the tasks of a stage are grouped into a TaskSet, and the TaskSet is submitted.
As discussed earlier when analyzing how an application runs, the TaskScheduler is created according to the master URL and deploy mode via SparkContext.createTaskScheduler(this, master, deployMode); in standalone mode the result is a TaskSchedulerImpl. Let's look at TaskSchedulerImpl.submitTasks:
override def submitTasks(taskSet: TaskSet) {
  val tasks = taskSet.tasks
  logInfo("Adding task set " + taskSet.id + " with " + tasks.length + " tasks")
  this.synchronized {
    val manager = createTaskSetManager(taskSet, maxTaskFailures)
    val stage = taskSet.stageId
    val stageTaskSets =
      taskSetsByStageIdAndAttempt.getOrElseUpdate(stage, new HashMap[Int, TaskSetManager])
    stageTaskSets(taskSet.stageAttemptId) = manager
    val conflictingTaskSet = stageTaskSets.exists { case (_, ts) =>
      ts.taskSet != taskSet && !ts.isZombie
    }
    if (conflictingTaskSet) {
      throw new IllegalStateException(s"more than one active taskSet for stage $stage:" +
        s" ${stageTaskSets.toSeq.map{_._2.taskSet.id}.mkString(",")}")
    }
    schedulableBuilder.addTaskSetManager(manager, manager.taskSet.properties)

    if (!isLocal && !hasReceivedTask) {
      starvationTimer.scheduleAtFixedRate(new TimerTask() {
        override def run() {
          if (!hasLaunchedTask) {
            logWarning("Initial job has not accepted any resources; " +
              "check your cluster UI to ensure that workers are registered " +
              "and have sufficient resources")
          } else {
            this.cancel()
          }
        }
      }, STARVATION_TIMEOUT_MS, STARVATION_TIMEOUT_MS)
    }
    hasReceivedTask = true
  }
  backend.reviveOffers()
}
Its main work is to create a TaskSetManager for the TaskSet via createTaskSetManager, register it with the schedulableBuilder, and then use the scheduler backend to send a ReviveOffers message to the driver (backend.reviveOffers()).
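What reviveOffers then triggers on the driver side is the subject of task dispatch; as a hedged, self-contained toy model (simplified names, not Spark's actual classes), the idea is: the driver endpoint receives ReviveOffers, turns the free cores of registered executors into offers, asks the task scheduler to match pending tasks against those offers, and launches the chosen tasks on executors.
// Toy model only: class and message names here are simplified stand-ins, not Spark's real ones.
case class WorkerOffer(executorId: String, host: String, cores: Int)
case object ReviveOffers

class ToyDriverEndpoint(executorFreeCores: Map[String, (String, Int)],
                        assignTasks: Seq[WorkerOffer] => Seq[String]) {
  def receive(msg: Any): Unit = msg match {
    case ReviveOffers => makeOffers()
  }
  private def makeOffers(): Unit = {
    // Build one offer per executor from its currently free cores.
    val offers = executorFreeCores.toSeq.map { case (id, (host, cores)) => WorkerOffer(id, host, cores) }
    // In Spark the matching is done by TaskSchedulerImpl.resourceOffers; here it is just a callback.
    val launched = assignTasks(offers)
    launched.foreach(desc => println(s"launching task: $desc"))
  }
}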