This article explains how, after the driver process executes SparkContext's runJob, the job is ultimately submitted for distributed execution on the executor machines. The whole flow involves two kinds of processes: the driver and the executors.
1. Job submission and stage division
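A job is only triggered when an action is called on an RDD; transformations by themselves do nothing. A minimal driver-side sketch (the master URL, app name and sample data are assumptions chosen purely for illustration):

import org.apache.spark.{SparkConf, SparkContext}

object RunJobDemo {
  def main(args: Array[String]): Unit = {
    // Assumed local setup, only for illustration.
    val conf = new SparkConf().setMaster("local[2]").setAppName("runJob-demo")
    val sc = new SparkContext(conf)

    // 4 partitions => the final stage will contain 4 ResultTasks.
    val rdd = sc.parallelize(1 to 100, 4).map(_ * 2)

    // collect() is an action: internally it calls sc.runJob, which is where
    // the submission flow analyzed in this article begins.
    val result = rdd.collect()
    println(result.length)

    sc.stop()
  }
}

Every such action eventually funnels into SparkContext.runJob, shown below: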
def runJob[T, U: ClassTag](
    rdd: RDD[T],
    func: (TaskContext, Iterator[T]) => U,
    partitions: Seq[Int],
    resultHandler: (Int, U) => Unit): Unit = {
  if (stopped.get()) {
    throw new IllegalStateException("SparkContext has been shutdown")
  }
  val callSite = getCallSite
  val cleanedFunc = clean(func)
  logInfo("Starting job: " + callSite.shortForm)
  if (conf.getBoolean("spark.logLineage", false)) {
    logInfo("RDD's recursive dependencies:\n" + rdd.toDebugString)
  }
  dagScheduler.runJob(rdd, cleanedFunc, partitions, callSite, resultHandler,
    localProperties.get)
  progressBar.foreach(_.finishAll())
  rdd.doCheckpoint()
}
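A side note on clean(func): it runs Spark's ClosureCleaner over the user function so the closure can be serialized and shipped to executors. A hypothetical example of why that matters (WordFilter is not Spark code, just an illustration):

import org.apache.spark.rdd.RDD

// Referencing the keyword field directly inside the lambda would capture the whole
// WordFilter instance in the closure; copying it to a local first means only the
// String is captured, so the closure stays small and serializable.
class WordFilter(keyword: String) {
  def keep(lines: RDD[String]): RDD[String] = {
    val k = keyword
    lines.filter(line => line.contains(k))
  }
}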
The first step of the submission is dagScheduler.runJob:
def runJob[T, U](
    rdd: RDD[T],
    func: (TaskContext, Iterator[T]) => U,
    partitions: Seq[Int],
    callSite: CallSite,
    resultHandler: (Int, U) => Unit,
    properties: Properties): Unit = {
  val start = System.nanoTime
  val waiter = submitJob(rdd, func, partitions, callSite, resultHandler, properties)
  ThreadUtils.awaitReady(waiter.completionFuture, Duration.Inf)
  waiter.completionFuture.value.get match {
    case scala.util.Success(_) =>
      logInfo("Job %d finished: %s, took %f s".format(
        waiter.jobId, callSite.shortForm, (System.nanoTime - start) / 1e9))
    case scala.util.Failure(exception) =>
      logInfo("Job %d failed: %s, took %f s".format(
        waiter.jobId, callSite.shortForm, (System.nanoTime - start) / 1e9))
      // SPARK-8644: Include user stack trace in exceptions coming from DAGScheduler.
      val callerStackTrace = Thread.currentThread().getStackTrace.tail
      exception.setStackTrace(exception.getStackTrace ++ callerStackTrace)
      throw exception
  }
}
After calling submitJob, the DAGScheduler simply waits for the job to complete. Let's look at the submitJob implementation:
def submitJob[T, U](
    rdd: RDD[T],
    func: (TaskContext, Iterator[T]) => U,
    partitions: Seq[Int],
    callSite: CallSite,
    resultHandler: (Int, U) => Unit,
    properties: Properties): JobWaiter[U] = {
  // Check to make sure we are not launching a task on a partition that does not exist.
  val maxPartitions = rdd.partitions.length
  partitions.find(p => p >= maxPartitions || p < 0).foreach { p =>
    throw new IllegalArgumentException(
      "Attempting to access a non-existent partition: " + p + ". " +
        "Total number of partitions: " + maxPartitions)
  }
  val jobId = nextJobId.getAndIncrement()
  if (partitions.size == 0) {
    // Return immediately if the job is running 0 tasks
    return new JobWaiter[U](this, jobId, 0, resultHandler)
  }
  assert(partitions.size > 0)
  val func2 = func.asInstanceOf[(TaskContext, Iterator[_]) => _]
  val waiter = new JobWaiter(this, jobId, partitions.size, resultHandler)
  eventProcessLoop.post(JobSubmitted(
    jobId, rdd, func2, partitions.toArray, callSite, waiter,
    SerializationUtils.clone(properties)))
  waiter
}
submitJob first generates a jobId and then posts a JobSubmitted message to the DAGScheduler's event queue via eventProcessLoop.post. The event loop thread dequeues the JobSubmitted message and dispatches it to the handler handleJobSubmitted(jobId, rdd, func, partitions, callSite, listener, properties):
private[scheduler] def handleJobSubmitted(jobId: Int,
    finalRDD: RDD[_],
    func: (TaskContext, Iterator[_]) => _,
    partitions: Array[Int],
    callSite: CallSite,
    listener: JobListener,
    properties: Properties) {
  var finalStage: ResultStage = null
  try {
    // New stage creation may throw an exception if, for example, jobs are run on a
    // HadoopRDD whose underlying HDFS files have been deleted.
    finalStage = createResultStage(finalRDD, func, partitions, jobId, callSite)
  } catch {
    case e: Exception =>
      logWarning("Creating new stage failed due to exception - job: " + jobId, e)
      listener.jobFailed(e)
      return
  }
  val job = new ActiveJob(jobId, finalStage, callSite, listener, properties)
  clearCacheLocs()
  logInfo("Got job %s (%s) with %d output partitions".format(
    job.jobId, callSite.shortForm, partitions.length))
  logInfo("Final stage: " + finalStage + " (" + finalStage.name + ")")
  logInfo("Parents of final stage: " + finalStage.parents)
  logInfo("Missing parents: " + getMissingParentStages(finalStage))
  val jobSubmissionTime = clock.getTimeMillis()
  jobIdToActiveJob(jobId) = job
  activeJobs += job
  finalStage.setActiveJob(job)
  val stageIds = jobIdToStageIds(jobId).toArray
  val stageInfos = stageIds.flatMap(id => stageIdToStage.get(id).map(_.latestInfo))
  listenerBus.post(
    SparkListenerJobStart(job.jobId, jobSubmissionTime, stageInfos, properties))
  submitStage(finalStage)
}
handleJobSubmitted does three things: create the final stage, create an ActiveJob, and submit the final stage.
(1) Create the final stage
private def createResultStage(
    rdd: RDD[_],
    func: (TaskContext, Iterator[_]) => _,
    partitions: Array[Int],
    jobId: Int,
    callSite: CallSite): ResultStage = {
  val parents = getOrCreateParentStages(rdd, jobId)
  val id = nextStageId.getAndIncrement()
  val stage = new ResultStage(id, rdd, func, partitions, parents, jobId, callSite)
  stageIdToStage(id) = stage
  updateJobIdStageIdMaps(jobId, stage)
  stage
}
This function is important: it shows how the DAGScheduler analyzes the RDD DAG to generate stages. How are stages divided? By checking for a ShuffleDependency: wherever one exists, a new stage has to be created. ShuffleRDD, CoGroupedRDD and SubtractedRDD return a ShuffleDependency; the related operators include sortByKey, reduceByKey, groupByKey, cogroup, join and cartesian.
private def getOrCreateParentStages(rdd: RDD[_], firstJobId: Int): List[Stage] = {
  getShuffleDependencies(rdd).map { shuffleDep =>
    getOrCreateShuffleMapStage(shuffleDep, firstJobId)
  }.toList
}
Note that all stages ahead of the final stage are created here in one pass: getOrCreateShuffleMapStage recursively creates any missing ancestor stages as well.
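As a concrete illustration of the division (a minimal sketch, assuming a local SparkContext named sc), a job with one shuffle is cut into exactly two stages:

// reduceByKey introduces a ShuffleDependency, so this DAG is divided into:
//   ShuffleMapStage: parallelize -> map          (narrow dependencies only)
//   ResultStage    : reduceByKey -> collect      (reads the shuffle output)
val words = sc.parallelize(Seq("a", "b", "a", "c"), 2)
val counts = words.map(w => (w, 1)).reduceByKey(_ + _)
counts.collect()
// counts.toDebugString prints this lineage with the shuffle boundary marked.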
(2) Create an ActiveJob
new ActiveJob(jobId, finalStage, callSite, listener, properties)
(3) Submit the final stage
submitStage(finalStage)
private def submitStage(stage: Stage) {
  val jobId = activeJobForStage(stage)
  if (jobId.isDefined) {
    logDebug("submitStage(" + stage + ")")
    if (!waitingStages(stage) && !runningStages(stage) && !failedStages(stage)) {
      val missing = getMissingParentStages(stage).sortBy(_.id)
      logDebug("missing: " + missing)
      if (missing.isEmpty) {
        logInfo("Submitting " + stage + " (" + stage.rdd + "), which has no missing parents")
        submitMissingTasks(stage, jobId.get)
      } else {
        for (parent <- missing) {
          submitStage(parent)
        }
        waitingStages += stage
      }
    }
  } else {
    abortStage(stage, "No active job for stage " + stage.id, None)
  }
}
Two important points can be read from this function. First, every stage can be in one of three states (waiting, running, failed), each tracked by its own set. Second, the stage to run is chosen by recursively walking up the parent stages with the result stage as the root, until a stage with no missing parents is found (initially the leaf stages at the bottom of the DAG); those stages are submitted first, while the stages that still depend on them are parked in waitingStages.
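For the two-stage example above, the recursion would unfold roughly like this (a simplified trace, not actual Spark log output):

// submitStage(ResultStage 1)
//   getMissingParentStages -> List(ShuffleMapStage 0)   // shuffle output not computed yet
//   submitStage(ShuffleMapStage 0)
//     getMissingParentStages -> Nil                      // leaf stage, nothing missing
//     submitMissingTasks(ShuffleMapStage 0)              // stage 0 enters runningStages
//   waitingStages += ResultStage 1                       // resubmitted once stage 0 finishes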
2. Task creation and distribution
submitMissingTasks is responsible for creating the tasks.
According to the stage division above, a stage is either a ShuffleMapStage or a ResultStage; the corresponding tasks created for it are ShuffleMapTask and ResultTask respectively.
Here is the relevant code snippet:
val tasks: Seq[Task[_]] = try {
  val serializedTaskMetrics = closureSerializer.serialize(stage.latestInfo.taskMetrics).array()
  stage match {
    case stage: ShuffleMapStage =>
      stage.pendingPartitions.clear()
      partitionsToCompute.map { id =>
        val locs = taskIdToLocations(id)
        val part = stage.rdd.partitions(id)
        stage.pendingPartitions += id
        new ShuffleMapTask(stage.id, stage.latestInfo.attemptId,
          taskBinary, part, locs, properties, serializedTaskMetrics,
          Option(jobId), Option(sc.applicationId), sc.applicationAttemptId)
      }
    case stage: ResultStage =>
      partitionsToCompute.map { id =>
        val p: Int = stage.partitions(id)
        val part = stage.rdd.partitions(p)
        val locs = taskIdToLocations(id)
        new ResultTask(stage.id, stage.latestInfo.attemptId,
          taskBinary, part, locs, id, properties, serializedTaskMetrics,
          Option(jobId), Option(sc.applicationId), sc.applicationAttemptId)
      }
  }
}
With narrow dependencies, a single stage can contain several RDDs, each with multiple partitions. As the code shows, a ShuffleMapStage produces one ShuffleMapTask per partition to compute (and a ResultStage one ResultTask per partition), so tasks are parallelized along partitions, as the small sketch below illustrates.
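A small sketch of this, again assuming a local SparkContext named sc:

val data = sc.parallelize(1 to 1000, 8)   // 8 partitions
println(data.getNumPartitions)            // 8
// Any action on this RDD, e.g. data.count(), launches a single ResultStage containing
// 8 ResultTasks, one per partition, which run in parallel up to the number of
// available executor cores.
data.count()

Back in submitMissingTasks, the freshly created tasks are then handed to the taskScheduler: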
taskScheduler.submitTasks(new TaskSet(
  tasks.toArray, stage.id, stage.latestInfo.attemptId, jobId, properties))
Now it is the taskScheduler's turn: all tasks of a stage are grouped into one TaskSet and submitted together.
As explained earlier when analyzing how an application runs, the taskScheduler is created according to the master and deploy mode via SparkContext.createTaskScheduler(this, master, deployMode); in standalone mode this yields a TaskSchedulerImpl. Let's look at TaskSchedulerImpl.submitTasks:
override def submitTasks(taskSet: TaskSet) {
  val tasks = taskSet.tasks
  logInfo("Adding task set " + taskSet.id + " with " + tasks.length + " tasks")
  this.synchronized {
    val manager = createTaskSetManager(taskSet, maxTaskFailures)
    val stage = taskSet.stageId
    val stageTaskSets =
      taskSetsByStageIdAndAttempt.getOrElseUpdate(stage, new HashMap[Int, TaskSetManager])
    stageTaskSets(taskSet.stageAttemptId) = manager
    val conflictingTaskSet = stageTaskSets.exists { case (_, ts) =>
      ts.taskSet != taskSet && !ts.isZombie
    }
    if (conflictingTaskSet) {
      throw new IllegalStateException(s"more than one active taskSet for stage $stage:" +
        s" ${stageTaskSets.toSeq.map{_._2.taskSet.id}.mkString(",")}")
    }
    schedulableBuilder.addTaskSetManager(manager, manager.taskSet.properties)

    if (!isLocal && !hasReceivedTask) {
      starvationTimer.scheduleAtFixedRate(new TimerTask() {
        override def run() {
          if (!hasLaunchedTask) {
            logWarning("Initial job has not accepted any resources; " +
              "check your cluster UI to ensure that workers are registered " +
              "and have sufficient resources")
          } else {
            this.cancel()
          }
        }
      }, STARVATION_TIMEOUT_MS, STARVATION_TIMEOUT_MS)
    }
    hasReceivedTask = true
  }
  backend.reviveOffers()
}
Its main job is to create a TaskSetManager for the TaskSet via createTaskSetManager, register it with the schedulableBuilder, and then call backend.reviveOffers(), which has the scheduler backend send a ReviveOffers message to the driver endpoint.
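On the driver side the ReviveOffers message is handled by the scheduler backend's driver endpoint; in standalone mode the backend is a CoarseGrainedSchedulerBackend subclass. In broad strokes it matches the pending tasks against free executor resources and launches them. A simplified outline (not the actual Spark source; the names follow CoarseGrainedSchedulerBackend and details vary across versions):

// case ReviveOffers => makeOffers()
//   1. build one WorkerOffer(executorId, host, freeCores) for every alive executor
//   2. call taskScheduler.resourceOffers(offers), which picks runnable tasks from the
//      TaskSetManagers while honouring data locality
//   3. launchTasks(...) serializes each TaskDescription and sends a LaunchTask message
//      to the corresponding executor endpoint

This is where the driver's work ends and the tasks start running on the executors.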