This article explains how, after the driver process executes SparkContext's runJob, the job is ultimately submitted for distributed execution on the executor machines. The whole flow involves two kinds of processes: the driver and the executors.
1. Job submission and stage division
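A job is only triggered when an action is called on an RDD; transformations by themselves do nothing. A minimal driver-side sketch (the master URL, app name and sample data are assumptions chosen purely for illustration):

import org.apache.spark.{SparkConf, SparkContext}

object RunJobDemo {
  def main(args: Array[String]): Unit = {
    // Assumed local setup, only for illustration.
    val conf = new SparkConf().setMaster("local[2]").setAppName("runJob-demo")
    val sc = new SparkContext(conf)

    // 4 partitions => the final stage will contain 4 ResultTasks.
    val rdd = sc.parallelize(1 to 100, 4).map(_ * 2)

    // collect() is an action: internally it calls sc.runJob, which is where
    // the submission flow analyzed in this article begins.
    val result = rdd.collect()
    println(result.length)

    sc.stop()
  }
}

Every such action eventually funnels into SparkContext.runJob, shown below: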
def runJob[T, U: ClassTag](
    rdd: RDD[T],
    func: (TaskContext, Iterator[T]) => U,
    partitions: Seq[Int],
    resultHandler: (Int, U) => Unit): Unit = {
  if (stopped.get()) {
    throw new IllegalStateException("SparkContext has been shutdown")
  }
  val callSite = getCallSite
  val cleanedFunc = clean(func)
  logInfo("Starting job: " + callSite.shortForm)
  if (conf.getBoolean("spark.logLineage", false)) {
    logInfo("RDD's recursive dependencies:\n" + rdd.toDebugString)
  }
  dagScheduler.runJob(rdd, cleanedFunc, partitions, callSite, resultHandler,
    localProperties.get)
  progressBar.foreach(_.finishAll())
  rdd.doCheckpoint()
}
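A side note on clean(func): it runs Spark's ClosureCleaner over the user function so the closure can be serialized and shipped to executors. A hypothetical example of why that matters (WordFilter is not Spark code, just an illustration):

import org.apache.spark.rdd.RDD

// Referencing the keyword field directly inside the lambda would capture the whole
// WordFilter instance in the closure; copying it to a local first means only the
// String is captured, so the closure stays small and serializable.
class WordFilter(keyword: String) {
  def keep(lines: RDD[String]): RDD[String] = {
    val k = keyword
    lines.filter(line => line.contains(k))
  }
}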
The first step of the submission is dagScheduler.runJob:
def runJob[T, U](
    rdd: RDD[T],
    func: (TaskContext, Iterator[T]) => U,
    partitions: Seq[Int],
    callSite: CallSite,
    resultHandler: (Int, U) => Unit,
    properties: Properties): Unit = {
  val start = System.nanoTime
  val waiter = submitJob(rdd, func, partitions, callSite, resultHandler, properties)
  ThreadUtils.awaitReady(waiter.completionFuture, Duration.Inf)
  waiter.completionFuture.value.get match {
    case scala.util.Success(_) =>
      logInfo("Job %d finished: %s, took %f s".format(
        waiter.jobId, callSite.shortForm, (System.nanoTime - start) / 1e9))
    case scala.util.Failure(exception) =>
      logInfo("Job %d failed: %s, took %f s".format(
        waiter.jobId, callSite.shortForm, (System.nanoTime - start) / 1e9))
      // SPARK-8644: Include user stack trace in exceptions coming from DAGScheduler.
      val callerStackTrace = Thread.currentThread().getStackTrace.tail
      exception.setStackTrace(exception.getStackTrace ++ callerStackTrace)
      throw exception
  }
}
After calling submitJob, the DAGScheduler simply waits for the job to complete. Let's look at the submitJob implementation:
def submitJob[T, U](
    rdd: RDD[T],
    func: (TaskContext, Iterator[T]) => U,
    partitions: Seq[Int],
    callSite: CallSite,
    resultHandler: (Int, U) => Unit,
    properties: Properties): JobWaiter[U] = {
  // Check to make sure we are not launching a task on a partition that does not exist.
  val maxPartitions = rdd.partitions.length
  partitions.find(p => p >= maxPartitions || p < 0).foreach { p =>
    throw new IllegalArgumentException(
      "Attempting to access a non-existent partition: " + p + ". " +
        "Total number of partitions: " + maxPartitions)
  }
  val jobId = nextJobId.getAndIncrement()
  if (partitions.size == 0) {
    // Return immediately if the job is running 0 tasks
    return new JobWaiter[U](this, jobId, 0, resultHandler)
  }
  assert(partitions.size > 0)
  val func2 = func.asInstanceOf[(TaskContext, Iterator[_]) => _]
  val waiter = new JobWaiter(this, jobId, partitions.size, resultHandler)
  eventProcessLoop.post(JobSubmitted(
    jobId, rdd, func2, partitions.toArray, callSite, waiter,
    SerializationUtils.clone(properties)))
  waiter
}
submitJob first generates a jobId and then posts a JobSubmitted message to the DAGScheduler's event queue via eventProcessLoop.post. The event loop thread dequeues the JobSubmitted message and dispatches it to the handler handleJobSubmitted(jobId, rdd, func, partitions, callSite, listener, properties):
private[scheduler] def handleJobSubmitted(jobId: Int,
    finalRDD: RDD[_],
    func: (TaskContext, Iterator[_]) => _,
    partitions: Array[Int],
    callSite: CallSite,
    listener: JobListener,
    properties: Properties) {
  var finalStage: ResultStage = null
  try {
    // New stage creation may throw an exception if, for example, jobs are run on a
    // HadoopRDD whose underlying HDFS files have been deleted.
    finalStage = createResultStage(finalRDD, func, partitions, jobId, callSite)
  } catch {
    case e: Exception =>
      logWarning("Creating new stage failed due to exception - job: " + jobId, e)
      listener.jobFailed(e)
      return
  }
  val job = new ActiveJob(jobId, finalStage, callSite, listener, properties)
  clearCacheLocs()
  logInfo("Got job %s (%s) with %d output partitions".format(
    job.jobId, callSite.shortForm, partitions.length))
  logInfo("Final stage: " + finalStage + " (" + finalStage.name + ")")
  logInfo("Parents of final stage: " + finalStage.parents)
  logInfo("Missing parents: " + getMissingParentStages(finalStage))
  val jobSubmissionTime = clock.getTimeMillis()
  jobIdToActiveJob(jobId) = job
  activeJobs += job
  finalStage.setActiveJob(job)
  val stageIds = jobIdToStageIds(jobId).toArray
  val stageInfos = stageIds.flatMap(id => stageIdToStage.get(id).map(_.latestInfo))
  listenerBus.post(
    SparkListenerJobStart(job.jobId, jobSubmissionTime, stageInfos, properties))
  submitStage(finalStage)
}
handleJobSubmitted does three things: create the final stage, create an ActiveJob, and submit the final stage.
(1) Create the final stage
private def createResultStage(
    rdd: RDD[_],
    func: (TaskContext, Iterator[_]) => _,
    partitions: Array[Int],
    jobId: Int,
    callSite: CallSite): ResultStage = {
  val parents = getOrCreateParentStages(rdd, jobId)
  val id = nextStageId.getAndIncrement()
  val stage = new ResultStage(id, rdd, func, partitions, parents, jobId, callSite)
  stageIdToStage(id) = stage
  updateJobIdStageIdMaps(jobId, stage)
  stage
}
This function is important: it shows how the DAGScheduler analyzes the RDD DAG to generate stages. How are stages divided? By checking for a ShuffleDependency: wherever one exists, a new stage has to be created. ShuffleRDD, CoGroupedRDD and SubtractedRDD return a ShuffleDependency; the related operators include sortByKey, reduceByKey, groupByKey, cogroup, join and cartesian.
private def getOrCreateParentStages(rdd: RDD[_], firstJobId: Int): List[Stage] = {
  getShuffleDependencies(rdd).map { shuffleDep =>
    getOrCreateShuffleMapStage(shuffleDep, firstJobId)
  }.toList
}
Note that all stages ahead of the final stage are created here in one pass: getOrCreateShuffleMapStage recursively creates any missing ancestor stages as well.
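As a concrete illustration of the division (a minimal sketch, assuming a local SparkContext named sc), a job with one shuffle is cut into exactly two stages:

// reduceByKey introduces a ShuffleDependency, so this DAG is divided into:
//   ShuffleMapStage: parallelize -> map          (narrow dependencies only)
//   ResultStage    : reduceByKey -> collect      (reads the shuffle output)
val words = sc.parallelize(Seq("a", "b", "a", "c"), 2)
val counts = words.map(w => (w, 1)).reduceByKey(_ + _)
counts.collect()
// counts.toDebugString prints this lineage with the shuffle boundary marked.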
(2) Create an ActiveJob
new ActiveJob(jobId, finalStage, callSite, listener, properties)
(3) Submit the final stage
submitStage(finalStage)
private def submitStage(stage: Stage) {
  val jobId = activeJobForStage(stage)
  if (jobId.isDefined) {
    logDebug("submitStage(" + stage + ")")
    if (!waitingStages(stage) && !runningStages(stage) && !failedStages(stage)) {
      val missing = getMissingParentStages(stage).sortBy(_.id)
      logDebug("missing: " + missing)
      if (missing.isEmpty) {
        logInfo("Submitting " + stage + " (" + stage.rdd + "), which has no missing parents")
        submitMissingTasks(stage, jobId.get)
      } else {
        for (parent <- missing) {
          submitStage(parent)
        }
        waitingStages += stage
      }
    }
  } else {
    abortStage(stage, "No active job for stage " + stage.id, None)
  }
}
Two important points can be read from this function. First, every stage can be in one of three states (waiting, running, failed), each tracked by its own set. Second, the stage to run is chosen by recursively walking up the parent stages with the result stage as the root, until a stage with no missing parents is found (initially the leaf stages at the bottom of the DAG); those stages are submitted first, while the stages that still depend on them are parked in waitingStages.
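For the two-stage example above, the recursion would unfold roughly like this (a simplified trace, not actual Spark log output):

// submitStage(ResultStage 1)
//   getMissingParentStages -> List(ShuffleMapStage 0)   // shuffle output not computed yet
//   submitStage(ShuffleMapStage 0)
//     getMissingParentStages -> Nil                      // leaf stage, nothing missing
//     submitMissingTasks(ShuffleMapStage 0)              // stage 0 enters runningStages
//   waitingStages += ResultStage 1                       // resubmitted once stage 0 finishes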
2. Task creation and distribution
submitMissingTasks is responsible for creating the tasks.
According to the stage division above, a stage is either a ShuffleMapStage or a ResultStage; the corresponding tasks created for it are ShuffleMapTask and ResultTask respectively.
Here is the relevant code snippet:
val tasks: Seq[Task[_]] = try {
  val serializedTaskMetrics = closureSerializer.serialize(stage.latestInfo.taskMetrics).array()
  stage match {
    case stage: ShuffleMapStage =>
      stage.pendingPartitions.clear()
      partitionsToCompute.map { id =>
        val locs = taskIdToLocations(id)
        val part = stage.rdd.partitions(id)
        stage.pendingPartitions += id
        new ShuffleMapTask(stage.id, stage.latestInfo.attemptId,
          taskBinary, part, locs, properties, serializedTaskMetrics,
          Option(jobId), Option(sc.applicationId), sc.applicationAttemptId)
      }
    case stage: ResultStage =>
      partitionsToCompute.map { id =>
        val p: Int = stage.partitions(id)
        val part = stage.rdd.partitions(p)
        val locs = taskIdToLocations(id)
        new ResultTask(stage.id, stage.latestInfo.attemptId,
          taskBinary, part, locs, id, properties, serializedTaskMetrics,
          Option(jobId), Option(sc.applicationId), sc.applicationAttemptId)
      }
  }
}
With narrow dependencies, a single stage can contain several RDDs, each with multiple partitions. As the code shows, a ShuffleMapStage produces one ShuffleMapTask per partition to compute (and a ResultStage one ResultTask per partition), so tasks are parallelized along partitions, as the small sketch below illustrates.
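A small sketch of this, again assuming a local SparkContext named sc:

val data = sc.parallelize(1 to 1000, 8)   // 8 partitions
println(data.getNumPartitions)            // 8
// Any action on this RDD, e.g. data.count(), launches a single ResultStage containing
// 8 ResultTasks, one per partition, which run in parallel up to the number of
// available executor cores.
data.count()

Back in submitMissingTasks, the freshly created tasks are then handed to the taskScheduler: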
taskScheduler.submitTasks(new TaskSet(
  tasks.toArray, stage.id, stage.latestInfo.attemptId, jobId, properties))
Now it is the taskScheduler's turn: all tasks of a stage are grouped into one TaskSet and submitted together.
As explained earlier when analyzing how an application runs, the taskScheduler is created according to the master and deploy mode via SparkContext.createTaskScheduler(this, master, deployMode); in standalone mode this yields a TaskSchedulerImpl. Let's look at TaskSchedulerImpl.submitTasks:
override def submitTasks(taskSet: TaskSet) {
  val tasks = taskSet.tasks
  logInfo("Adding task set " + taskSet.id + " with " + tasks.length + " tasks")
  this.synchronized {
    val manager = createTaskSetManager(taskSet, maxTaskFailures)
    val stage = taskSet.stageId
    val stageTaskSets =
      taskSetsByStageIdAndAttempt.getOrElseUpdate(stage, new HashMap[Int, TaskSetManager])
    stageTaskSets(taskSet.stageAttemptId) = manager
    val conflictingTaskSet = stageTaskSets.exists { case (_, ts) =>
      ts.taskSet != taskSet && !ts.isZombie
    }
    if (conflictingTaskSet) {
      throw new IllegalStateException(s"more than one active taskSet for stage $stage:" +
        s" ${stageTaskSets.toSeq.map{_._2.taskSet.id}.mkString(",")}")
    }
    schedulableBuilder.addTaskSetManager(manager, manager.taskSet.properties)

    if (!isLocal && !hasReceivedTask) {
      starvationTimer.scheduleAtFixedRate(new TimerTask() {
        override def run() {
          if (!hasLaunchedTask) {
            logWarning("Initial job has not accepted any resources; " +
              "check your cluster UI to ensure that workers are registered " +
              "and have sufficient resources")
          } else {
            this.cancel()
          }
        }
      }, STARVATION_TIMEOUT_MS, STARVATION_TIMEOUT_MS)
    }
    hasReceivedTask = true
  }
  backend.reviveOffers()
}
Its main job is to create a TaskSetManager for the TaskSet via createTaskSetManager, register it with the schedulableBuilder, and then call backend.reviveOffers(), which has the scheduler backend send a ReviveOffers message to the driver endpoint.
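On the driver side the ReviveOffers message is handled by the scheduler backend's driver endpoint; in standalone mode the backend is a CoarseGrainedSchedulerBackend subclass. In broad strokes it matches the pending tasks against free executor resources and launches them. A simplified outline (not the actual Spark source; the names follow CoarseGrainedSchedulerBackend and details vary across versions):

// case ReviveOffers => makeOffers()
//   1. build one WorkerOffer(executorId, host, freeCores) for every alive executor
//   2. call taskScheduler.resourceOffers(offers), which picks runnable tasks from the
//      TaskSetManagers while honouring data locality
//   3. launchTasks(...) serializes each TaskDescription and sends a LaunchTask message
//      to the corresponding executor endpoint

This is where the driver's work ends and the tasks start running on the executors.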