// Dataset.scala
// scalastyle:off println
def show(numRows: Int, truncate: Boolean): Unit = if (truncate) {
  println(showString(numRows, truncate = 20))
} else {
  println(showString(numRows, truncate = 0))
}

private[sql] def showString(
    _numRows: Int, truncate: Int = 20, vertical: Boolean = false): String = {
  val numRows = _numRows.max(0).min(Int.MaxValue - 1)
  val newDf = toDF()
  val castCols = newDf.logicalPlan.output.map { col =>
    // Since binary types in top-level schema fields have a specific format to print,
    // we do not cast them to strings here.
    if (col.dataType == BinaryType) {
      Column(col)
    } else {
      Column(col).cast(StringType)
    }
  }
  // take(n) here is effectively head(n)
  val takeResult = newDf.select(castCols: _*).take(numRows + 1)
  val hasMoreData = takeResult.length > numRows
  val data = takeResult.take(numRows)

  // For array values, replace Seq and Array with square brackets
  // For cells that are beyond `truncate` characters, replace it with the
  // first `truncate-3` and "..."
  val rows: Seq[Seq[String]] = schema.fieldNames.toSeq +: data.map { row =>
    row.toSeq.map { cell =>
      val str = cell match {
        case null => "null"
        case binary: Array[Byte] => binary.map("%02X".format(_)).mkString("[", " ", "]")
        case _ => cell.toString
      }
      if (truncate > 0 && str.length > truncate) {
        // do not show ellipses for strings shorter than 4 characters.
        if (truncate < 4) str.substring(0, truncate)
        else str.substring(0, truncate - 3) + "..."
      } else {
        str
      }
    }: Seq[String]
  }
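To make the truncation rule above concrete, here is a minimal standalone sketch of just that rule (TruncateSketch and truncateCell are illustrative names, not Spark API): cells longer than `truncate` keep their first `truncate - 3` characters plus "...", unless `truncate` itself is smaller than 4.

object TruncateSketch {
  // Same rule as the showString body above, lifted out for illustration.
  def truncateCell(str: String, truncate: Int): String =
    if (truncate > 0 && str.length > truncate) {
      // do not show ellipses for strings shorter than 4 characters
      if (truncate < 4) str.substring(0, truncate)
      else str.substring(0, truncate - 3) + "..."
    } else {
      str
    }

  def main(args: Array[String]): Unit = {
    println(truncateCell("Spark SQL Dataset.show", 10)) // prints "Spark S..."
    println(truncateCell("short", 20))                  // prints "short"
    println(truncateCell("abcdef", 3))                  // prints "abc" (no ellipsis below 4)
  }
}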
// head() does the actual work via collectFromPlan
def head(n: Int): Array[T] = withAction("head", limit(n).queryExecution)(collectFromPlan)

// Dataset.scala
/**
 * Collect all elements from a spark plan.
 */
private def collectFromPlan(plan: SparkPlan): Array[T] = {
  // This projection writes output to a `InternalRow`, which means applying this projection is not
  // thread-safe. Here we create the projection inside this method to make `Dataset` thread-safe.
  // When the job starts there is a codegen step: `deserializer` is an Expression object
  // (e.g. CreateExternalRow), and it is first code-generated into Java code here.
  val objProj = GenerateSafeProjection.generate(deserializer :: Nil)
  // Here `plan` is a CollectLimitExec
  plan.executeCollect().map { row =>
    // The row returned by SafeProjection is `SpecificInternalRow`, which ignores the data type
    // parameter of its `get` method, so it's safe to use null here.
    objProj(row).get(0, null).asInstanceOf[T]
  }
}

// CollectLimitExec
override def executeCollect(): Array[InternalRow] = child.executeTake(limit)

// SparkPlan.scala
/**
 * Runs this query returning the first `n` rows as an array.
 *
 * This is modeled after `RDD.take` but never runs any job locally on the driver.
 */
def executeTake(n: Int): Array[InternalRow] = {
  if (n == 0) {
    return new Array[InternalRow](0)
  }
  val childRDD = getByteArrayRdd(n).map(_._2)
  val buf = new ArrayBuffer[InternalRow]
  val totalParts = childRDD.partitions.length
  var partsScanned = 0
  while (buf.size < n && partsScanned < totalParts) {
    // The number of partitions to try in this iteration. It is ok for this number to be
    // greater than totalParts because we actually cap it at totalParts in runJob.
    var numPartsToTry = 1L
    if (partsScanned > 0) {
      // If we didn't find any rows after the previous iteration, quadruple and retry.
      // Otherwise, interpolate the number of partitions we need to try, but overestimate
      // it by 50%. We also cap the estimation in the end.
      val limitScaleUpFactor = Math.max(sqlContext.conf.limitScaleUpFactor, 2)
      if (buf.isEmpty) {
        numPartsToTry = partsScanned * limitScaleUpFactor
      } else {
        val left = n - buf.size
        // As left > 0, numPartsToTry is always >= 1
        numPartsToTry = Math.ceil(1.5 * left * partsScanned / buf.size).toInt
        numPartsToTry = Math.min(numPartsToTry, partsScanned * limitScaleUpFactor)
      }
    }

    val p = partsScanned.until(math.min(partsScanned + numPartsToTry, totalParts).toInt)
    val sc = sqlContext.sparkContext
    // The current plan is a sub-plan; sc.runJob on childRDD eventually reaches
    // DAGScheduler.runJob()
    val res = sc.runJob(childRDD,
      (it: Iterator[Array[Byte]]) => if (it.hasNext) it.next() else Array.empty[Byte], p)

    buf ++= res.flatMap(decodeUnsafeRows)

    partsScanned += p.size
  }

  if (buf.size > n) {
    buf.take(n).toArray
  } else {
    buf.toArray
  }
}
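A small standalone sketch of the scan-growth policy above (TakeScanSketch and nextPartsToTry are invented names, not Spark's API): quadruple the partition count while nothing has been found, otherwise extrapolate from the rows seen so far, overestimate by 50% and cap the result.

object TakeScanSketch {
  // Mirrors the numPartsToTry computation in executeTake, as a pure function.
  def nextPartsToTry(partsScanned: Long, rowsSoFar: Long, n: Long,
                     limitScaleUpFactor: Int = 4): Long =
    if (partsScanned == 0) 1L
    else if (rowsSoFar == 0) partsScanned * limitScaleUpFactor
    else {
      val left = n - rowsSoFar
      val estimate = math.ceil(1.5 * left * partsScanned / rowsSoFar).toLong
      math.min(estimate, partsScanned * limitScaleUpFactor)
    }

  def main(args: Array[String]): Unit = {
    // take(100): the first partition yielded only 10 rows, so 90 are still needed.
    println(nextPartsToTry(partsScanned = 1, rowsSoFar = 10, n = 100)) // 4 (estimate 14, capped at 1 * 4)
    // Nothing found yet after scanning 4 partitions: quadruple.
    println(nextPartsToTry(partsScanned = 4, rowsSoFar = 0, n = 100))  // 16
  }
}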
DAGScheduler.scala: submitting the job for execution
def runJob[T, U](
    rdd: RDD[T],
    func: (TaskContext, Iterator[T]) => U,
    partitions: Seq[Int],
    callSite: CallSite,
    resultHandler: (Int, U) => Unit,
    properties: Properties): Unit = {
  val start = System.nanoTime
  val waiter = submitJob(rdd, func, partitions, callSite, resultHandler, properties)
  ThreadUtils.awaitReady(waiter.completionFuture, Duration.Inf)
  waiter.completionFuture.value.get match {
    case scala.util.Success(_) =>
      logInfo("Job %d finished: %s, took %f s".format
        (waiter.jobId, callSite.shortForm, (System.nanoTime - start) / 1e9))
    case scala.util.Failure(exception) =>
      logInfo("Job %d failed: %s, took %f s".format
        (waiter.jobId, callSite.shortForm, (System.nanoTime - start) / 1e9))
      // SPARK-8644: Include user stack trace in exceptions coming from DAGScheduler.
      val callerStackTrace = Thread.currentThread().getStackTrace.tail
      exception.setStackTrace(exception.getStackTrace ++ callerStackTrace)
      throw exception
  }
}

def submitJob[T, U](
    rdd: RDD[T],
    func: (TaskContext, Iterator[T]) => U,
    partitions: Seq[Int],
    callSite: CallSite,
    resultHandler: (Int, U) => Unit,
    properties: Properties): JobWaiter[U] = {
  // Check to make sure we are not launching a task on a partition that does not exist.
  val maxPartitions = rdd.partitions.length
  partitions.find(p => p >= maxPartitions || p < 0).foreach { p =>
    throw new IllegalArgumentException(
      "Attempting to access a non-existent partition: " + p + ". " +
        "Total number of partitions: " + maxPartitions)
  }

  val jobId = nextJobId.getAndIncrement()
  if (partitions.size == 0) {
    // Return immediately if the job is running 0 tasks
    return new JobWaiter[U](this, jobId, 0, resultHandler)
  }

  assert(partitions.size > 0)
  val func2 = func.asInstanceOf[(TaskContext, Iterator[_]) => _]
  val waiter = new JobWaiter(this, jobId, partitions.size, resultHandler)
  eventProcessLoop.post(JobSubmitted(
    jobId, rdd, func2, partitions.toArray, callSite, waiter,
    SerializationUtils.clone(properties)))
  waiter
}
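runJob blocks on a JobWaiter whose completionFuture is completed later by the event loop. A minimal sketch of that handshake using only the standard library (WaiterSketch and fakeEventLoop are illustrative names, not Spark classes):

import java.util.concurrent.Executors
import scala.concurrent.duration.Duration
import scala.concurrent.{Await, Promise}
import scala.util.Success

object WaiterSketch {
  def main(args: Array[String]): Unit = {
    // The "waiter": a promise the submitting thread blocks on.
    val waiter = Promise[Unit]()

    // A stand-in for the DAGScheduler event loop, completing the waiter when the "job" is done.
    val fakeEventLoop = Executors.newSingleThreadExecutor()
    fakeEventLoop.submit(new Runnable {
      def run(): Unit = {
        Thread.sleep(100)            // pretend the stages and tasks ran here
        waiter.complete(Success(())) // roughly what JobWaiter does when the last task succeeds
      }
    })

    // What runJob does: await the completion future, then inspect success or failure.
    Await.ready(waiter.future, Duration.Inf)
    println(s"job finished: ${waiter.future.value}")
    fakeEventLoop.shutdown()
  }
}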
// The JobSubmitted event is processed on the event bus and ends up in
// DAGScheduler.handleJobSubmitted, which creates a ResultStage as the finalStage and submits it.
private[scheduler] def handleJobSubmitted(jobId: Int,
    finalRDD: RDD[_],
    func: (TaskContext, Iterator[_]) => _,
    partitions: Array[Int],
    callSite: CallSite,
    listener: JobListener,
    properties: Properties) {
  var finalStage: ResultStage = null
  try {
    // New stage creation may throw an exception if, for example, jobs are run on a
    // HadoopRDD whose underlying HDFS files have been deleted.
    finalStage = createResultStage(finalRDD, func, partitions, jobId, callSite)
  } catch {
    case e: Exception =>
      logWarning("Creating new stage failed due to exception - job: " + jobId, e)
      listener.jobFailed(e)
      return
  }

  val job = new ActiveJob(jobId, finalStage, callSite, listener, properties)
  clearCacheLocs()
  logInfo("Got job %s (%s) with %d output partitions".format(
    job.jobId, callSite.shortForm, partitions.length))
  logInfo("Final stage: " + finalStage + " (" + finalStage.name + ")")
  logInfo("Parents of final stage: " + finalStage.parents)
  logInfo("Missing parents: " + getMissingParentStages(finalStage))

  val jobSubmissionTime = clock.getTimeMillis()
  jobIdToActiveJob(jobId) = job
  activeJobs += job
  finalStage.setActiveJob(job)
  val stageIds = jobIdToStageIds(jobId).toArray
  val stageInfos = stageIds.flatMap(id => stageIdToStage.get(id).map(_.latestInfo))
  listenerBus.post(
    SparkListenerJobStart(job.jobId, jobSubmissionTime, stageInfos, properties))
  submitStage(finalStage)
}
Building the stage DAG from the job
// Stage splitting and creation: by the time the ResultStage is created, the parent stages
// have already been created through this call chain:
//   createResultStage -> getOrCreateParentStages -> getShuffleDependencies -> getOrCreateShuffleMapStage
//
// During this process shuffleIdToMapStage gets updated:
//   - rdd.dependencies holds the shuffleIds of the parent shuffle dependencies
//   - shuffleIdToMapStage caches the stage for each shuffleId; the RDD itself may not change,
//     but the stage associated with it can be replaced
/**
 * Create a ResultStage associated with the provided jobId.
 */
private def createResultStage(
    rdd: RDD[_],
    func: (TaskContext, Iterator[_]) => _,
    partitions: Array[Int],
    jobId: Int,
    callSite: CallSite): ResultStage = {
  val parents = getOrCreateParentStages(rdd, jobId)
  val id = nextStageId.getAndIncrement()
  val stage = new ResultStage(id, rdd, func, partitions, parents, jobId, callSite)
  stageIdToStage(id) = stage
  updateJobIdStageIdMaps(jobId, stage)
  stage
}

/**
 * Get or create the list of parent stages for a given RDD. The new Stages will be created with
 * the provided firstJobId.
 */
private def getOrCreateParentStages(rdd: RDD[_], firstJobId: Int): List[Stage] = {
  getShuffleDependencies(rdd).map { shuffleDep =>
    getOrCreateShuffleMapStage(shuffleDep, firstJobId)
  }.toList
}

/**
 * Returns shuffle dependencies that are immediate parents of the given RDD.
 *
 * This function will not return more distant ancestors. For example, if C has a shuffle
 * dependency on B which has a shuffle dependency on A:
 *
 * A <-- B <-- C
 *
 * calling this function with rdd C will only return the B <-- C dependency.
 *
 * This function is scheduler-visible for the purpose of unit testing.
 */
private[scheduler] def getShuffleDependencies(
    rdd: RDD[_]): HashSet[ShuffleDependency[_, _, _]] = {
  val parents = new HashSet[ShuffleDependency[_, _, _]]
  val visited = new HashSet[RDD[_]]
  val waitingForVisit = new ArrayStack[RDD[_]]
  waitingForVisit.push(rdd)
  while (waitingForVisit.nonEmpty) {
    val toVisit = waitingForVisit.pop()
    if (!visited(toVisit)) {
      visited += toVisit
      toVisit.dependencies.foreach {
        case shuffleDep: ShuffleDependency[_, _, _] =>
          parents += shuffleDep
        case dependency =>
          waitingForVisit.push(dependency.rdd)
      }
    }
  }
  parents
}
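The traversal above stops at the first shuffle edge on each path. A self-contained sketch with a toy lineage (Node and immediateShuffleParents are made up for illustration, not Spark types) shows that asking from the last node only returns its nearest shuffle parents, never more distant ones:

import scala.collection.mutable

object ShuffleDepsSketch {
  // A toy lineage node: each dependency is either narrow or a shuffle.
  final case class Node(name: String, deps: List[(Node, Boolean)]) // (parent, isShuffle)

  // Same shape as getShuffleDependencies: DFS, but do not cross shuffle edges.
  def immediateShuffleParents(root: Node): Set[String] = {
    val parents = mutable.Set[String]()
    val visited = mutable.Set[String]()
    val stack = mutable.Stack(root)
    while (stack.nonEmpty) {
      val node = stack.pop()
      if (!visited(node.name)) {
        visited += node.name
        node.deps.foreach {
          case (parent, true)  => parents += parent.name // shuffle: record, do not descend
          case (parent, false) => stack.push(parent)     // narrow: keep walking
        }
      }
    }
    parents.toSet
  }

  def main(args: Array[String]): Unit = {
    val a = Node("A", Nil)
    val b = Node("B", List((a, true)))              // B depends on A through a shuffle
    val mapped = Node("B_mapped", List((b, false))) // a narrow map over B
    val c = Node("C", List((mapped, true)))         // C depends on B_mapped through a shuffle
    println(immediateShuffleParents(c))             // Set(B_mapped): only the nearest shuffle parent
  }
}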
/**
 * Gets a shuffle map stage if one exists in shuffleIdToMapStage. Otherwise, if the
 * shuffle map stage doesn't already exist, this method will create the shuffle map stage in
 * addition to any missing ancestor shuffle map stages.
 */
private def getOrCreateShuffleMapStage(
    shuffleDep: ShuffleDependency[_, _, _],
    firstJobId: Int): ShuffleMapStage = {
  shuffleIdToMapStage.get(shuffleDep.shuffleId) match {
    case Some(stage) =>
      stage

    case None =>
      // Create stages for all missing ancestor shuffle dependencies.
      getMissingAncestorShuffleDependencies(shuffleDep.rdd).foreach { dep =>
        // Even though getMissingAncestorShuffleDependencies only returns shuffle dependencies
        // that were not already in shuffleIdToMapStage, it's possible that by the time we
        // get to a particular dependency in the foreach loop, it's been added to
        // shuffleIdToMapStage by the stage creation process for an earlier dependency. See
        // SPARK-13902 for more information.
        if (!shuffleIdToMapStage.contains(dep.shuffleId)) {
          createShuffleMapStage(dep, firstJobId)
        }
      }
      // Finally, create a stage for the given shuffle dependency.
      createShuffleMapStage(shuffleDep, firstJobId)
  }
}

/** Submits stage, but first recursively submits any missing parents. */
private def submitStage(stage: Stage) {
  val jobId = activeJobForStage(stage)
  if (jobId.isDefined) {
    logDebug("submitStage(" + stage + ")")
    if (!waitingStages(stage) && !runningStages(stage) && !failedStages(stage)) {
      val missing = getMissingParentStages(stage).sortBy(_.id)
      logDebug("missing: " + missing)
      if (missing.isEmpty) {
        logInfo("Submitting " + stage + " (" + stage.rdd + "), which has no missing parents")
        // Submit all tasks of the current stage for execution
        submitMissingTasks(stage, jobId.get)
      } else {
        for (parent <- missing) {
          // Submit (and run) all parent stages first
          submitStage(parent)
        }
        waitingStages += stage
      }
    }
  } else {
    abortStage(stage, "No active job for stage " + stage.id, None)
  }
}

// The core of DAG splitting
private def getMissingParentStages(stage: Stage): List[Stage] = {
  val missing = new HashSet[Stage]
  val visited = new HashSet[RDD[_]]
  // We are manually maintaining a stack here to prevent StackOverflowError
  // caused by recursively visiting
  val waitingForVisit = new ArrayStack[RDD[_]]
  def visit(rdd: RDD[_]) {
    if (!visited(rdd)) {
      visited += rdd
      // x.3 Before computing the stage, check whether all of the rdd's partitions are already
      // cached; if they are, they can be read directly and no parent stage needs to run.
      val rddHasUncachedPartitions = getCacheLocs(rdd).contains(Nil)
      if (rddHasUncachedPartitions) {
        for (dep <- rdd.dependencies) {
          dep match {
            // x.2 For each parent rdd: if the dependency is wide (a shuffle), create a new
            // stage; stage ids are generated in increasing order.
            case shufDep: ShuffleDependency[_, _, _] =>
              val mapStage = getOrCreateShuffleMapStage(shufDep, stage.firstJobId)
              // The stage looked up by shuffleId may have been replaced, but if the data of the
              // RDD in that stage is already available, it will not be re-submitted for execution.
              if (!mapStage.isAvailable) {
                missing += mapStage
              }
            // If the dependency is narrow, push its rdd onto the stack and keep traversing.
            case narrowDep: NarrowDependency[_] =>
              waitingForVisit.push(narrowDep.rdd)
          }
        }
      }
    }
  }
  // x.1 Push the last rdd of the lineage onto the stack
  waitingForVisit.push(stage.rdd)
  while (waitingForVisit.nonEmpty) {
    visit(waitingForVisit.pop())
  }
  missing.toList
}
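Putting getShuffleDependencies and getMissingParentStages together: narrow dependencies stay inside the current stage, and every shuffle dependency cuts a stage boundary. A toy sketch of that rule for a linear lineage, ignoring cached or already-available stages (StageCutSketch is an invented name, not Spark code):

object StageCutSketch {
  sealed trait Dep
  case object Narrow extends Dep
  case object Shuffle extends Dep

  // For a linear lineage, the number of stages is 1 (the result stage)
  // plus one ShuffleMapStage per shuffle boundary.
  def numStages(lineage: List[Dep]): Int = 1 + lineage.count(_ == Shuffle)

  def main(args: Array[String]): Unit = {
    // textFile -> map (narrow) -> reduceByKey (shuffle) -> filter (narrow) -> sortByKey (shuffle)
    println(numStages(List(Narrow, Shuffle, Narrow, Shuffle))) // 3
  }
}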
/** Called when stage's parents are available and we can now do its task. */
private def submitMissingTasks(stage: Stage, jobId: Int) {
  logDebug("submitMissingTasks(" + stage + ")")

  // First figure out the indexes of partition ids to compute.
  val partitionsToCompute: Seq[Int] = stage.findMissingPartitions()

  // Use the scheduling pool, job group, description, etc. from an ActiveJob associated
  // with this Stage
  val properties = jobIdToActiveJob(jobId).properties

  runningStages += stage
  // SparkListenerStageSubmitted should be posted before testing whether tasks are
  // serializable. If tasks are not serializable, a SparkListenerStageCompleted event
  // will be posted, which should always come after a corresponding SparkListenerStageSubmitted
  // event.
  stage match {
    case s: ShuffleMapStage =>
      outputCommitCoordinator.stageStart(stage = s.id, maxPartitionId = s.numPartitions - 1)
    case s: ResultStage =>
      outputCommitCoordinator.stageStart(
        stage = s.id, maxPartitionId = s.rdd.partitions.length - 1)
  }
  val taskIdToLocations: Map[Int, Seq[TaskLocation]] = try {
    stage match {
      case s: ShuffleMapStage =>
        partitionsToCompute.map { id => (id, getPreferredLocs(stage.rdd, id)) }.toMap
      case s: ResultStage =>
        partitionsToCompute.map { id =>
          val p = s.partitions(id)
          (id, getPreferredLocs(stage.rdd, p))
        }.toMap
    }
  } catch {
    case NonFatal(e) =>
      stage.makeNewStageAttempt(partitionsToCompute.size)
      listenerBus.post(SparkListenerStageSubmitted(stage.latestInfo, properties))
      abortStage(stage, s"Task creation failed: $e\n${Utils.exceptionString(e)}", Some(e))
      runningStages -= stage
      return
  }

  stage.makeNewStageAttempt(partitionsToCompute.size, taskIdToLocations.values.toSeq)

  // If there are tasks to execute, record the submission time of the stage. Otherwise,
  // post the event without the submission time, which indicates that this stage was
  // skipped.
  if (partitionsToCompute.nonEmpty) {
    stage.latestInfo.submissionTime = Some(clock.getTimeMillis())
  }
  listenerBus.post(SparkListenerStageSubmitted(stage.latestInfo, properties))

  // TODO: Maybe we can keep the taskBinary in Stage to avoid serializing it multiple times.
  // Broadcasted binary for the task, used to dispatch tasks to executors. Note that we broadcast
  // the serialized copy of the RDD and for each task we will deserialize it, which means each
  // task gets a different copy of the RDD. This provides stronger isolation between tasks that
  // might modify state of objects referenced in their closures. This is necessary in Hadoop
  // where the JobConf/Configuration object is not thread-safe.
  var taskBinary: Broadcast[Array[Byte]] = null
  var partitions: Array[Partition] = null
  try {
    // For ShuffleMapTask, serialize and broadcast (rdd, shuffleDep).
    // For ResultTask, serialize and broadcast (rdd, func).
    var taskBinaryBytes: Array[Byte] = null
    // taskBinaryBytes and partitions are both affected by the checkpoint status. We need
    // this synchronization in case another concurrent job is checkpointing this RDD, so we get a
    // consistent view of both variables.
    RDDCheckpointData.synchronized {
      taskBinaryBytes = stage match {
        case stage: ShuffleMapStage =>
          JavaUtils.bufferToArray(
            closureSerializer.serialize((stage.rdd, stage.shuffleDep): AnyRef))
        case stage: ResultStage =>
          JavaUtils.bufferToArray(closureSerializer.serialize((stage.rdd, stage.func): AnyRef))
      }

      partitions = stage.rdd.partitions
    }

    taskBinary = sc.broadcast(taskBinaryBytes)
  } catch {
    // In the case of a failure during serialization, abort the stage.
    case e: NotSerializableException =>
      abortStage(stage, "Task not serializable: " + e.toString, Some(e))
      runningStages -= stage

      // Abort execution
      return
    case NonFatal(e) =>
      abortStage(stage, s"Task serialization failed: $e\n${Utils.exceptionString(e)}", Some(e))
      runningStages -= stage
      return
  }

  val tasks: Seq[Task[_]] = try {
    val serializedTaskMetrics = closureSerializer.serialize(stage.latestInfo.taskMetrics).array()
    stage match {
      case stage: ShuffleMapStage =>
        stage.pendingPartitions.clear()
        partitionsToCompute.map { id =>
          val locs = taskIdToLocations(id)
          val part = partitions(id)
          stage.pendingPartitions += id
          new ShuffleMapTask(stage.id, stage.latestInfo.attemptNumber,
            taskBinary, part, locs, properties, serializedTaskMetrics, Option(jobId),
            Option(sc.applicationId), sc.applicationAttemptId)
        }

      case stage: ResultStage =>
        partitionsToCompute.map { id =>
          val p: Int = stage.partitions(id)
          val part = partitions(p)
          val locs = taskIdToLocations(id)
          new ResultTask(stage.id, stage.latestInfo.attemptNumber,
            taskBinary, part, locs, id, properties, serializedTaskMetrics,
            Option(jobId), Option(sc.applicationId), sc.applicationAttemptId)
        }
    }
  } catch {
    case NonFatal(e) =>
      abortStage(stage, s"Task creation failed: $e\n${Utils.exceptionString(e)}", Some(e))
      runningStages -= stage
      return
  }

  if (tasks.size > 0) {
    logInfo(s"Submitting ${tasks.size} missing tasks from $stage (${stage.rdd}) (first 15 " +
      s"tasks are for partitions ${tasks.take(15).map(_.partitionId)})")
    taskScheduler.submitTasks(new TaskSet(
      tasks.toArray, stage.id, stage.latestInfo.attemptNumber, jobId, properties))
  } else {
    // Because we posted SparkListenerStageSubmitted earlier, we should mark
    // the stage as completed here in case there are no tasks to run
    markStageAsFinished(stage, None)

    stage match {
      case stage: ShuffleMapStage =>
        logDebug(s"Stage ${stage} is actually done; " +
          s"(available: ${stage.isAvailable}," +
          s"available outputs: ${stage.numAvailableOutputs}," +
          s"partitions: ${stage.numPartitions})")
        markMapStageJobsAsFinished(stage)
      case stage: ResultStage =>
        logDebug(s"Stage ${stage} is actually done; (partitions: ${stage.numPartitions})")
    }
    submitWaitingChildStages(stage)
  }
}
// After a stage finishes, submit the stages that were waiting for it.
/**
 * Check for waiting stages which are now eligible for resubmission.
 * Submits stages that depend on the given parent stage. Called when the parent stage completes
 * successfully.
 */
private def submitWaitingChildStages(parent: Stage) {
  logTrace(s"Checking if any dependencies of $parent are now runnable")
  logTrace("running: " + runningStages)
  logTrace("waiting: " + waitingStages)
  logTrace("failed: " + failedStages)
  val childStages = waitingStages.filter(_.parents.contains(parent)).toArray
  waitingStages --= childStages
  for (stage <- childStages.sortBy(_.firstJobId)) {
    submitStage(stage)
  }
}
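A tiny sketch of the selection rule above (StageInfo here is an invented record, not Spark's Stage): waiting stages that listed the finished stage as a parent are pulled out of the waiting set and submitted in firstJobId order.

object WaitingStagesSketch {
  final case class StageInfo(id: Int, firstJobId: Int, parents: Set[Int])

  // Same selection as submitWaitingChildStages, as a pure function.
  def childStagesToSubmit(waiting: Set[StageInfo], finishedParent: Int): Seq[StageInfo] =
    waiting.filter(_.parents.contains(finishedParent)).toSeq.sortBy(_.firstJobId)

  def main(args: Array[String]): Unit = {
    val waiting = Set(
      StageInfo(id = 3, firstJobId = 1, parents = Set(1, 2)),
      StageInfo(id = 2, firstJobId = 0, parents = Set(1)))
    // Stage 1 just finished: stages 2 and 3 become runnable, in that order.
    println(childStagesToSubmit(waiting, finishedParent = 1).map(_.id))
  }
}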
Stage execution completes

/**
 * Marks a stage as finished and removes it from the list of running stages.
 */
private def markStageAsFinished(
    stage: Stage,
    errorMessage: Option[String] = None,
    willRetry: Boolean = false): Unit = {
  val serviceTime = stage.latestInfo.submissionTime match {
    case Some(t) => "%.03f".format((clock.getTimeMillis() - t) / 1000.0)
    case _ => "Unknown"
  }
  if (errorMessage.isEmpty) {
    logInfo("%s (%s) finished in %s s".format(stage, stage.name, serviceTime))
    stage.latestInfo.completionTime = Some(clock.getTimeMillis())

    // Clear failure count for this stage, now that it's succeeded.
    // We only limit consecutive failures of stage attempts, so that if a stage is
    // re-used many times in a long-running job, unrelated failures don't eventually cause the
    // stage to be aborted.
    stage.clearFailures()
  } else {
    stage.latestInfo.stageFailed(errorMessage.get)
    logInfo(s"$stage (${stage.name}) failed in $serviceTime s due to ${errorMessage.get}")
  }

  if (!willRetry) {
    outputCommitCoordinator.stageEnd(stage.id)
  }
  listenerBus.post(SparkListenerStageCompleted(stage.latestInfo))
  runningStages -= stage
}

// ExecutorAllocationManager.scala
override def onStageCompleted(stageCompleted: SparkListenerStageCompleted): Unit = {
  val stageId = stageCompleted.stageInfo.stageId
  allocationManager.synchronized {
    stageIdToNumTasks -= stageId
    stageIdToNumRunningTask -= stageId
    stageIdToNumSpeculativeTasks -= stageId
    stageIdToTaskIndices -= stageId
    stageIdToSpeculativeTaskIndices -= stageId
    stageIdToExecutorPlacementHints -= stageId

    // Update the executor placement hints
    updateExecutorPlacementHints()

    // If this is the last stage with pending tasks, mark the scheduler queue as empty
    // This is needed in case the stage is aborted for any reason
    if (stageIdToNumTasks.isEmpty && stageIdToNumSpeculativeTasks.isEmpty) {
      allocationManager.onSchedulerQueueEmpty()
    }
  }
}
Task scheduling and execution
// Continuing from backend.reviveOffers() inside TaskSchedulerImpl.submitTasks(taskSet: TaskSet)
// CoarseGrainedSchedulerBackend.scala
override def reviveOffers() {
  driverEndpoint.send(ReviveOffers)
}

// CoarseGrainedSchedulerBackend performs task scheduling when it receives
// RegisterExecutor or ReviveOffers.
// CoarseGrainedSchedulerBackend.scala
// Make fake resource offers on all executors
private def makeOffers() {
  // Make sure no executor is killed while some task is launching on it
  val taskDescs = CoarseGrainedSchedulerBackend.this.synchronized {
    // Filter out executors under killing
    val activeExecutors = executorDataMap.filterKeys(executorIsAlive)
    val workOffers = activeExecutors.map {
      case (id, executorData) =>
        new WorkerOffer(id, executorData.executorHost, executorData.freeCores)
    }.toIndexedSeq
    scheduler.resourceOffers(workOffers)
  }
  if (!taskDescs.isEmpty) {
    launchTasks(taskDescs)
  }
}
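A minimal sketch of what makeOffers assembles before calling resourceOffers: a snapshot of free cores on the alive executors turned into offers (ExecutorInfo and Offer are illustrative types, not Spark's):

object MakeOffersSketch {
  final case class ExecutorInfo(host: String, freeCores: Int, alive: Boolean)
  final case class Offer(executorId: String, host: String, cores: Int)

  // Filter out dead executors and describe the rest as schedulable resources.
  def makeOffers(executors: Map[String, ExecutorInfo]): IndexedSeq[Offer] =
    executors.collect {
      case (id, e) if e.alive => Offer(id, e.host, e.freeCores)
    }.toIndexedSeq

  def main(args: Array[String]): Unit = {
    val executors = Map(
      "exec-1" -> ExecutorInfo("host-a", freeCores = 4, alive = true),
      "exec-2" -> ExecutorInfo("host-b", freeCores = 0, alive = true),
      "exec-3" -> ExecutorInfo("host-c", freeCores = 8, alive = false))
    makeOffers(executors).foreach(println) // offers for exec-1 and exec-2 only
  }
}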
// TaskSchedulerImpl.scala
/**
 * Called by cluster manager to offer resources on slaves. We respond by asking our active task
 * sets for tasks in order of priority. We fill each node with tasks in a round-robin manner so
 * that tasks are balanced across the cluster.
 */
def resourceOffers(offers: IndexedSeq[WorkerOffer]): Seq[Seq[TaskDescription]] = synchronized {
  // Mark each slave as alive and remember its hostname
  // Also track if new executor is added
  var newExecAvail = false
  for (o <- offers) {
    if (!hostToExecutors.contains(o.host)) {
      hostToExecutors(o.host) = new HashSet[String]()
    }
    if (!executorIdToRunningTaskIds.contains(o.executorId)) {
      hostToExecutors(o.host) += o.executorId
      executorAdded(o.executorId, o.host)
      executorIdToHost(o.executorId) = o.host
      executorIdToRunningTaskIds(o.executorId) = HashSet[Long]()
      newExecAvail = true
    }
    for (rack <- getRackForHost(o.host)) {
      hostsByRack.getOrElseUpdate(rack, new HashSet[String]()) += o.host
    }
  }

  // Before making any offers, remove any nodes from the blacklist whose blacklist has expired. Do
  // this here to avoid a separate thread and added synchronization overhead, and also because
  // updating the blacklist is only relevant when task offers are being made.
  blacklistTrackerOpt.foreach(_.applyBlacklistTimeout())

  val filteredOffers = blacklistTrackerOpt.map { blacklistTracker =>
    offers.filter { offer =>
      !blacklistTracker.isNodeBlacklisted(offer.host) &&
        !blacklistTracker.isExecutorBlacklisted(offer.executorId)
    }
  }.getOrElse(offers)

  val shuffledOffers = shuffleOffers(filteredOffers)
  // Build a list of tasks to assign to each worker.
  val tasks = shuffledOffers.map(o => new ArrayBuffer[TaskDescription](o.cores / CPUS_PER_TASK))
  val availableCpus = shuffledOffers.map(o => o.cores).toArray
  val sortedTaskSets = rootPool.getSortedTaskSetQueue
  for (taskSet <- sortedTaskSets) {
    logDebug("parentName: %s, name: %s, runningTasks: %s".format(
      taskSet.parent.name, taskSet.name, taskSet.runningTasks))
    if (newExecAvail) {
      taskSet.executorAdded()
    }
  }

  // Take each TaskSet in our scheduling order, and then offer it each node in increasing order
  // of locality levels so that it gets a chance to launch local tasks on all of them.
  // NOTE: the preferredLocality order: PROCESS_LOCAL, NODE_LOCAL, NO_PREF, RACK_LOCAL, ANY
  for (taskSet <- sortedTaskSets) {
    var launchedAnyTask = false
    var launchedTaskAtCurrentMaxLocality = false
    for (currentMaxLocality <- taskSet.myLocalityLevels) {
      do {
        launchedTaskAtCurrentMaxLocality = resourceOfferSingleTaskSet(
          taskSet, currentMaxLocality, shuffledOffers, availableCpus, tasks)
        launchedAnyTask |= launchedTaskAtCurrentMaxLocality
      } while (launchedTaskAtCurrentMaxLocality)
    }
    if (!launchedAnyTask) {
      taskSet.abortIfCompletelyBlacklisted(hostToExecutors)
    }
  }

  if (tasks.size > 0) {
    hasLaunchedTask = true
  }
  return tasks
}
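A rough sketch of the scheduling loop above (invented names, greatly simplified): widen the allowed locality level step by step and, at each level, keep offering while tasks are still being launched.

import scala.collection.mutable

object LocalityLoopSketch {
  // Locality levels in increasing "distance", as in the NOTE above.
  val levels = Seq("PROCESS_LOCAL", "NODE_LOCAL", "NO_PREF", "RACK_LOCAL", "ANY")

  // A stand-in for resourceOfferSingleTaskSet: launch at most one pending task whose preferred
  // level is within the current maximum, and report whether anything launched.
  def offerOnce(pending: mutable.Buffer[String], maxLevel: String): Boolean = {
    val allowed = levels.take(levels.indexOf(maxLevel) + 1).toSet
    pending.indexWhere(allowed) match {
      case -1 => false
      case i =>
        println(s"launched a ${pending.remove(i)} task (allowed up to $maxLevel)")
        true
    }
  }

  def main(args: Array[String]): Unit = {
    // Pending tasks labelled by their best possible locality.
    val pending = mutable.Buffer("NODE_LOCAL", "PROCESS_LOCAL", "ANY")
    // Same shape as the loop in resourceOffers: for each level, offer until nothing launches.
    for (maxLevel <- levels) {
      var launched = true
      while (launched) { launched = offerOnce(pending, maxLevel) }
    }
  }
}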
// TaskSetManager.scala: receives the offered resource and starts a task on it
/**
 * Respond to an offer of a single executor from the scheduler by finding a task
 *
 * NOTE: this function is either called with a maxLocality which
 * would be adjusted by delay scheduling algorithm or it will be with a special
 * NO_PREF locality which will be not modified
 *
 * @param execId the executor Id of the offered resource
 * @param host the host Id of the offered resource
 * @param maxLocality the maximum locality we want to schedule the tasks at
 */
@throws[TaskNotSerializableException]
def resourceOffer(
    execId: String,
    host: String,
    maxLocality: TaskLocality.TaskLocality): Option[TaskDescription] = {
  val offerBlacklisted = taskSetBlacklistHelperOpt.exists { blacklist =>
    blacklist.isNodeBlacklistedForTaskSet(host) ||
      blacklist.isExecutorBlacklistedForTaskSet(execId)
  }
  if (!isZombie && !offerBlacklisted) {
    val curTime = clock.getTimeMillis()

    var allowedLocality = maxLocality

    if (maxLocality != TaskLocality.NO_PREF) {
      allowedLocality = getAllowedLocalityLevel(curTime)
      if (allowedLocality > maxLocality) {
        // We're not allowed to search for farther-away tasks
        allowedLocality = maxLocality
      }
    }

    dequeueTask(execId, host, allowedLocality).map { case ((index, taskLocality, speculative)) =>
      // Found a task; do some bookkeeping and return a task description
      val task = tasks(index)
      val taskId = sched.newTaskId()
      // Do various bookkeeping
      copiesRunning(index) += 1
      val attemptNum = taskAttempts(index).size
      val info = new TaskInfo(taskId, index, attemptNum, curTime,
        execId, host, taskLocality, speculative)
      taskInfos(taskId) = info
      taskAttempts(index) = info :: taskAttempts(index)
      // Update our locality level for delay scheduling
      // NO_PREF will not affect the variables related to delay scheduling
      if (maxLocality != TaskLocality.NO_PREF) {
        currentLocalityIndex = getLocalityIndex(taskLocality)
        lastLaunchTime = curTime
      }
      // Serialize and return the task
      val serializedTask: ByteBuffer = try {
        ser.serialize(task)
      } catch {
        // If the task cannot be serialized, then there's no point to re-attempt the task,
        // as it will always fail. So just abort the whole task-set.
        case NonFatal(e) =>
          val msg = s"Failed to serialize task $taskId, not attempting to retry it."
          logError(msg, e)
          abort(s"$msg Exception during serialization: $e")
          throw new TaskNotSerializableException(e)
      }
      if (serializedTask.limit() > TaskSetManager.TASK_SIZE_TO_WARN_KB * 1024 &&
          !emittedTaskSizeWarning) {
        emittedTaskSizeWarning = true
        logWarning(s"Stage ${task.stageId} contains a task of very large size " +
          s"(${serializedTask.limit() / 1024} KB). The maximum recommended task size is " +
          s"${TaskSetManager.TASK_SIZE_TO_WARN_KB} KB.")
      }
      addRunningTask(taskId)

      // We used to log the time it takes to serialize the task, but task size is already
      // a good proxy to task serialization time.
      // val timeTaken = clock.getTime() - startTime
      val taskName = s"task ${info.id} in stage ${taskSet.id}"
      logInfo(s"Starting $taskName (TID $taskId, $host, executor ${info.executorId}, " +
        s"partition ${task.partitionId}, $taskLocality, ${serializedTask.limit()} bytes)")

      sched.dagScheduler.taskStarted(task, info)
      new TaskDescription(
        taskId,
        attemptNum,
        execId,
        taskName,
        index,
        addedFiles,
        addedJars,
        task.localProperties,
        serializedTask)
    }
  } else {
    None
  }
}
/**
 * Called by the TaskSetManager to report task's starting.
 */
def taskStarted(task: Task[_], taskInfo: TaskInfo) {
  eventProcessLoop.post(BeginEvent(task, taskInfo))
}
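getAllowedLocalityLevel, referenced above, implements delay scheduling: the task set only falls back to a less local level after failing to launch anything at the current level for longer than that level's configured wait. A rough standalone approximation of the idea, with invented names and the pending-task checks omitted:

object DelaySchedulingSketch {
  val levels = Vector("PROCESS_LOCAL", "NODE_LOCAL", "RACK_LOCAL", "ANY")

  // Walk toward less local levels while the time since the last launch at the current level
  // exceeds that level's wait; the elapsed wait is "credited" before moving on.
  def allowedLevel(currentIndex: Int, lastLaunchTime: Long, now: Long,
                   waitMs: Map[String, Long]): Int = {
    var index = currentIndex
    var sinceLastLaunch = now - lastLaunchTime
    while (index < levels.length - 1 && sinceLastLaunch >= waitMs(levels(index))) {
      sinceLastLaunch -= waitMs(levels(index))
      index += 1
    }
    index
  }

  def main(args: Array[String]): Unit = {
    val waits = Map("PROCESS_LOCAL" -> 3000L, "NODE_LOCAL" -> 3000L, "RACK_LOCAL" -> 3000L)
    // 4 seconds without a launch: give up on PROCESS_LOCAL, but NODE_LOCAL is still allowed.
    println(levels(allowedLevel(0, lastLaunchTime = 0L, now = 4000L, waits))) // NODE_LOCAL
  }
}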