Now we will dive into Spark internals using the simple word-count example below (later articles will reference this one by default).
val sparkConf = new org.apache.spark.SparkConf().setAppName("ScalaWordCount")
sparkConf.setMaster("local[2]") //-local[*] by default
//-log the full lineage (all dependencies) of the final RDD when a job runs
sparkConf.set("spark.logLineage", "true")
val sc = new org.apache.spark.SparkContext(sparkConf)
val file = "../spark-1.4.1/examples/src/main/resources/CHANGES.txt"
//-lineage of the final RDD, as printed by spark.logLineage:
// (1) ShuffledRDD[4] at reduceByKey at ScalaWordCount.scala:44 []
// +-(1) MapPartitionsRDD[3] at map at ScalaWordCount.scala:42 []
// | MapPartitionsRDD[2] at flatMap at ScalaWordCount.scala:41 []
// | MapPartitionsRDD[1] at textFile at ScalaWordCount.scala:34 []
// | ../spark-1.4.1/examples/src/main/resources/CHANGES.txt HadoopRDD[0] at textFile at ScalaWordCount.scala:34 []
val rdd = sc.textFile(file) //-MapPartitionsRDD[1]-->HadoopRDD[0], examples/src/....Changes.txt
val fmrdd = rdd.flatMap(_.split(" ")) //-MapPartitionsRDD[2]
val maprdd = fmrdd.map((_,1)) //-MapPartitionsRDD[3]
//-the input file is about 584 KB
val rstrdd = maprdd.reduceByKey((x,y) => x+y) //-ShuffledRDD[4]; reduceByKey() is a transformation, it does not trigger any real computation
val arr = rstrdd.collect() //-collect() is the action that actually triggers the job
val max = 10 //-instead of Integer.MAX_VALUE
var count = 0
if (arr.length > max) { //-to limit the result size, rdd.take(n) or rdd.top(n) is the better choice
  println(s"*reduce output to limit ${max},found ${arr.length}")
}
//-iterate the results (in no particular order)
for ((ele, num) <- arr if count < max) {
  count += 1
  println(ele + "," + num)
}
overview of the job submitting flow
communication figure of a job
sequence figure of the map-side computation
pseudo-code view corresponding to the Spark source
======
several core code paths
-job submitting method in DAGScheduler
/**-handles the JobSubmitted event */
private[scheduler] def handleJobSubmitted(jobId: Int,
finalRDD: RDD[_],
func: (TaskContext, Iterator[_]) => _, //-e.g. Spark's internal collecting func for collect(), i.e. it.toArray
partitions: Array[Int], //-from the final rdd's partitions, see SparkContext#runJob()-L1979
allowLocal: Boolean,
callSite: CallSite,
listener: JobListener,
properties: Properties) {
var finalStage: ResultStage = null
try {
// New stage creation may throw an exception if, for example, jobs are run on a
// HadoopRDD whose underlying HDFS files have been deleted. -->>the parent ShuffleMapStages are also created here<<
finalStage = newResultStage(finalRDD, partitions.size, jobId, callSite)
} catch {
case e: Exception =>
logWarning("Creating new stage failed due to exception - job: " + jobId, e)
listener.jobFailed(e)
return
}
if (finalStage != null) {
val job = new ActiveJob(jobId, finalStage, func, partitions, callSite, listener, properties) //-one job per action
clearCacheLocs()
logInfo("Got job %s (%s) with %d output partitions (allowLocal=%s)".format(
job.jobId, callSite.shortForm, partitions.length, allowLocal))
logInfo("Final stage: " + finalStage + "(" + finalStage.name + "),rdd " + finalRDD)//-ShuffleRDD for ScalaWordCount
logInfo("Parents of final stage: " + finalStage.parents)
logInfo("Missing parents: " + getMissingParentStages(finalStage))
val shouldRunLocally =
localExecutionEnabled && allowLocal && finalStage.parents.isEmpty && partitions.length == 1
val jobSubmissionTime = clock.getTimeMillis()
if (shouldRunLocally) {
// Compute very short actions like first() or take() with no parent stages locally.
listenerBus.post(
SparkListenerJobStart(job.jobId, jobSubmissionTime, Seq.empty, properties))
runLocally(job) //-run in the action's own process on the local host; a mini job runner with no job/task scheduling
} else { //-e.g. our reduceByKey() job takes this branch
jobIdToActiveJob(jobId) = job
activeJobs += job
finalStage.resultOfJob = Some(job)
val stageIds = jobIdToStageIds(jobId).toArray
val stageInfos = stageIds.flatMap(id => stageIdToStage.get(id).map(_.latestInfo)) //-the job's stages (2 for our word count)
listenerBus.post( //-below is similar to runLocally()
SparkListenerJobStart(job.jobId, jobSubmissionTime, stageInfos, properties))
submitStage(finalStage) //-how is the ResultStage submitted? see below; how are a job's map operations completed? RDD#compute() drives them iteratively via iterator()
}
}
submitWaitingStages() //-check whether any waiting stages to submit
}
/** Submits stage, but first *recursively* submits any missing parents. -i.e. parents run from earlier to later in execution order */
private def submitStage(stage: Stage) {
val jobId = activeJobForStage(stage)
if (jobId.isDefined) {
logInfo("*submitStage(" + stage + ")")
if (!waitingStages(stage) && !runningStages(stage) && !failedStages(stage)) {
val missing = getMissingParentStages(stage).sortBy(_.id) //-from older stages to newer; the stages this one depends on
logInfo("-*missing: " + missing)
if (missing.isEmpty) { //-only the root/first stages are submitted directly
logInfo("--*Submitting " + stage + " (" + stage.rdd + "), which has no missing parents")
submitMissingTasks(stage, jobId.get) //-a root stage, so submit all of its tasks: no unmet dependencies
} else {
for (parent <- missing) { //-not leaf node,recursively calling self
submitStage(parent)
}
//-there is a parent stage, so park this stage (the ResultStage in our word count) in the waiting queue;
//-it will be scheduled after the prior stages finish, see this.onReceive() > CompletionEvent handling
waitingStages += stage //-stages are submitted one by one, so the child stage is kept here
}
}
} else {
abortStage(stage, "No active job for stage " + stage.id)
}
}
the DAG has been generated; next the stage is split into tasks, and the tasks are submitted at the end, see below
/** Called when stage's parents are available and we can now do its task. */
private def submitMissingTasks(stage: Stage, jobId: Int) {
logDebug("submitMissingTasks(" + stage + ")")
// Get our pending tasks and remember them in our pendingTasks entry
stage.pendingTasks.clear()
// First figure out the indexes of partition ids to compute.-empty or unfinished partitions
val partitionsToCompute: Seq[Int] = { //-control how many tasks will be generated
stage match {
case stage: ShuffleMapStage =>
(0 until stage.numPartitions).filter(id => stage.outputLocs(id).isEmpty)
case stage: ResultStage =>
val job = stage.resultOfJob.get
(0 until job.numPartitions).filter(id => !job.finished(id))
}
}
val properties = jobIdToActiveJob.get(stage.jobId).map(_.properties).orNull
//-mark current running stage
runningStages += stage
// SparkListenerStageSubmitted should be posted before testing whether tasks are
// serializable. If tasks are not serializable, a SparkListenerStageCompleted event
// will be posted, which should always come after a corresponding SparkListenerStageSubmitted
// event.
stage.latestInfo = StageInfo.fromStage(stage, Some(partitionsToCompute.size))
outputCommitCoordinator.stageStart(stage.id)
listenerBus.post(SparkListenerStageSubmitted(stage.latestInfo, properties))
// TODO: Maybe we can keep the taskBinary in Stage to avoid serializing it multiple times.
// Broadcasted binary for the task, used to dispatch tasks to executors. Note that we broadcast
// the serialized copy of the RDD and for each task we will deserialize it, which means each
// task gets a different copy of the RDD. This provides stronger isolation between tasks that
// might modify state of objects referenced in their closures. This is necessary in Hadoop
// where the JobConf/Configuration object is not thread-safe.
var taskBinary: Broadcast[Array[Byte]] = null
try {
// For ShuffleMapTask, serialize and broadcast (rdd, shuffleDep).
// For ResultTask, serialize and broadcast (rdd, func).
val taskBinaryBytes: Array[Byte] = stage match {
case stage: ShuffleMapStage =>
closureSerializer.serialize((stage.rdd, stage.shuffleDep): AnyRef).array() //-use stage's rdd as task's rdd
case stage: ResultStage =>
closureSerializer.serialize((stage.rdd, stage.resultOfJob.get.func): AnyRef).array()
}
taskBinary = sc.broadcast(taskBinaryBytes)
} catch {
// In the case of a failure during serialization, abort the stage.
case e: NotSerializableException =>
abortStage(stage, "Task not serializable: " + e.toString)
runningStages -= stage
// Abort execution
return
case NonFatal(e) =>
abortStage(stage, s"Task serialization failed: $e\n${e.getStackTraceString}")
runningStages -= stage
return
}
//-note: task generation policy
val tasks: Seq[Task[_]] = try {
stage match {
case stage: ShuffleMapStage => //-intermediate stage,see below
partitionsToCompute.map { id => //-one task per missing partition (the partitioning may derive from the root rdd's partitions)
val locs = getPreferredLocs(stage.rdd, id)//-the preferred locations of this partition, similar to hdfs block replicas
val part = stage.rdd.partitions(id) //-access the partition with index id
new ShuffleMapTask(stage.id, taskBinary, part, locs) //-corresponding task
}
case stage: ResultStage => //-final stage
val job = stage.resultOfJob.get
partitionsToCompute.map { id =>
val p: Int = job.partitions(id) //-a map from job partition to stage.rdd's one
val part = stage.rdd.partitions(p)
val locs = getPreferredLocs(stage.rdd, p) //-the replica hosts of the same block? yes
//-track info
for(par <- stage.rdd.partitions)
logInfo("-part/rdd:" + par + "/" + stage.rdd)
for(loc <- locs)
logInfo("-loc %s".format(loc))
//-the number of ResultTasks depends on the number of partitions
new ResultTask(stage.id, taskBinary, part, locs, id) //-corresponding task
}
}
} catch {
case NonFatal(e) =>
abortStage(stage, s"Task creation failed: $e\n${e.getStackTraceString}")
runningStages -= stage
return
}
if (tasks.size > 0) { //-1 task in our reduceByKey() case
logInfo("Submitting " + tasks.size + " missing tasks from " + stage + " (" + stage.rdd + ")")
stage.pendingTasks ++= tasks
logDebug("New pending tasks: " + stage.pendingTasks)
taskScheduler.submitTasks( //-hand the tasks over to TaskSchedulerImpl for scheduling
new TaskSet(tasks.toArray, stage.id, stage.newAttemptId(), stage.jobId, properties))
stage.latestInfo.submissionTime = Some(clock.getTimeMillis())
} else {
// Because we posted SparkListenerStageSubmitted earlier, we should mark
// the stage as completed here in case there are no tasks to run
markStageAsFinished(stage, None)
val debugString = stage match {
case stage: ShuffleMapStage =>
s"Stage ${stage} is actually done; " +
s"(available: ${stage.isAvailable}," +
s"available outputs: ${stage.numAvailableOutputs}," +
s"partitions: ${stage.numPartitions})"
case stage : ResultStage =>
s"Stage ${stage} is actually done; (partitions: ${stage.numPartitions})"
}
logDebug(debugString)
}
}
-ShuffleMapTask core method
override def runTask(context: TaskContext): MapStatus = {
// Deserialize the RDD using the broadcast variable.
val deserializeStartTime = System.currentTimeMillis()
val ser = SparkEnv.get.closureSerializer.newInstance()
//-for the task serialization see DAGScheduler#submitMissingTasks(); the rdd is the last rdd of this (ShuffleMap)stage
val (rdd, dep) = ser.deserialize[(RDD[_], ShuffleDependency[_, _, _])]( //-restore the stage's final rdd and its shuffle dependency
ByteBuffer.wrap(taskBinary.value), Thread.currentThread.getContextClassLoader)
_executorDeserializeTime = System.currentTimeMillis() - deserializeStartTime
metrics = Some(context.taskMetrics)
var writer: ShuffleWriter[Any, Any] = null
try {
val manager = SparkEnv.get.shuffleManager //-SortShuffleManager by default
//-'SortShuffleWriter'
writer = manager.getWriter[Any, Any](dep.shuffleHandle, partitionId, context) //-'BaseShuffleHandle'
//-first, compute the user's job logic (e.g. the map side of reduceByKey()) via rdd.iterator(); then write out the result
logInfo("precomputing,task:" + toString + ",dep:" + dep
+ ",handle:" + dep.shuffleHandle +",part:" +partitionId + ",rdd " + rdd)
writer.write(rdd.iterator(partition, context).asInstanceOf[Iterator[_ <: Product2[Any, Any]]])//-iterator() recurses up the rdd chain
return writer.stop(success = true).get //-release resources then return MapStatus,see SortShuffleWriter#write()
} catch {
case e: Exception =>
try {
if (writer != null) {
writer.stop(success = false)
}
} catch {
case e: Exception =>
log.debug("Could not stop writer", e)
}
throw e
}
}