7. Job (initialized after SparkContext)
(1) An Application may contain multiple Jobs. Each action triggers one Job, and only after the previous Job finishes does the driver code continue downward and trigger the next one.
(2) Take wordCount as an example: textFile -> flatMap (defined in the base class RDD) -> map (defined in the base class RDD) -> reduceByKey -> foreach. A runnable sketch follows below.
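For reference, a minimal wordCount driver that exercises exactly this chain (the app name and input path are illustrative placeholders):

// Minimal wordCount driver; app name and input path are placeholders.
import org.apache.spark.{SparkConf, SparkContext}

object WordCount {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("wordCount"))
    sc.textFile("hdfs:///tmp/input.txt")   // HadoopRDD -> MapPartitionsRDD
      .flatMap(_.split(" "))               // defined in RDD
      .map((_, 1))                         // defined in RDD
      .reduceByKey(_ + _)                  // via implicit PairRDDFunctions
      .foreach(println)                    // the action that triggers the Job
    sc.stop()
  }
}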
/**
* Read a text file from HDFS, a local file system (available on all nodes), or any
* Hadoop-supported file system URI, and return it as an RDD of Strings. (HadoopRDD.map -> MapPartitionsRDD)
* @param path path to the text file on a supported file system
* @param minPartitions suggested minimum number of partitions for the resulting RDD
* @return RDD of lines of the text file
*/
def textFile(
    path: String,
    minPartitions: Int = defaultMinPartitions): RDD[String] = withScope {
  assertNotStopped()
  hadoopFile(path, classOf[TextInputFormat], classOf[LongWritable], classOf[Text],
    minPartitions).map(pair => pair._2.toString).setName(path) // pair is the key/value pair read from the HDFS file: <offset, line text>
}
/** Get an RDD for a Hadoop file with an arbitrary InputFormat
*
* @note Because Hadoop's RecordReader class re-uses the same Writable object for each
* record, directly caching the returned RDD or directly passing it to an aggregation or shuffle
* operation will create many references to the same object.
* If you plan to directly cache, sort, or aggregate Hadoop writable objects, you should first
* copy them using a `map` function.
* @param path directory to the input data files, the path can be comma separated paths
* as a list of inputs
* @param inputFormatClass storage format of the data to be read
* @param keyClass `Class` of the key associated with the `inputFormatClass` parameter
* @param valueClass `Class` of the value associated with the `inputFormatClass` parameter
* @param minPartitions suggested minimum number of partitions for the resulting RDD
* @return RDD of tuples of key and corresponding value
*/
def hadoopFile[K, V](
    path: String,
    inputFormatClass: Class[_ <: InputFormat[K, V]],
    keyClass: Class[K],
    valueClass: Class[V],
    minPartitions: Int = defaultMinPartitions): RDD[(K, V)] = withScope {
  assertNotStopped()
  // This is a hack to enforce loading hdfs-site.xml.
  // See SPARK-11227 for details.
  FileSystem.getLocal(hadoopConfiguration)
  // A Hadoop configuration can be about 10 KB, which is pretty big, so broadcast it (a shared broadcast variable).
  val confBroadcast = broadcast(new SerializableConfiguration(hadoopConfiguration))
  // Anonymous function that sets the input paths on the JobConf.
  val setInputPathsFunc = (jobConf: JobConf) => FileInputFormat.setInputPaths(jobConf, path)
  // Create the initial RDD.
  new HadoopRDD(
    this,
    confBroadcast,
    Some(setInputPathsFunc),
    inputFormatClass,
    keyClass,
    valueClass,
    minPartitions).setName(path)
}
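textFile is just sugar over this call; invoking hadoopFile directly looks like the sketch below (the map(_._2.toString) copy also sidesteps the Writable-reuse caveat from the doc comment above; sc and the path are assumed from the wordCount example):

import org.apache.hadoop.io.{LongWritable, Text}
import org.apache.hadoop.mapred.TextInputFormat

// Equivalent of sc.textFile(path): read <offset, line> pairs, keep the line text.
val lines = sc.hadoopFile("hdfs:///tmp/input.txt",
    classOf[TextInputFormat], classOf[LongWritable], classOf[Text])
  .map(_._2.toString)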
// How is reduceByKey invoked? It is not defined on RDD itself, so the call triggers the implicit conversion RDD -> PairRDDFunctions, and reduceByKey is then called on the resulting PairRDDFunctions.
// The following implicit functions were in SparkContext before 1.3 and users had to
// `import SparkContext._` to enable them. Now we move them here to make the compiler find
// them automatically. However, we still keep the old functions in SparkContext for backward
// compatibility and forward to the following functions directly.
implicit def rddToPairRDDFunctions[K, V](rdd: RDD[(K, V)])
    (implicit kt: ClassTag[K], vt: ClassTag[V], ord: Ordering[K] = null): PairRDDFunctions[K, V] = {
  new PairRDDFunctions(rdd)
}
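What the compiler effectively does for the wordCount call, written out by hand (a sketch; since Spark 1.3 the implicit lives in the RDD companion object, so it is found without any import):

import org.apache.spark.rdd.RDD

// pairs.reduceByKey(_ + _) desugars to roughly this once the implicit is resolved:
def explicitReduceByKey(pairs: RDD[(String, Int)]): RDD[(String, Int)] =
  RDD.rddToPairRDDFunctions(pairs).reduceByKey(_ + _)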
/**
* Merge the values for each key using an associative and commutative reduce function. This will
* also perform the merging locally on each mapper before sending results to a reducer, similarly
* to a "combiner" in MapReduce.
*/
def reduceByKey(partitioner: Partitioner, func: (V, V) => V): RDD[(K, V)] = self.withScope {
  // func serves as both mergeValue and mergeCombiners; the createCombiner is the identity.
  combineByKeyWithClassTag[V]((v: V) => v, func, func, partitioner)
}
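For intuition, reduceByKey(_ + _) on (word, 1) pairs behaves like a map-side combine per partition followed by a merge after the shuffle (a plain-Scala analogy, not Spark code):

// Plain-Scala analogy of reduceByKey(_ + _) across two "partitions".
val partition1 = Seq("a" -> 1, "b" -> 1, "a" -> 1)
val partition2 = Seq("a" -> 1, "b" -> 1)
def combineLocally(p: Seq[(String, Int)]): Map[String, Int] =
  p.groupBy(_._1).map { case (k, vs) => k -> vs.map(_._2).sum }
// Map-side combine ("combiner" in MapReduce terms), then merge of combiners post-shuffle:
val merged = (combineLocally(partition1).toSeq ++ combineLocally(partition2).toSeq)
  .groupBy(_._1).map { case (k, vs) => k -> vs.map(_._2).sum }
// merged == Map("a" -> 3, "b" -> 2)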
// The most important piece: the action, RDD.foreach
/**
* Applies a function f to all elements of this RDD.
*/
def foreach(f: T => Unit): Unit = withScope {
val cleanF = sc.clean(f)
  sc.runJob(this, (iter: Iterator[T]) => iter.foreach(cleanF)) // goes through a chain of overloaded runJob methods (see below)
}
/**
* Run a function on a given set of partitions in an RDD and pass the results to the given
* handler function. This is the main entry point for all actions in Spark.
*
* @param rdd target RDD to run tasks on
* @param func a function to run on each partition of the RDD
* @param partitions set of partitions to run on; some jobs may not want to compute on all
* partitions of the target RDD, e.g. for operations like `first()`
* @param resultHandler callback to pass each result to
*/
def runJob[T, U: ClassTag](
rdd: RDD[T],
func: (TaskContext, Iterator[T]) => U,
partitions: Seq[Int],
resultHandler: (Int, U) => Unit): Unit = {
  if (stopped.get()) {
    throw new IllegalStateException("SparkContext has been shutdown")
  }
  val callSite = getCallSite
  val cleanedFunc = clean(func)
  logInfo("Starting job: " + callSite.shortForm)
  if (conf.getBoolean("spark.logLineage", false)) {
    logInfo("RDD's recursive dependencies:\n" + rdd.toDebugString)
  }
  // KEY: invoke runJob on the DAGScheduler that was created during SparkContext initialization
  dagScheduler.runJob(rdd, cleanedFunc, partitions, callSite, resultHandler, localProperties.get)
  progressBar.foreach(_.finishAll())
rdd.doCheckpoint()
}
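The overload chain from foreach down to the entry point above looks roughly like this (condensed from SparkContext; bodies trimmed to the essential forwarding):

// runJob(rdd, func): run on all partitions
def runJob[T, U: ClassTag](rdd: RDD[T], func: Iterator[T] => U): Array[U] =
  runJob(rdd, func, 0 until rdd.partitions.length)

// wraps the iterator function so it ignores the TaskContext
def runJob[T, U: ClassTag](rdd: RDD[T], func: Iterator[T] => U, partitions: Seq[Int]): Array[U] = {
  val cleanedFunc = clean(func)
  runJob(rdd, (ctx: TaskContext, it: Iterator[T]) => cleanedFunc(it), partitions)
}

// collects per-partition results into an array via the resultHandler callback
def runJob[T, U: ClassTag](rdd: RDD[T], func: (TaskContext, Iterator[T]) => U, partitions: Seq[Int]): Array[U] = {
  val results = new Array[U](partitions.size)
  runJob[T, U](rdd, func, partitions, (index, res) => results(index) = res)
  results
}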
(3) DAGScheduler
[1] Stage division
[Note] How DAGScheduler divides Stages
Starting from the RDD on which the Job-triggering action was called, walk the lineage backwards. A Stage is first created for that final RDD. While walking backwards, whenever a wide (shuffle) dependency on some RDD is encountered, a new Stage is created and that wide-dependency RDD becomes the last RDD of the new Stage; otherwise the walk simply continues. This repeats until every RDD has been traversed. In one sentence: wide dependencies are the Stage boundaries.
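Applied to the wordCount example, the backward walk from foreach splits the lineage at the shuffle introduced by reduceByKey, yielding two stages:

Stage 0 (ShuffleMapStage): HadoopRDD -> MapPartitionsRDD (textFile) -> MapPartitionsRDD (flatMap) -> MapPartitionsRDD (map)
    --- shuffle (wide dependency of reduceByKey) ---
Stage 1 (ResultStage, the finalStage): ShuffledRDD -> foreach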
/**
* Run an action job on the given RDD and pass all the results to the resultHandler function as
* they arrive.
*
* @param rdd target RDD to run tasks on
* @param func a function to run on each partition of the RDD
* @param partitions set of partitions to run on; some jobs may not want to compute on all
* partitions of the target RDD, e.g. for operations like first()
* @param callSite where in the user program this job was called
* @param resultHandler callback to pass each result to
* @param properties scheduler properties to attach to this job, e.g. fair scheduler pool name
*
* @note Throws `Exception` when the job fails
*/
def runJob[T, U](
rdd: RDD[T],
func: (TaskContext, Iterator[T]) => U,
partitions: Seq[Int],
callSite: CallSite,
resultHandler: (Int, U) => Unit,
properties: Properties): Unit = {
  val start = System.nanoTime
  val waiter = submitJob(rdd, func, partitions, callSite, resultHandler, properties)
  ThreadUtils.awaitReady(waiter.completionFuture, Duration.Inf)
  waiter.completionFuture.value.get match {
    case scala.util.Success(_) =>
      logInfo("Job %d finished: %s, took %f s".format
        (waiter.jobId, callSite.shortForm, (System.nanoTime - start) / 1e9))
    case scala.util.Failure(exception) =>
      logInfo("Job %d failed: %s, took %f s".format
        (waiter.jobId, callSite.shortForm, (System.nanoTime - start) / 1e9))
      // SPARK-8644: Include user stack trace in exceptions coming from DAGScheduler.
      val callerStackTrace = Thread.currentThread().getStackTrace.tail
      exception.setStackTrace(exception.getStackTrace ++ callerStackTrace)
      throw exception
}
}
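The blocking pattern above is simply a promise that the scheduler's event loop completes later (a hypothetical miniature of JobWaiter, not the real class):

import scala.concurrent.{Await, Promise}
import scala.concurrent.duration.Duration

// Hypothetical miniature: runJob blocks on a future that the event loop
// completes when the last task of the job finishes (or fails).
class MiniJobWaiter {
  private val promise = Promise[Unit]()
  def completionFuture = promise.future
  def allTasksSucceeded(): Unit = promise.success(())
  def jobFailed(e: Exception): Unit = promise.failure(e)
}

val waiter = new MiniJobWaiter
waiter.allTasksSucceeded()              // normally done by the event loop
Await.ready(waiter.completionFuture, Duration.Inf)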
/**
* Submit an action job to the scheduler.
*
* @param rdd target RDD to run tasks on
* @param func a function to run on each partition of the RDD
* @param partitions set of partitions to run on; some jobs may not want to compute on all
* partitions of the target RDD, e.g. for operations like first()
* @param callSite where in the user program this job was called
* @param resultHandler callback to pass each result to
* @param properties scheduler properties to attach to this job, e.g. fair scheduler pool name
*
* @return a JobWaiter object that can be used to block until the job finishes executing
* or can be used to cancel the job.
*
* @throws IllegalArgumentException when partitions ids are illegal
*/
def submitJob[T, U](
rdd: RDD[T],
func: (TaskContext, Iterator[T]) => U,
partitions: Seq[Int],
callSite: CallSite,
resultHandler: (Int, U) => Unit,
properties: Properties): JobWaiter[U] = {
// Check to make sure we are not launching a task on a partition that does not exist.
  val maxPartitions = rdd.partitions.length
  partitions.find(p => p >= maxPartitions || p < 0).foreach { p =>
    throw new IllegalArgumentException(
      "Attempting to access a non-existent partition: " + p + ". " +
      "Total number of partitions: " + maxPartitions)
  }
val jobId = nextJobId.getAndIncrement()
if (partitions.size == 0) {
// Return immediately if the job is running 0 tasks
    return new JobWaiter[U](this, jobId, 0, resultHandler)
}
assert(partitions.size > 0)
val func2 = func.asInstanceOf[(TaskContext, Iterator[_]) => _]
val waiter = new JobWaiter(this, jobId, partitions.size, resultHandler)
//KEY
eventProcessLoop.post(JobSubmitted(
jobId, rdd, func2, partitions.toArray, callSite, waiter,
    SerializationUtils.clone(properties)))
waiter
}
private def doOnReceive(event: DAGSchedulerEvent): Unit = event match {
  case JobSubmitted(jobId, rdd, func, partitions, callSite, listener, properties) =>
    dagScheduler.handleJobSubmitted(jobId, rdd, func, partitions, callSite, listener, properties)
  //...
}
/**
 * The core entry point of the DAGScheduler's scheduling algorithm.
 */
private[scheduler] def handleJobSubmitted(jobId: Int,
    finalRDD: RDD[_],
    func: (TaskContext, Iterator[_]) => _,
    partitions: Array[Int],
    callSite: CallSite,
    listener: JobListener,
    properties: Properties) {
  // Create the finalStage from the last RDD, i.e. the one on which the Job-triggering action was called.
  var finalStage: ResultStage = null
  try {
    // New stage creation may throw an exception if, for example, jobs are run on a
    // HadoopRDD whose underlying HDFS files have been deleted.
    finalStage = createResultStage(finalRDD, func, partitions, jobId, callSite)
  } catch {
    case e: Exception =>
      logWarning("Creating new stage failed due to exception - job: " + jobId, e)
      listener.jobFailed(e)
      return
  }
  // Create a Job from the finalStage.
  val job = new ActiveJob(jobId, finalStage, callSite, listener, properties)
  clearCacheLocs()
  logInfo("Got job %s (%s) with %d output partitions".format(
    job.jobId, callSite.shortForm, partitions.length))
  logInfo("Final stage: " + finalStage + " (" + finalStage.name + ")")
  logInfo("Parents of final stage: " + finalStage.parents)
  logInfo("Missing parents: " + getMissingParentStages(finalStage))
  val jobSubmissionTime = clock.getTimeMillis()
  // Cache the Job in memory.
  jobIdToActiveJob(jobId) = job
  activeJobs += job
  finalStage.setActiveJob(job)
  val stageIds = jobIdToStageIds(jobId).toArray
  val stageInfos = stageIds.flatMap(id => stageIdToStage.get(id).map(_.latestInfo))
  listenerBus.post(
    SparkListenerJobStart(job.jobId, jobSubmissionTime, stageInfos, properties))
  // Submit the finalStage.
  // Effect: stage 0 is submitted, and all other stages are placed in the waitingStages queue.
  submitStage(finalStage)
}
/**
* Create a ResultStage associated with the provided jobId.
*/
private def createResultStage(
    rdd: RDD[_],
    func: (TaskContext, Iterator[_]) => _,
    partitions: Array[Int],
    jobId: Int,
    callSite: CallSite): ResultStage = {
  val parents = getOrCreateParentStages(rdd, jobId)
  val id = nextStageId.getAndIncrement()
  val stage = new ResultStage(id, rdd, func, partitions, parents, jobId, callSite)
  // Cache the stage in memory.
  stageIdToStage(id) = stage
  updateJobIdStageIdMaps(jobId, stage)
  stage
}
/** Submits stage, but first recursively submits any missing parents. */
private def submitStage(stage: Stage) {
  val jobId = activeJobForStage(stage)
  if (jobId.isDefined) {
    logDebug("submitStage(" + stage + ")")
    if (!waitingStages(stage) && !runningStages(stage) && !failedStages(stage)) {
      // KEY: get the missing parent stages of the current stage
      val missing = getMissingParentStages(stage).sortBy(_.id)
      logDebug("missing: " + missing)
      // submitStage() recurses until it reaches the earliest stage (the one with no missing parents);
      // only that stage is submitted now, while the later stages are queued in waitingStages along the way.
      if (missing.isEmpty) {
        logInfo("Submitting " + stage + " (" + stage.rdd + "), which has no missing parents")
        submitMissingTasks(stage, jobId.get)
      } else {
        // [Essence of the division algorithm] recursively call submitStage() to submit the parent stages first.
        for (parent <- missing) {
          submitStage(parent)
        }
        // Add the current stage to the waitingStages queue.
        waitingStages += stage
      }
    }
  } else {
    abortStage(stage, "No active job for stage " + stage.id, None)
  }
}
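A stripped-down model of this recursion, only to show the submission order (MiniStage and its fields are hypothetical; the real code also checks whether a parent is already available):

// Hypothetical miniature of submitStage: parents are submitted first;
// children wait in `waiting` until their parents finish.
case class MiniStage(id: Int, parents: List[MiniStage])

val waiting = scala.collection.mutable.Set[MiniStage]()

def submit(stage: MiniStage): Unit = {
  val missing = stage.parents.sortBy(_.id)
  if (missing.isEmpty) println(s"submitMissingTasks(stage ${stage.id})")
  else { missing.foreach(submit); waiting += stage }
}

// stage0 <- stage1 <- stage2 (finalStage): submitting stage2 runs
// stage0's tasks and leaves stage1 and stage2 in `waiting`.
val s0 = MiniStage(0, Nil); val s1 = MiniStage(1, List(s0))
submit(MiniStage(2, List(s1)))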
private def getMissingParentStages(stage: Stage): List[Stage] = {
  val missing = new HashSet[Stage]
  val visited = new HashSet[RDD[_]]
  // We are manually maintaining a stack here to prevent StackOverflowError
  // caused by recursively visiting
  // LIFO stack: walks the lineage backwards
  val waitingForVisit = new ArrayStack[RDD[_]]
  def visit(rdd: RDD[_]) {
    if (!visited(rdd)) {
      visited += rdd
      val rddHasUncachedPartitions = getCacheLocs(rdd).contains(Nil)
      if (rddHasUncachedPartitions) {
        // Iterate over the current RDD's dependencies.
        for (dep <- rdd.dependencies) {
          dep match {
            // Wide dependency: create a ShuffleMapStage from the wide-dependency RDD.
            // The finalStage itself is not a ShuffleMapStage, but every stage before it is.
            case shufDep: ShuffleDependency[_, _, _] =>
              val mapStage = getOrCreateShuffleMapStage(shufDep, stage.firstJobId)
              if (!mapStage.isAvailable) {
                missing += mapStage
              }
            // Narrow dependency: push the narrow-dependency RDD onto the stack, so
            // waitingForVisit stays non-empty and visit is called again.
            case narrowDep: NarrowDependency[_] =>
              waitingForVisit.push(narrowDep.rdd)
          }
        }
      }
    }
  }
  // First push the stage's last RDD onto the stack, then loop: pop and call the locally defined visit function.
  waitingForVisit.push(stage.rdd)
  while (waitingForVisit.nonEmpty) {
    visit(waitingForVisit.pop())
  }
  // Return the newly discovered stages.
  missing.toList
}
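The same explicit-stack pattern in isolation (a generic sketch, with a hypothetical Node type, of replacing recursion by a stack so that very long lineages cannot overflow the JVM call stack):

// Visit a graph of arbitrary depth without growing the JVM call stack.
case class Node(id: Int, children: Seq[Node])

def visitAll(root: Node)(f: Node => Unit): Unit = {
  val stack = scala.collection.mutable.Stack[Node](root)
  val seen = scala.collection.mutable.HashSet[Int]()
  while (stack.nonEmpty) {
    val n = stack.pop()
    if (seen.add(n.id)) { f(n); n.children.foreach(stack.push) }
  }
}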
/**
* Gets a shuffle map stage if one exists in shuffleIdToMapStage. Otherwise, if the
* shuffle map stage doesn't already exist, this method will create the shuffle map stage in
* addition to any missing ancestor shuffle map stages.
*/
private def getOrCreateShuffleMapStage(
    shuffleDep: ShuffleDependency[_, _, _],
    firstJobId: Int): ShuffleMapStage = {
  shuffleIdToMapStage.get(shuffleDep.shuffleId) match {
    case Some(stage) =>
      stage
    case None =>
      // Create stages for all missing ancestor shuffle dependencies.
      getMissingAncestorShuffleDependencies(shuffleDep.rdd).foreach { dep =>
        // Even though getMissingAncestorShuffleDependencies only returns shuffle dependencies
        // that were not already in shuffleIdToMapStage, it's possible that by the time we
        // get to a particular dependency in the foreach loop, it's been added to
        // shuffleIdToMapStage by the stage creation process for an earlier dependency. See
        // SPARK-13902 for more information.
        if (!shuffleIdToMapStage.contains(dep.shuffleId)) {
          createShuffleMapStage(dep, firstJobId)
        }
      }
      // Finally, create a stage for the given shuffle dependency.
      createShuffleMapStage(shuffleDep, firstJobId)
  }
}
[Summary]
<1> Work backwards from the finalStage.
<2> Split off a new stage at every wide dependency.
<3> Recursion submits parent stages first, turning the backward walk into forward submission order.
[2] Computing the best locations for a Stage's parallel Tasks
/** Called when stage's parents are available and we can now do its task. */
private def submitMissingTasks(stage: Stage, jobId: Int) {
  logDebug("submitMissingTasks(" + stage + ")")
  // First figure out the indexes of partition ids to compute.
  val partitionsToCompute: Seq[Int] = stage.findMissingPartitions()
// Use the scheduling pool, job group, description, etc. from an ActiveJob associated
// with this Stage
val properties = jobIdToActiveJob(jobId). properties
runningStages += stage
// SparkListenerStageSubmitted should be posted before testing whether tasks are
// serializable. If tasks are not serializable, a SparkListenerStageCompleted event
// will be posted, which should always come after a corresponding SparkListenerStageSubmitted
// event.
  stage match {
    case s: ShuffleMapStage =>
      outputCommitCoordinator.stageStart(stage = s.id, maxPartitionId = s.numPartitions - 1)
    case s: ResultStage =>
      outputCommitCoordinator.stageStart(
        stage = s.id, maxPartitionId = s.rdd.partitions.length - 1)
  }
  val taskIdToLocations: Map[Int, Seq[TaskLocation]] = try {
    stage match {
      case s: ShuffleMapStage =>
        partitionsToCompute.map { id => (id, getPreferredLocs(stage.rdd, id)) }.toMap
      case s: ResultStage =>
        partitionsToCompute.map { id =>
          val p = s.partitions(id)
          (id, getPreferredLocs(stage.rdd, p))
        }.toMap
    }
  } catch {
    case NonFatal(e) =>
      stage.makeNewStageAttempt(partitionsToCompute.size)
      listenerBus.post(SparkListenerStageSubmitted(stage.latestInfo, properties))
      abortStage(stage, s"Task creation failed: $e\n${Utils.exceptionString(e)}", Some(e))
      runningStages -= stage
      return
  }
  stage.makeNewStageAttempt(partitionsToCompute.size, taskIdToLocations.values.toSeq)
// If there are tasks to execute, record the submission time of the stage. Otherwise,
  // post the event without the submission time, which indicates that this stage was
// skipped.
  if (partitionsToCompute.nonEmpty) {
stage.latestInfo.submissionTime = Some(clock.getTimeMillis())
}
listenerBus.post(SparkListenerStageSubmitted(stage.latestInfo, properties))
  // TODO: Maybe we can keep the taskBinary in Stage to avoid serializing it multiple times.
// Broadcasted binary for the task, used to dispatch tasks to executors. Note that we broadcast
// the serialized copy of the RDD and for each task we will deserialize it, which means each
// task gets a different copy of the RDD. This provides stronger isolation between tasks that
// might modify state of objects referenced in their closures. This is necessary in Hadoop
// where the JobConf/Configuration object is not thread-safe.
var taskBinary: Broadcast[Array[Byte]] = null
var partitions: Array[Partition] = null
try {
// For ShuffleMapTask, serialize and broadcast (rdd, shuffleDep).
// For ResultTask, serialize and broadcast (rdd, func).
var taskBinaryBytes: Array[Byte] = null
// taskBinaryBytes and partitions are both effected by the checkpoint status. We need
// this synchronization in case another concurrent job is checkpointing this RDD, so we get a
// consistent view of both variables.
RDDCheckpointData.synchronized {
      taskBinaryBytes = stage match {
        case stage: ShuffleMapStage =>
          JavaUtils.bufferToArray(
            closureSerializer.serialize((stage.rdd, stage.shuffleDep): AnyRef))
        case stage: ResultStage =>
          JavaUtils.bufferToArray(closureSerializer.serialize((stage.rdd, stage.func): AnyRef))
      }
      partitions = stage.rdd.partitions
    }
    taskBinary = sc.broadcast(taskBinaryBytes)
} catch {
// In the case of a failure during serialization, abort the stage.
    case e: NotSerializableException =>
      abortStage(stage, "Task not serializable: " + e.toString, Some(e))
      runningStages -= stage
      // Abort execution
      return
    case NonFatal(e) =>
      abortStage(stage, s"Task serialization failed: $e\n${Utils.exceptionString(e)}", Some(e))
      runningStages -= stage
      return
}
  // Create the right number of Tasks for the stage.
  val tasks: Seq[Task[_]] = try {
    val serializedTaskMetrics = closureSerializer.serialize(stage.latestInfo.taskMetrics).array()
    stage match {
      // 1. Create one Task per partition.
      // 2. Compute the best locations for each Task.
      case stage: ShuffleMapStage =>
        stage.pendingPartitions.clear()
        partitionsToCompute.map { id =>
          val locs = taskIdToLocations(id) // the key piece underneath: the getPreferredLocs best-location algorithm
          val part = partitions(id)
          stage.pendingPartitions += id
          // 3. Create a ShuffleMapTask.
          new ShuffleMapTask(stage.id, stage.latestInfo.attemptNumber,
            taskBinary, part, locs, properties, serializedTaskMetrics, Option(jobId),
            Option(sc.applicationId), sc.applicationAttemptId)
        }
      case stage: ResultStage =>
        partitionsToCompute.map { id =>
          val p: Int = stage.partitions(id)
          val part = partitions(p)
          val locs = taskIdToLocations(id)
          // 4. Create a ResultTask.
          new ResultTask(stage.id, stage.latestInfo.attemptNumber,
            taskBinary, part, locs, id, properties, serializedTaskMetrics,
            Option(jobId), Option(sc.applicationId), sc.applicationAttemptId)
        }
    }
} catch {
    case NonFatal(e) =>
      abortStage(stage, s"Task creation failed: $e\n${Utils.exceptionString(e)}", Some(e))
      runningStages -= stage
      return
}
  if (tasks.size > 0) {
    logInfo(s"Submitting ${tasks.size} missing tasks from $stage (${stage.rdd}) (first 15 " +
      s"tasks are for partitions ${tasks.take(15).map(_.partitionId)})")
    // Create a TaskSet from this Stage's Tasks and submit it via TaskScheduler.submitTasks().
    // By default (standalone mode) the implementation is TaskSchedulerImpl; TaskScheduler itself is just a trait.
    taskScheduler.submitTasks(new TaskSet(
      tasks.toArray, stage.id, stage.latestInfo.attemptNumber, jobId, properties))
  } else {
    // Because we posted SparkListenerStageSubmitted earlier, we should mark
    // the stage as completed here in case there are no tasks to run
    markStageAsFinished(stage, None)
    val debugString = stage match {
      case stage: ShuffleMapStage =>
        s"Stage ${stage} is actually done; " +
        s"(available: ${stage.isAvailable}," +
        s"available outputs: ${stage.numAvailableOutputs}," +
        s"partitions: ${stage.numPartitions})"
      case stage: ResultStage =>
        s"Stage ${stage} is actually done; (partitions: ${stage.numPartitions})"
    }
    logDebug(debugString)
    submitWaitingChildStages(stage)
  }
}
/**
* Recursive implementation for getPreferredLocs.
*
* This method is thread-safe because it only accesses DAGScheduler state through thread-safe
* methods (getCacheLocs()); please be careful when modifying this method, because any new
* DAGScheduler state accessed by it may require additional synchronization.
*
 * A Task's best location is simply the location of the cached / checkpointed partition: the Task runs on that node and does not need to recompute the upstream RDDs.
*/
private def getPreferredLocsInternal(
    rdd: RDD[_],
    partition: Int,
    visited: HashSet[(RDD[_], Int)]): Seq[TaskLocation] = {
  // If the partition has already been visited, no need to re-visit.
  // This avoids exponential path exploration. SPARK-695
  if (!visited.add((rdd, partition))) {
    // Nil has already been returned for previously visited partitions.
    return Nil
  }
  // If the partition is cached, return the cache locations
  val cached = getCacheLocs(rdd)(partition)
  if (cached.nonEmpty) {
    return cached
  }
  // If the RDD has some placement preferences (as is the case for input RDDs), get those
  // (this is also where checkpoint locations surface).
  val rddPrefs = rdd.preferredLocations(rdd.partitions(partition)).toList
  if (rddPrefs.nonEmpty) {
    return rddPrefs.map(TaskLocation(_))
  }
  // If the RDD has narrow dependencies, pick the first partition of the first narrow dependency
  // that has any placement preferences. Ideally we would choose based on transfer sizes,
  // but this will do for now.
  rdd.dependencies.foreach {
    case n: NarrowDependency[_] =>
      for (inPart <- n.getParents(partition)) {
        val locs = getPreferredLocsInternal(n.rdd, inPart, visited)
        if (locs != Nil) {
          return locs
        }
      }
    case _ =>
  }
  // If, walking backwards from the stage's last RDD to its first, nothing is cached or
  // checkpointed, the Task's best locations (preferredLocs) are Nil and the TaskScheduler
  // must place the Task itself.
  Nil
}
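A small driver-side illustration of the same lookup order, using the public preferredLocations API (a sketch; sc and the input path come from the wordCount example):

// Inspect where Spark would prefer to run the task for partition 0 of an RDD.
val rdd = sc.textFile("hdfs:///tmp/input.txt")
val part0 = rdd.partitions(0)
// For a HadoopRDD this returns the HDFS block locations of partition 0;
// after rdd.cache() and a first computation, the scheduler's getPreferredLocs
// would instead resolve to the executors holding the cached blocks.
println(rdd.preferredLocations(part0))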
[3] Tracking and recording Tasks
[4] Spark's underlying core component: BlockManagerMaster, which manages in-memory data caching