spark源码走读（2）

最新推荐文章于 2018-11-02 16:03:22 发布

rongyongfeikai2

最新推荐文章于 2018-11-02 16:03:22 发布

阅读量595

点赞数

分类专栏： BIGDATA

本文链接：https://blog.csdn.net/rongyongfeikai2/article/details/50561049

版权

BIGDATA 专栏收录该内容

57 篇文章 0 订阅

订阅专栏

sc = SparkContext(master,appName)
主要是在SparkContext类中，会按照传递的参数；初始化SparkConf对象
val arr = Array(1,2,3,4,5)
val arr1 = sc.parallelize(arr)
进入SparkContext的parallelize方法：
def parallelize[T: ClassTag](seq: Seq[T], numSlices: Int = defaultParallelism): RDD[T] = {
assertNotStopped()
new ParallelCollectionRDD[T](this, seq, numSlices, Map[Int, Seq[String]]())
}
初始化了一个ParallelCollectionRDD对象；ParallelCollectionRDD类是继承至RDD抽象类的；而RDD又继承至Serializable，Serializable继承至java.io.Serializable。这样就保证了RDD对象是可以序列化和反序列化的。

如果我们执行arr1.collect()
打印arr1中的所有内容，则进入了RDD的collect方法
可以看一下collect方法的定义：
def collect(): Array[T] = {
val results = sc.runJob(this, (iter: Iterator[T]) => iter.toArray)
Array.concat(results: _*)
}

直接进入的是SparkContext的runJob方法（第1450行）：
def runJob[T, U: ClassTag](
rdd: RDD[T],
func: (TaskContext, Iterator[T]) => U,
partitions: Seq[Int],
allowLocal: Boolean,
resultHandler: (Int, U) => Unit) {
if (stopped) {
throw new IllegalStateException("SparkContext has been shutdown")
}
//获得用户代码的类的名字
val callSite = getCallSite
//清理闭包
val cleanedFunc = clean(func)
//打印job相关信息
logInfo("Starting job: " + callSite.shortForm)
if (conf.getBoolean("spark.logLineage", false)) {
logInfo("RDD's recursive dependencies:\n" + rdd.toDebugString)
}
//dagSchedule执行job
dagScheduler.runJob(rdd, cleanedFunc, partitions, callSite, allowLocal,
resultHandler, localProperties.get)
progressBar.foreach(_.finishAll())
rdd.doCheckpoint()
}

dagScheduler.runJob方法：
def runJob[T, U: ClassTag](
rdd: RDD[T],
func: (TaskContext, Iterator[T]) => U,
partitions: Seq[Int],
callSite: CallSite,
allowLocal: Boolean,
resultHandler: (Int, U) => Unit,
properties: Properties = null)
{
val start = System.nanoTime
//提交job
val waiter = submitJob(rdd, func, partitions, callSite, allowLocal, resultHandler, properties)
//等待job执行结果
waiter.awaitResult() match {
case JobSucceeded => {
logInfo("Job %d finished: %s, took %f s".format
(waiter.jobId, callSite.shortForm, (System.nanoTime - start) / 1e9))
}
case JobFailed(exception: Exception) =>
logInfo("Job %d failed: %s, took %f s".format
(waiter.jobId, callSite.shortForm, (System.nanoTime - start) / 1e9))
throw exception
}
}

再进入submitJob方法看看：
def submitJob[T, U](
rdd: RDD[T],
func: (TaskContext, Iterator[T]) => U,
partitions: Seq[Int],
callSite: CallSite,
allowLocal: Boolean,
resultHandler: (Int, U) => Unit,
properties: Properties = null): JobWaiter[U] =
{
// Check to make sure we are not launching a task on a partition that does not exist.
val maxPartitions = rdd.partitions.length
partitions.find(p => p >= maxPartitions || p < 0).foreach { p =>
throw new IllegalArgumentException(
"Attempting to access a non-existent partition: " + p + ". " +
"Total number of partitions: " + maxPartitions)
}

val jobId = nextJobId.getAndIncrement()
if (partitions.size == 0) {
return new JobWaiter[U](this, jobId, 0, resultHandler)
}

assert(partitions.size > 0)
val func2 = func.asInstanceOf[(TaskContext, Iterator[_]) => _]
val waiter = new JobWaiter(this, jobId, partitions.size, resultHandler)
//最关键的一句，利用eventProcessLoop提交Job
eventProcessLoop.post(JobSubmitted(
jobId, rdd, func2, partitions.toArray, allowLocal, callSite, waiter, properties))
waiter
}

eventProcessLoop是DAGSchedulerEventProcessLoop的对象，而DAGSchedulerEventProcessLoop继承至EventLoop
EventLoop中有一个线程：
private val eventThread = new Thread(name) {
setDaemon(true)

override def run(): Unit = {
try {
while (!stopped.get) {
val event = eventQueue.take()
try {
onReceive(event)
} catch {
case NonFatal(e) => {
try {
onError(e)
} catch {
case NonFatal(e) => logError("Unexpected error in " + name, e)
}
}
}
}
} catch {
case ie: InterruptedException => // exit even if eventQueue is not empty
case NonFatal(e) => logError("Unexpected error in " + name, e)
}
}
}

看起来应该是不断的从eventQueue中取得event，并且提交给onReceive方法。
onReceive方法取得了event对象后，判断它是JobSubmitted event，就触发了：
dagScheduler.handleJobSubmitted(jobId, rdd, func, partitions, allowLocal, callSite,
listener, properties)
方法。

下面来看看handlerJobSubmitted方法（DAGSchedule.scala）：
private[scheduler] def handleJobSubmitted(jobId: Int,
finalRDD: RDD[_],
func: (TaskContext, Iterator[_]) => _,
partitions: Array[Int],
allowLocal: Boolean,
callSite: CallSite,
listener: JobListener,
properties: Properties = null)
{
var finalStage: Stage = null
try {
// New stage creation may throw an exception if, for example, jobs are run on a
// HadoopRDD whose underlying HDFS files have been deleted.
finalStage = newStage(finalRDD, partitions.size, None, jobId, callSite)
} catch {
case e: Exception =>
logWarning("Creating new stage failed due to exception - job: " + jobId, e)
listener.jobFailed(e)
return
}
if (finalStage != null) {
val job = new ActiveJob(jobId, finalStage, func, partitions, callSite, listener, properties)
clearCacheLocs()
logInfo("Got job %s (%s) with %d output partitions (allowLocal=%s)".format(
job.jobId, callSite.shortForm, partitions.length, allowLocal))
logInfo("Final stage: " + finalStage + "(" + finalStage.name + ")")
logInfo("Parents of final stage: " + finalStage.parents)
logInfo("Missing parents: " + getMissingParentStages(finalStage))
val shouldRunLocally =
localExecutionEnabled && allowLocal && finalStage.parents.isEmpty && partitions.length == 1
val jobSubmissionTime = clock.getTimeMillis()
if (shouldRunLocally) {
// Compute very short actions like first() or take() with no parent stages locally.
listenerBus.post(
SparkListenerJobStart(job.jobId, jobSubmissionTime, Seq.empty, properties))
runLocally(job)
} else {
jobIdToActiveJob(jobId) = job
activeJobs += job
finalStage.resultOfJob = Some(job)
val stageIds = jobIdToStageIds(jobId).toArray
val stageInfos = stageIds.flatMap(id => stageIdToStage.get(id).map(_.latestInfo))
listenerBus.post(
SparkListenerJobStart(job.jobId, jobSubmissionTime, stageInfos, properties))
submitStage(finalStage)
}
}
submitWaitingStages()
}

rongyongfeikai2

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
spark源码走读（2）

sc = SparkContext(master,appName)主要是在SparkContext类中，会按照传递的参数；初始化SparkConf对象val arr = Array(1,2,3,4,5)val arr1 = sc.parallelize(arr)进入SparkContext的parallelize方法：def parallelize[T: ClassTag](s
复制链接

扫一扫