Preface
The main goal of this article is to walk through the source-code logic of the key steps in SparkContext initialization. Instantiating a SparkContext is a fairly involved process: it initializes SparkEnv, spark-history (event logging), heartbeat detection, status tracking, broadcasting, resource and task scheduling, the underlying communication layer, and more. Concretely, this article aims to explain how Spark delivers Tasks to Executors when a job is submitted, and to cover several key steps of SparkContext initialization.
Overview of the Spark Task Submission Flow
Flow Description
- After SparkSubmit submits the application, the runtime environment is prepared (resources, SparkContext initialization, and so on), and the Spark Job is then submitted for execution;
- When an action operator (such as reduce or collect) is encountered, a Job is submitted; operators of this kind always end up calling the runJob() method and handing the Job to the DAGScheduler;
- The DAGSchedulerEventProcessLoop (the event-processing loop) then handles the submitted Job and recursively splits it into Stages, submitting the resulting tasks to the TaskScheduler (in practice its subclass TaskSchedulerImpl);
- At this point the Tasks have already been carved out and are ready to be handed to Executors; but they cannot be sent to the Executors directly, which is where the SchedulerBackend comes in to distribute the Tasks to the individual Executors;
- The SchedulerBackend (in practice its subclass CoarseGrainedSchedulerBackend) submits the Tasks to the Executors via launchTasks;
- When the ExecutorBackend (in practice its subclass CoarseGrainedExecutorBackend) receives a Task, it hands it to its Executor;
- The Executor starts running the Task as soon as it receives it. A minimal driver program that triggers this whole chain is sketched right after this list.
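To make the chain above concrete, here is a minimal, hypothetical driver program (the object name and app name are made up for illustration). Calling the reduce action is what triggers sc.runJob and, from there, the DAGScheduler / TaskScheduler / SchedulerBackend path traced in the source excerpts below.
import org.apache.spark.{SparkConf, SparkContext}

// Hypothetical example: any action (here reduce) triggers sc.runJob under the hood.
object SubmitFlowExample {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("submit-flow-example")
    val sc = new SparkContext(conf)   // SparkContext initialization (second half of this article)

    val rdd = sc.parallelize(1 to 100, numSlices = 4)

    // reduce is an action: RDD.reduce -> sc.runJob -> DAGScheduler.runJob
    //   -> TaskSchedulerImpl.submitTasks -> CoarseGrainedSchedulerBackend.launchTasks
    val sum = rdd.reduce(_ + _)
    println(s"sum = $sum")

    sc.stop()
  }
}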
Relevant Source Code
Running the Job
org.apache.spark.rdd.RDD
// Using the reduce operator as an example
def reduce(f: (T, T) => T): T = withScope {
val cleanF = sc.clean(f)
................. code omitted ....................
// Submit the Job through SparkContext
sc.runJob(this, reducePartition, mergeResult)
// Get the final result out of our Option, or throw an exception if the RDD was empty
jobResult.getOrElse(throw new UnsupportedOperationException("empty collection"))
}
org.apache.spark.SparkContext
def runJob[T, U: ClassTag](
rdd: RDD[T],
processPartition: Iterator[T] => U,
resultHandler: (Int, U) => Unit)
{
val processFunc = (context: TaskContext, iter: Iterator[T]) => processPartition(iter)
runJob[T, U](rdd, processFunc, 0 until rdd.partitions.length, resultHandler)
}
org.apache.spark.SparkContext
def runJob[T, U: ClassTag](
rdd: RDD[T],
func: (TaskContext, Iterator[T]) => U,
partitions: Seq[Int],
resultHandler: (Int, U) => Unit): Unit = {
................. code omitted ....................
// Hand the Job to the DAGScheduler, which splits it into Stages
dagScheduler.runJob(rdd, cleanedFunc, partitions, callSite, resultHandler, localProperties.get)
progressBar.foreach(_.finishAll())
rdd.doCheckpoint()
}
Submitting Stages
org.apache.spark.scheduler.DAGScheduler
def runJob[T, U](
rdd: RDD[T],
func: (TaskContext, Iterator[T]) => U,
partitions: Seq[Int],
callSite: CallSite,
resultHandler: (Int, U) => Unit,
properties: Properties): Unit = {
val start = System.nanoTime
// Submit the Job
val waiter = submitJob(rdd, func, partitions, callSite, resultHandler, properties)
................. code omitted ....................
}
}
org.apache.spark.scheduler.DAGScheduler
def submitJob[T, U](
rdd: RDD[T],
func: (TaskContext, Iterator[T]) => U,
partitions: Seq[Int],
callSite: CallSite,
resultHandler: (Int, U) => Unit,
properties: Properties): JobWaiter[U] = {
................. code omitted ....................
assert(partitions.size > 0)
val func2 = func.asInstanceOf[(TaskContext, Iterator[_]) => _]
val waiter = new JobWaiter(this, jobId, partitions.size, resultHandler)
// Post the Job to the event-processing loop
eventProcessLoop.post(JobSubmitted(
jobId, rdd, func2, partitions.toArray, callSite, waiter,
SerializationUtils.clone(properties)))
waiter
}
org.apache.spark.scheduler.DAGSchedulerEventProcessLoop
/**
* @Author: Small_Ran
* @Date: 2022/6/1
* @Description: Decide what to do based on the type of event sent from the driver
*/
private def doOnReceive(event: DAGSchedulerEvent): Unit = event match {
// The JobSubmitted event type
case JobSubmitted(jobId, rdd, func, partitions, callSite, listener, properties) =>
dagScheduler.handleJobSubmitted(jobId, rdd, func, partitions, callSite, listener, properties)
................. code omitted ....................
}
org.apache.spark.scheduler.DAGScheduler
private[scheduler] def handleJobSubmitted(jobId: Int,
finalRDD: RDD[_],
func: (TaskContext, Iterator[_]) => _,
partitions: Array[Int],
callSite: CallSite,
listener: JobListener,
properties: Properties) {
................. code omitted ....................
val jobSubmissionTime = clock.getTimeMillis()
jobIdToActiveJob(jobId) = job
activeJobs += job
finalStage.setActiveJob(job)
val stageIds = jobIdToStageIds(jobId).toArray
val stageInfos = stageIds.flatMap(id => stageIdToStage.get(id).map(_.latestInfo))
listenerBus.post(
SparkListenerJobStart(job.jobId, jobSubmissionTime, stageInfos, properties))
// Submit the Stage
submitStage(finalStage)
}
Dispatching Tasks to the ExecutorBackend
org.apache.spark.scheduler.DAGScheduler
private def submitStage(stage: Stage) {
val jobId = activeJobForStage(stage)
if (jobId.isDefined) {
logDebug("submitStage(" + stage + ")")
if (!waitingStages(stage) && !runningStages(stage) && !failedStages(stage)) {
val missing = getMissingParentStages(stage).sortBy(_.id)
logDebug("missing: " + missing)
if (missing.isEmpty) {
logInfo("Submitting " + stage + " (" + stage.rdd + "), which has no missing parents")
// Submit the stage's missing tasks
submitMissingTasks(stage, jobId.get)
} else {
for (parent <- missing) {
submitStage(parent)
}
waitingStages += stage
}
}
} else {
abortStage(stage, "No active job for stage " + stage.id, None)
}
}
org.apache.spark.scheduler.DAGScheduler
private def submitMissingTasks(stage: Stage, jobId: Int) {
................. code omitted ....................
if (tasks.size > 0) {
logInfo(s"Submitting ${tasks.size} missing tasks from $stage (${stage.rdd}) (first 15 " +
s"tasks are for partitions ${tasks.take(15).map(_.partitionId)})")
// Submit the tasks to the TaskScheduler
taskScheduler.submitTasks(new TaskSet(
tasks.toArray, stage.id, stage.latestInfo.attemptId, jobId, properties))
stage.latestInfo.submissionTime = Some(clock.getTimeMillis())
} else {
................. code omitted ....................
}
logDebug(debugString)
submitWaitingChildStages(stage)
}
org.apache.spark.scheduler.TaskSchedulerImpl
override def submitTasks(taskSet: TaskSet) {
val tasks = taskSet.tasks
logInfo("Adding task set " + taskSet.id + " with " + tasks.length + " tasks")
................. code omitted ....................
// Get ready to assign the tasks to Executors through the SchedulerBackend
backend.reviveOffers()
}
org.apache.spark.scheduler.cluster.CoarseGrainedSchedulerBackend.DriverEndpoint
private def makeOffers() {
// Make sure no executor is killed while some task is launching on it
val taskDescs = CoarseGrainedSchedulerBackend.this.synchronized {
// Filter out executors under killing
val activeExecutors = executorDataMap.filterKeys(executorIsAlive)
val workOffers = activeExecutors.map { case (id, executorData) =>
new WorkerOffer(id, executorData.executorHost, executorData.freeCores)
}.toIndexedSeq
scheduler.resourceOffers(workOffers)
}
if (!taskDescs.isEmpty) {
// Launch the tasks
launchTasks(taskDescs)
}
}
org.apache.spark.scheduler.cluster.CoarseGrainedSchedulerBackend.DriverEndpoint
private def launchTasks(tasks: Seq[Seq[TaskDescription]]) {
for (task <- tasks.flatten) {
val serializedTask = TaskDescription.encode(task)
if (serializedTask.limit >= maxRpcMessageSize) {
................. code omitted ....................
}
else {
val executorData = executorDataMap(task.executorId)
executorData.freeCores -= scheduler.CPUS_PER_TASK
logDebug(s"Launching task ${task.taskId} on executor id: ${task.executorId} hostname: " +
s"${executorData.executorHost}.")
// Send the task to the ExecutorBackend
executorData.executorEndpoint.send(LaunchTask(new SerializableBuffer(serializedTask)))
}
}
}
The Executor Runs the Task
org.apache.spark.executor.CoarseGrainedExecutorBackend
override def receive: PartialFunction[Any, Unit] = {
// Message confirming that the (reverse) registration of the Executor succeeded
case RegisteredExecutor =>
................. code omitted ....................
// Message asking the Executor to launch a task
case LaunchTask(data) =>
if (executor == null) {
exitExecutor(1, "Received LaunchTask command but executor was null")
} else {
val taskDesc = TaskDescription.decode(data.value)
logInfo("Got assigned task " + taskDesc.taskId)
// The Executor starts running the task
executor.launchTask(this, taskDesc)
}
................. code omitted ....................
}
The SparkContext Initialization Process
Source Code Walkthrough of the Process
Creating the Job Runtime Environment: SparkEnv
The main purpose of creating SparkEnv is to initialize the various runtime components, including the ShuffleManager (e.g. SortShuffleManager), the MemoryManager (unified or legacy static, chosen via spark.memory.useLegacyMode), the BroadcastManager, and so on. A small configuration sketch follows the excerpt below.
org.apache.spark.SparkContext
_env = createSparkEnv(_conf, isLocal, listenerBus)
SparkEnv.set(_env)
org.apache.spark.SparkEnv
private def create(
conf: SparkConf,
executorId: String,
bindAddress: String,
advertiseAddress: String,
port: Int,
isLocal: Boolean,
numUsableCores: Int,
ioEncryptionKey: Option[Array[Byte]],
listenerBus: LiveListenerBus = null,
mockOutputCommitCoordinator: Option[OutputCommitCoordinator] = None): SparkEnv = {
................. code omitted ....................
// Initialize the RpcEnv (a NettyRpcEnv)
val systemName = if (isDriver) driverSystemName else executorSystemName
val rpcEnv = RpcEnv.create(systemName, bindAddress, advertiseAddress, port, conf,
securityManager, clientMode = !isDriver)
................. code omitted ....................
// Initialize the SerializerManager (manages serialization/deserialization)
val serializerManager = new SerializerManager(serializer, conf, ioEncryptionKey)
................. code omitted ....................
// Initialize the BroadcastManager
val broadcastManager = new BroadcastManager(isDriver, conf, securityManager)
................. code omitted ....................
// Initialize the ShuffleManager
// Let the user specify short names for shuffle managers
val shortShuffleMgrNames = Map(
"sort" -> classOf[org.apache.spark.shuffle.sort.SortShuffleManager].getName,
"tungsten-sort" -> classOf[org.apache.spark.shuffle.sort.SortShuffleManager].getName)
val shuffleMgrName = conf.get("spark.shuffle.manager", "sort")
val shuffleMgrClass =
shortShuffleMgrNames.getOrElse(shuffleMgrName.toLowerCase(Locale.ROOT), shuffleMgrName)
val shuffleManager = instantiateClass[ShuffleManager](shuffleMgrClass)
// Initialize the MemoryManager: either UnifiedMemoryManager (unified memory model) or the alternative StaticMemoryManager (static memory model)
// If spark.memory.useLegacyMode is false (the default), UnifiedMemoryManager is used; otherwise StaticMemoryManager
val useLegacyMemoryManager = conf.getBoolean("spark.memory.useLegacyMode", false)
val memoryManager: MemoryManager =
if (useLegacyMemoryManager) {
new StaticMemoryManager(conf, numUsableCores)
} else {
UnifiedMemoryManager(conf, numUsableCores)
}
................. code omitted ....................
}
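Several of the components created above are selected through plain configuration keys. A small, purely illustrative sketch (the values shown are the defaults read in SparkEnv.create()):
import org.apache.spark.SparkConf

// Illustrative only: these are the keys read in SparkEnv.create() above.
val conf = new SparkConf()
  // "sort" (the default) and "tungsten-sort" both map to SortShuffleManager;
  // a fully qualified class name of a custom ShuffleManager is also accepted.
  .set("spark.shuffle.manager", "sort")
  // false (the default) selects UnifiedMemoryManager; true selects the legacy StaticMemoryManager.
  .set("spark.memory.useLegacyMode", "false")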
Creating and Initializing the SparkUI
The main purpose of creating the SparkUI is that, once Jobs are running, their progress can be watched in real time through the web UI.
org.apache.spark.SparkContext
_ui =
if (conf.getBoolean("spark.ui.enabled", true)) {
Some(SparkUI.createLiveUI(this, _conf, listenerBus, _jobProgressListener,
_env.securityManager, appName, startTime = startTime))
} else {
// For tests, do not enable the UI
None
}
// Bind the UI before starting the task scheduler to communicate
// the bound port to the cluster manager properly
_ui.foreach(_.bind())
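As the snippet shows, the UI is only created when spark.ui.enabled is true. A couple of related keys, as a hedged sketch (the values are just the usual defaults):
import org.apache.spark.SparkConf

// spark.ui.enabled is the flag checked in the snippet above;
// spark.ui.port chooses the port the bound UI listens on (4040 by default).
val conf = new SparkConf()
  .set("spark.ui.enabled", "true")
  .set("spark.ui.port", "4040")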
Creating the Heartbeat Receiver
The HeartbeatReceiver is the driver-side endpoint that receives heartbeats from the Executors. It must be registered before createTaskScheduler is called, because Executors retrieve the HeartbeatReceiver endpoint in their constructors. A HeartbeatReceiver RpcEndpoint is registered with the RpcEnv and periodically sends itself an ExpireDeadHosts message (once a minute) to check whether each Executor is still heartbeating; if the time since an Executor's last heartbeat exceeds one minute, the CoarseGrainedSchedulerBackend is asked to kill that Executor.
org.apache.spark.SparkContext
_heartbeatReceiver = env.rpcEnv.setupEndpoint(
HeartbeatReceiver.ENDPOINT_NAME, new HeartbeatReceiver(this))
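The timing behaviour around heartbeats is configurable. A sketch of the keys that are usually tuned (the values shown are the commonly cited defaults; double-check them against your Spark version):
import org.apache.spark.SparkConf

// Commonly tuned heartbeat-related keys (defaults are approximate; verify for your version).
val conf = new SparkConf()
  .set("spark.executor.heartbeatInterval", "10s") // how often each Executor reports in to the driver
  .set("spark.network.timeout", "120s")           // upper bound before an Executor is treated as lost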
Creating the SchedulerBackend and the TaskScheduler
Creating the SchedulerBackend and the TaskScheduler is one of the most important parts of SparkContext initialization. The SchedulerBackend (communication backend) is responsible for communicating with the Executors, while the TaskScheduler (task scheduler) is responsible for scheduling the Job's tasks. Different submission modes create different SchedulerBackend and TaskScheduler implementations; this article focuses on Yarn-Cluster mode, in which they are YarnClusterSchedulerBackend and YarnClusterScheduler respectively. The code below shows why, and a short configuration sketch follows it.
org.apache.spark.SparkContext
// Create and start the scheduler
val (sched, ts) = SparkContext.createTaskScheduler(this, master, deployMode)
_schedulerBackend = sched
_taskScheduler = ts
org.apache.spark.SparkContext
private def createTaskScheduler(
sc: SparkContext,
master: String,
deployMode: String): (SchedulerBackend, TaskScheduler) = {
import SparkMasterRegex._
// When running locally, don't try to re-execute tasks on failure.
val MAX_LOCAL_TASK_FAILURES = 1
master match {
case "local" =>
................. code omitted ....................
case LOCAL_N_REGEX(threads) =>
................. code omitted ....................
case LOCAL_N_FAILURES_REGEX(threads, maxFailures) =>
................. code omitted ....................
case SPARK_REGEX(sparkUrl) =>
................. code omitted ....................
case LOCAL_CLUSTER_REGEX(numSlaves, coresPerSlave, memoryPerSlave) =>
................. code omitted ....................
// External cluster managers; three are currently supported (KubernetesClusterManager, YarnClusterManager, MesosClusterManager)
case masterUrl =>
// In our case this resolves to the YarnClusterManager
val cm = getClusterManager(masterUrl) match {
case Some(clusterMgr) => clusterMgr
case None => throw new SparkException("Could not parse Master URL: '" + master + "'")
}
try {
// This calls the createTaskScheduler and createSchedulerBackend methods of YarnClusterManager
val scheduler = cm.createTaskScheduler(sc, masterUrl)
val backend = cm.createSchedulerBackend(sc, masterUrl, scheduler)
cm.initialize(scheduler, backend)
(backend, scheduler)
} catch {
case se: SparkException => throw se
case NonFatal(e) =>
throw new SparkException("External scheduler cannot be instantiated", e)
}
}
}
org.apache.spark.scheduler.cluster.YarnClusterManager
override def createTaskScheduler(sc: SparkContext, masterURL: String): TaskScheduler = {
sc.deployMode match {
case "cluster" => new YarnClusterScheduler(sc)
case "client" => new YarnScheduler(sc)
case _ => throw new SparkException(s"Unknown deploy mode '${sc.deployMode}' for Yarn")
}
}
override def createSchedulerBackend(sc: SparkContext,
masterURL: String,
scheduler: TaskScheduler): SchedulerBackend = {
sc.deployMode match {
case "cluster" =>
new YarnClusterSchedulerBackend(scheduler.asInstanceOf[TaskSchedulerImpl], sc)
case "client" =>
new YarnClientSchedulerBackend(scheduler.asInstanceOf[TaskSchedulerImpl], sc)
case _ =>
throw new SparkException(s"Unknown deploy mode '${sc.deployMode}' for Yarn")
}
}
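Which branch runs above is decided entirely by the master URL and the deploy mode given at submission time. A hedged sketch of how Yarn-Cluster mode is expressed in configuration (equivalent to spark-submit --master yarn --deploy-mode cluster):
import org.apache.spark.SparkConf

// With master "yarn", createTaskScheduler falls through to the `case masterUrl` branch,
// getClusterManager resolves YarnClusterManager, and deployMode "cluster" selects
// YarnClusterScheduler + YarnClusterSchedulerBackend in the methods above.
val conf = new SparkConf()
  .setMaster("yarn")
  .set("spark.submit.deployMode", "cluster")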
Creating and Starting the DAGScheduler
Creating the DAGScheduler is another important part of SparkContext initialization; its main job is to split a Job into one or more Stages. Internally, the DAGScheduler first creates a DAGSchedulerEventProcessLoop (whose main role is to process scheduler events); later Job submissions, cancellations, and so on are all carried out through it. A stripped-down sketch of this event-loop pattern follows the source excerpts below.
org.apache.spark.SparkContext
_dagScheduler = new DAGScheduler(this)
_heartbeatReceiver.ask[Boolean](TaskSchedulerIsSet)
org.apache.spark.scheduler.DAGScheduler
/**
* @Author: Small_Ran
* @Date: 2022/6/1
* @Description: Initialize the event processor DAGSchedulerEventProcessLoop
* The main events include JobSubmitted and JobCancelled
*/
private[scheduler] val eventProcessLoop = new DAGSchedulerEventProcessLoop(this)
taskScheduler.setDAGScheduler(this)
................. code omitted ....................
// Start the event-processing loop
eventProcessLoop.start()
org.apache.spark.util.EventLoop
/**
* @Author: Small_Ran
* @Date: 2022/6/1
* @Description:
* 1. eventThread is dedicated to processing events such as JobSubmitted
* 2. eventThread.run is started when eventProcessLoop.start() is called
*/
private val eventThread = new Thread(name) {
setDaemon(true)
override def run(): Unit = {
try {
while (!stopped.get) {
// Take submitted events from the queue and process them in a loop
// sc.runJob will not be called to submit tasks before SparkContext initialization has completed
val event = eventQueue.take()
try {
// This goes straight to the onReceive method of org.apache.spark.scheduler.DAGSchedulerEventProcessLoop
onReceive(event)
................. code omitted ....................
}
}
def start(): Unit = {
if (stopped.get) {
throw new IllegalStateException(name + " has already been stopped")
}
// Call onStart before starting the event thread to make sure it happens before onReceive
onStart()
// This starts the eventThread, which invokes the run method above
eventThread.start()
}
org.apache.spark.scheduler.DAGSchedulerEventProcessLoop
/**
* @Author: Small_Ran
* @Date: 2022/6/1
* @Description: Decide what to do based on the type of event sent from the driver
*/
private def doOnReceive(event: DAGSchedulerEvent): Unit = event match {
// The JobSubmitted event type
case JobSubmitted(jobId, rdd, func, partitions, callSite, listener, properties) =>
dagScheduler.handleJobSubmitted(jobId, rdd, func, partitions, callSite, listener, properties)
................. code omitted ....................
}
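The EventLoop pattern itself is small: a blocking queue plus a daemon thread that drains it. The following is a hypothetical, stripped-down re-implementation (not Spark's class, just its shape) to make the mechanics explicit:
import java.util.concurrent.LinkedBlockingDeque
import java.util.concurrent.atomic.AtomicBoolean

// Hypothetical minimal event loop, mirroring the shape of org.apache.spark.util.EventLoop.
abstract class MiniEventLoop[E](name: String) {
  private val eventQueue = new LinkedBlockingDeque[E]()
  private val stopped = new AtomicBoolean(false)

  private val eventThread = new Thread(name) {
    setDaemon(true)
    override def run(): Unit = {
      while (!stopped.get) {
        val event = eventQueue.take()  // blocks until an event (e.g. a JobSubmitted analogue) arrives
        onReceive(event)               // dispatch, like DAGSchedulerEventProcessLoop.doOnReceive
      }
    }
  }

  def start(): Unit = eventThread.start()
  def stop(): Unit = stopped.set(true)
  def post(event: E): Unit = eventQueue.put(event)

  protected def onReceive(event: E): Unit
}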
Starting the TaskScheduler
The most important step in starting the TaskScheduler is creating the driverEndpoint (whose main role is communicating with the Executors). The driverEndpoint is created by the start() method of CoarseGrainedSchedulerBackend. As the task-submission flow above showed, CoarseGrainedSchedulerBackend is what establishes the connections to the Executors and sends Tasks to them for execution; the rough flow is walked through in the source below, and a small sketch of the periodic revive pattern follows it.
org.apache.spark.SparkContext
// YarnClusterScheduler -> YarnScheduler -> TaskSchedulerImpl
// so the start() that actually runs here is TaskSchedulerImpl's
_taskScheduler.start()
org.apache.spark.scheduler.TaskSchedulerImpl
override def start() {
// The backend here refers to YarnClusterSchedulerBackend
// YarnClusterSchedulerBackend -> YarnSchedulerBackend -> CoarseGrainedSchedulerBackend
// so in the end CoarseGrainedSchedulerBackend's start() is what gets executed
backend.start()
if (!isLocal && conf.getBoolean("spark.speculation", false)) {
logInfo("Starting speculative execution thread")
speculationScheduler.scheduleWithFixedDelay(new Runnable {
override def run(): Unit = Utils.tryOrStopSparkContext(sc) {
checkSpeculatableTasks()
}
}, SPECULATION_INTERVAL_MS, SPECULATION_INTERVAL_MS, TimeUnit.MILLISECONDS)
}
}
org.apache.spark.scheduler.cluster.CoarseGrainedSchedulerBackend
override def start() {
val properties = new ArrayBuffer[(String, String)]
for ((key, value) <- scheduler.sc.conf.getAll) {
if (key.startsWith("spark.")) {
properties += ((key, value))
}
}
// TODO (prashant) send conf instead of properties
// This is where the DriverEndpoint object is created
driverEndpoint = createDriverEndpointRef(properties)
}
org.apache.spark.scheduler.cluster.CoarseGrainedSchedulerBackend.DriverEndpoint
/**
* @Author: Small_Ran
* @Date: 2022/6/9
* @Description:
* 1. onStart() runs automatically once the endpoint instance has been initialized
* 2. receive() is where the core message handling happens
* 3. onStop() is called automatically before the instance is destroyed
*/
override def onStart() {
// Periodically revive offers to allow delay scheduling to work
val reviveIntervalMs = conf.getTimeAsMs("spark.scheduler.revive.interval", "1s")
reviveThread.scheduleAtFixedRate(new Runnable {
override def run(): Unit = Utils.tryLogNonFatalError {
Option(self).foreach(_.send(ReviveOffers))
}
}, 0, reviveIntervalMs, TimeUnit.MILLISECONDS)
}
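The periodic ReviveOffers loop in onStart() is driven by spark.scheduler.revive.interval (read in the snippet above, default "1s"). The same scheduling pattern, illustrated with a plain ScheduledExecutorService (hypothetical code, not Spark's):
import java.util.concurrent.{Executors, TimeUnit}

// Hypothetical sketch of the periodic-revive pattern used by DriverEndpoint.onStart().
val reviveIntervalMs = 1000L  // corresponds to spark.scheduler.revive.interval = "1s"
val reviveThread = Executors.newSingleThreadScheduledExecutor()

reviveThread.scheduleAtFixedRate(new Runnable {
  override def run(): Unit = {
    // In Spark this sends ReviveOffers to the DriverEndpoint itself,
    // which then calls makeOffers() and may launch pending tasks.
    println("revive offers")
  }
}, 0, reviveIntervalMs, TimeUnit.MILLISECONDS)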
How do we know that SparkContext initialization has completed?
When SparkContext initialization completes, the postStartHook method of YarnClusterScheduler is called, and that method notifies the ApplicationMaster that the SparkContext is ready.
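For reference, the override in question is short; roughly as follows (a paraphrased sketch of the yarn module, so verify it against your Spark version; imports from the yarn and scheduler packages are omitted):
// Paraphrased sketch of YarnClusterScheduler.postStartHook (verify against your Spark version).
private[spark] class YarnClusterScheduler(sc: SparkContext) extends YarnScheduler(sc) {
  override def postStartHook(): Unit = {
    // Tell the ApplicationMaster that the SparkContext (and hence the driver) is ready,
    // so it can go on to register and request Executor containers.
    ApplicationMaster.sparkContextInitialized(sc)
    super.postStartHook()
  }
}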