1.宏观概览
- Application
- spark-submit
- Driver
- SparkContext
- Master
- Worker
- Executor
- Job
- DAGScheduler
- TaskScheduler
- ShuffleMap Task
- Result Task
2.窄依赖与宽依赖
(1)窄依赖: 一个RDD,对它的“父”RDD, 只有简单的一对一依赖关系。 即RDD的每个partition,仅仅依赖于“父”RDD中的一个partition。
(2)宽依赖: 一个RDD,对它的“父”RDD, 是一对多/多对多依赖关系。 即RDD的每个partition,依赖于“父”RDD中的多个partition(每一个“父”RDD中的partition的数据,都可能传输一部分到下一个RDD的每个partition中)。
3.基于Yarn的两种提交模式
(1)Spark三种提交模式
[1] standalone模式,基于Spark的Master-Worker集群
[2] 基于yarn-cluster模式
[3] 基于yarn-client模式
(2)如何切换至基于yarn的模式?
将用于提交Spark应用程序的spark-submit脚本,加上--master参数,设置为yarn-cluster或yarn-client。如果没有设置,则默认为standalone模式。
(3)yarn-cluster
(4)yarn-client
(5)说明
[1] yarn-client只用于测试。因为Driver进程固定运行在本地,负责调度Application,会与yarn集群产生超大量的网络通信,从而导致网卡流量激增。好处在于,直接执行时,在本地可以看到所有log。
[2] yarn-cluster用于生产环境。因为Driver运行在NodeManager,不存在流量激增的问题。用集群中的随机节点代替本地固定节点运行ApplicationMaster进行调度。缺点在于调试不方便,本地用spark-submit提交后看不到log,只能通过 yarn logs -applicationId {APPLICATION_ID} 命令查看。
[3] 要使用yarn模式,需要在conf/spark-env.sh中,export HADOOP_HOME=… …
4. SparkContext
(1)初始化机制:三大核心部件
(2)源码分析
//createTaskScheduler()中的一个分支,常用的提交模式中的standalone
case SPARK_REGEX(sparkUrl) =>
// Build the scheduler first; the backend constructed below takes it as an argument.
val scheduler = new TaskSchedulerImpl (sc )
// Split the comma-separated list and prefix every entry with "spark://".
val masterUrls = sparkUrl.split( ",").map("spark://" + _)
val backend = new StandaloneSchedulerBackend (scheduler , sc , masterUrls )
scheduler.initialize (backend) // TaskScheduler creation is complete
// Hand both components back to the caller as a pair.
( backend, scheduler)
//TaskSchedulerImpl 源码注释
/**
* Schedules tasks for multiple types of clusters by acting through a SchedulerBackend.
* It can also work with a local setup by using a `LocalSchedulerBackend` and setting
* isLocal to true. It handles common logic, like determining a scheduling order across jobs, waking
* up to launch speculative tasks(推测任务), etc.
*
* Clients should first call initialize() and start(), then submit task sets through the
* runTasks method.
*/
//initialize()
/**
 * Attaches the scheduler backend and builds the scheduling pools.
 *
 * The pool builder is chosen from `schedulingMode`: FIFO and FAIR are supported, anything
 * else is rejected.
 *
 * @param backend the backend stored in `this.backend`
 * @throws IllegalArgumentException if `schedulingMode` is neither FIFO nor FAIR
 */
def initialize(backend: SchedulerBackend): Unit = {
  this.backend = backend
  // `match` is an expression, so its result is assigned directly.
  schedulableBuilder = schedulingMode match {
    case SchedulingMode.FIFO =>
      new FIFOSchedulableBuilder(rootPool)
    case SchedulingMode.FAIR =>
      new FairSchedulableBuilder(rootPool, conf)
    case _ =>
      throw new IllegalArgumentException(
        s"Unsupported $SCHEDULER_MODE_PROPERTY: $schedulingMode")
  }
  schedulableBuilder.buildPools()
}
//start()
/**
 * Starts the backend and, when speculation is enabled on a non-local scheduler, schedules
 * a recurring speculation check.
 */
override def start(): Unit = {
  // The essential step: everything else hangs off the backend being started.
  backend.start()

  val speculationEnabled = !isLocal && conf.getBoolean("spark.speculation", false)
  if (speculationEnabled) {
    logInfo("Starting speculative execution thread")
    val speculationCheck = new Runnable {
      override def run(): Unit = Utils.tryOrStopSparkContext(sc) {
        checkSpeculatableTasks()
      }
    }
    // The same interval is used for the initial delay and for the period between runs.
    speculationScheduler.scheduleWithFixedDelay(
      speculationCheck,
      SPECULATION_INTERVAL_MS,
      SPECULATION_INTERVAL_MS,
      TimeUnit.MILLISECONDS)
  }
}
//描述了当前执行的自定义应用 Application ※非常重要
/**
 * Immutable description of an application.
 *
 * `user` defaults to the `user.name` system property, falling back to "<unknown>".
 * `toString` prints only the application name.
 */
private[spark] case class ApplicationDescription(
    name: String,
    // Per the original note: corresponds to num-executors * executor-cores in the submit script.
    maxCores: Option[Int],
    // Per the original note: corresponds to executor-memory in the submit script.
    memoryPerExecutorMB: Int,
    command: Command,
    appUiUrl: String,
    eventLogDir: Option[URI] = None,
    // short name of compression codec used when writing event logs, if any (e.g. lzf)
    eventLogCodec: Option[String] = None,
    coresPerExecutor: Option[Int] = None,
    // number of executors this application wants to start with,
    // only used if dynamic allocation is enabled
    initialExecutorLimit: Option[Int] = None,
    user: String = System.getProperty("user.name", "<unknown>")) {

  override def toString: String = "ApplicationDescription(" + name + ")"
}
/**
* Interface allowing applications to speak with a Spark standalone cluster manager.
*
* Takes a master URL, an app description, and a listener for cluster events, and calls
* back the listener when various events occur.
*
* @param masterUrls Each url should look like spark://host:port.
*/
private[spark] class StandaloneAppClient(
rpcEnv: RpcEnv,
masterUrls: Array[ String],
appDescription: ApplicationDescription,
listener: StandaloneAppClientListener,
conf: SparkConf)
extends Logging
//...
/**
 * Kicks off an asynchronous registration attempt against all masters and schedules a
 * follow-up check REGISTRATION_TIMEOUT_SECONDS later.
 *
 * When the check fires it does one of three things:
 *  - already registered: cancel the outstanding futures and shut the registration pool down;
 *  - not registered and `nthRetry` has reached REGISTRATION_RETRIES: mark the client dead;
 *  - otherwise: cancel the outstanding futures and start attempt `nthRetry + 1`.
 *
 * @param nthRetry ordinal of this registration attempt
 */
private def registerWithMaster(nthRetry: Int): Unit = {
  registerMasterFutures.set(tryRegisterAllMasters())

  val followUp = new Runnable {
    override def run(): Unit = {
      val alreadyRegistered = registered.get
      if (!alreadyRegistered && nthRetry >= REGISTRATION_RETRIES) {
        markDead("All masters are unresponsive! Giving up.")
      } else {
        // Both remaining outcomes begin by cancelling the in-flight attempts.
        registerMasterFutures.get.foreach(_.cancel(true))
        if (alreadyRegistered) {
          registerMasterThreadPool.shutdownNow()
        } else {
          registerWithMaster(nthRetry + 1)
        }
      }
    }
  }

  registrationRetryTimer.set(
    registrationRetryThread.schedule(followUp, REGISTRATION_TIMEOUT_SECONDS, TimeUnit.SECONDS))
}
//...
/**
 * Submits one registration task per master address to the registration thread pool.
 *
 * Each task does nothing if the client is already registered; otherwise it resolves the
 * master endpoint and sends it a `RegisterApplication` message. Interruption is treated as
 * cancellation; other non-fatal failures are logged as warnings.
 *
 * @return the futures of the submitted tasks, kept so they can be cancelled later
 */
private def tryRegisterAllMasters(): Array[JFuture[_]] = {
  masterRpcAddresses.map { masterAddress =>
    registerMasterThreadPool.submit(new Runnable {
      override def run(): Unit = try {
        if (!registered.get) {
          logInfo("Connecting to master " + masterAddress.toSparkURL + "...")
          val masterRef = rpcEnv.setupEndpointRef(masterAddress, Master.ENDPOINT_NAME)
          masterRef.send(RegisterApplication(appDescription, self))
        }
      } catch {
        case ie: InterruptedException => // Cancelled
        case NonFatal(e) => logWarning(s"Failed to connect to master $masterAddress", e)
      }
    })
  }
}
/**
* The high-level scheduling layer that implements stage-oriented scheduling. It computes a DAG(有向无环图) of
* stages for each job, keeps track of which RDDs and stage outputs are materialized(写入磁盘/内存), and finds a
* minimal schedule to run the job. It then submits stages as TaskSets to an underlying
* TaskScheduler implementation that runs them on the cluster. A TaskSet contains fully independent
* tasks that can run right away based on the data that's already on the cluster (e.g. map output
* files from previous stages), though it may fail if this data becomes unavailable.
*
* Spark stages are created by breaking the RDD graph at shuffle boundaries. RDD operations with
* "narrow" dependencies, like map() and filter(), are pipelined together into one set of tasks
* in each stage, but operations with shuffle dependencies require multiple stages (one to write a
* set of map output files, and another to read those files after a barrier). In the end, every
* stage will have only shuffle dependencies on other stages, and may compute multiple operations
* inside it. The actual pipelining of these operations happens in the RDD.compute() functions of
* various RDDs
*
* In addition to coming up with a DAG of stages, the DAGScheduler also determines the preferred
* locations to run each task on, based on the current cache status, and passes these to the
* low-level TaskScheduler. Furthermore, it handles failures due to shuffle output files being
* lost, in which case old stages may need to be resubmitted. Failures *within* a stage that are
* not caused by shuffle file loss are handled by the TaskScheduler, which will retry each task
* a small number of times before cancelling the whole stage.
*
* When looking through this code, there are several key concepts:
*
* - Jobs (represented by [[ActiveJob]]) are the top-level work items submitted to the scheduler.
* For example, when the user calls an action, like count(), a job will be submitted through
* submitJob. Each Job may require the execution of multiple stages to build intermediate data.
*
* - Stages ([[Stage]]) are sets of tasks that compute intermediate results in jobs, where each
* task computes the same function on partitions of the same RDD. Stages are separated at shuffle
* boundaries, which introduce a barrier (where we must wait for the previous stage to finish to
* fetch outputs). There are two types of stages: [[ResultStage]], for the final stage that
* executes an action, and [[ShuffleMapStage]], which writes map output files for a shuffle.
* Stages are often shared across multiple jobs, if these jobs reuse the same RDDs.
*
* - Tasks are individual units of work, each sent to one machine.
*
* - Cache tracking: the DAGScheduler figures out which RDDs are cached to avoid recomputing them
* and likewise remembers which shuffle map stages have already produced output files to avoid
* redoing the map side of a shuffle.
*
* - Preferred locations: the DAGScheduler also computes where to run each task in a stage based
* on the preferred locations of its underlying RDDs, or the location of cached or shuffle data.
*
* - Cleanup: all data structures are cleared when the running jobs that depend on them finish,
* to prevent memory leaks in a long-running application.
*
* To recover from failures, the same stage might need to run multiple times, which are called
* "attempts". If the TaskScheduler reports that a task failed because a map output file from a
* previous stage was lost, the DAGScheduler resubmits that lost stage. This is detected through a
* CompletionEvent with FetchFailed, or an ExecutorLost event. The DAGScheduler will wait a small
* amount of time to see whether other nodes or tasks fail, then resubmit TaskSets for any lost
* stage(s) that compute the missing tasks. As part of this process, we might also have to create
* Stage objects for old (finished) stages where we previously cleaned up the Stage object. Since
* tasks from the old attempt of a stage could still be running, care must be taken to map any
* events received in the correct Stage object.
*
* Here's a checklist to use when making or reviewing changes to this class:
*
* - All data structures should be cleared when the jobs involving them end to avoid indefinite
* accumulation of state in long-running programs.
*
* - When adding a new data structure, update `DAGSchedulerSuite.assertDataStructuresEmpty` to
* include the new structure. This will help to catch memory leaks.
*/
private[spark] class DAGScheduler(
private[scheduler] val sc : SparkContext,
private[scheduler] val taskScheduler : TaskScheduler ,
listenerBus: LiveListenerBus,
mapOutputTracker: MapOutputTrackerMaster,
blockManagerMaster: BlockManagerMaster,
env: SparkEnv,
clock: Clock = new SystemClock ())
extends Logging
private[spark] class SparkUI private (
val store: AppStatusStore,
val sc: Option[SparkContext],
val conf: SparkConf,
securityManager: SecurityManager,
var appName: String,
val basePath: String,
val startTime: Long,
val appSparkVersion: String)
extends WebUI(securityManager, securityManager.getSSLOptions( "ui"), SparkUI.getUIPort(conf),
conf, basePath, "SparkUI")
with Logging
with UIRoot
//...
/** Constants used by the Spark web UI; the rest of the object is elided in this excerpt. */
private[spark] object SparkUI {
// Port number constant for the UI.
val DEFAULT_PORT = 4040
// Location of the UI's static resources — presumably a classpath-relative path; confirm against its users.
val STATIC_RESOURCE_DIR = "org/apache/spark/ui/static"
// NOTE(review): presumably the name of the default scheduler pool shown in the UI — confirm.
val DEFAULT_POOL_NAME = "default"
//...
}
5. Master
(1)主备切换——HA机制
Spark Master主备切换可以基于两种机制:一种基于文件系统如HDFS(需要手动激活StandbyMaster以切换),一种基于ZooKeeper(可实现自动切换Master)。
private[deploy] class Master(
override val rpcEnv : RpcEnv,
address: RpcAddress,
webUiPort: Int,
val securityMgr: SecurityManager,
val conf: SparkConf)
extends ThreadSafeRpcEndpoint with Logging with LeaderElectable
//总结清理机制:1. 从内存缓存结构中移除;2.从相关的组件内存缓存中移除;3.从持久化存储移除
/**
 * Finishes master recovery: discards workers and applications still in the UNKNOWN state,
 * promotes WAITING applications to RUNNING, deals with drivers that have no worker, then
 * marks the master ALIVE and triggers scheduling.
 *
 * Does nothing unless the master is currently in the RECOVERING state.
 */
private def completeRecovery(): Unit = {
  // Ensure "only-once" recovery semantics using a short synchronization period.
  if (state != RecoveryState.RECOVERING) { return }
  state = RecoveryState.COMPLETING_RECOVERY

  // Kill off any workers and apps that didn't respond to us.
  workers.filter(_.state == WorkerState.UNKNOWN).foreach(
    removeWorker(_, "Not responding for recovery"))
  apps.filter(_.state == ApplicationState.UNKNOWN).foreach(finishApplication)

  // Update the state of recovered apps to RUNNING
  apps.filter(_.state == ApplicationState.WAITING).foreach(_.state = ApplicationState.RUNNING)

  // Reschedule drivers which were not claimed by any workers
  drivers.filter(_.worker.isEmpty).foreach { d =>
    logWarning(s"Driver ${d.id} was not found after master recovery")
    if (d.desc.supervise) {
      logWarning(s"Re-launching ${d.id}")
      relaunchDriver(d)
    } else {
      // Unsupervised drivers are not restarted; they are removed with the ERROR state.
      removeDriver(d.id, DriverState.ERROR, None)
      logWarning(s"Did not re-launch ${d.id} because it was not supervised")
    }
  }

  state = RecoveryState.ALIVE
  schedule()
  logInfo("Recovery complete - resuming operations!")
}
/**
 * Removes a worker from the master's bookkeeping and notifies the affected parties.
 *
 * The worker is marked DEAD and dropped from the lookup tables. Each of its executors is
 * reported to its application's driver as LOST. Each of its drivers is relaunched if
 * supervised, otherwise removed with the ERROR state. Finally every application not in
 * `completedApps` is told about the removal and the worker is removed from the
 * persistence engine.
 *
 * @param worker the worker to remove
 * @param msg    message forwarded to applications in the `WorkerRemoved` notification
 */
private def removeWorker(worker: WorkerInfo, msg: String): Unit = {
  logInfo("Removing worker " + worker.id + " on " + worker.host + ":" + worker.port)
  worker.setState(WorkerState.DEAD)
  // In-memory lookup tables (the original note describes idToWorker as a HashMap).
  idToWorker -= worker.id
  addressToWorker -= worker.endpoint.address

  for (exec <- worker.executors.values) {
    logInfo("Telling app of lost executor: " + exec.id)
    exec.application.driver.send(ExecutorUpdated(
      exec.id, ExecutorState.LOST, Some("worker lost"), None, workerLost = true))
    exec.state = ExecutorState.LOST
    exec.application.removeExecutor(exec)
  }

  for (driver <- worker.drivers.values) {
    if (driver.desc.supervise) {
      logInfo(s"Re-launching ${driver.id}")
      relaunchDriver(driver)
    } else {
      logInfo(s"Not re-launching ${driver.id} because it was not supervised")
      removeDriver(driver.id, DriverState.ERROR, None)
    }
  }

  logInfo(s"Telling app of lost worker: " + worker.id)
  apps.filterNot(completedApps.contains(_)).foreach { app =>
    app.driver.send(WorkerRemoved(worker.id, worker.host, msg))
  }
  persistenceEngine.removeWorker(worker)
}
private[spark] class WorkerInfo(
val id: String,
val host: String,
val port: Int,
val cores: Int,
val memory: Int,
val endpoint: RpcEndpointRef,
val webUiAddress: String)
extends Serializable
/**
 * Replaces a driver with a freshly created one built from the same description.
 *
 * The old driver is removed with the RELAUNCHING state; the replacement is persisted,
 * tracked, queued for scheduling, and a scheduling pass is triggered.
 */
private def relaunchDriver(driver: DriverInfo): Unit = {
  // The replacement must get a new driver id because the original driver may still be
  // running. Scenario: a worker is network-partitioned from the master, the master
  // relaunches driverID1 as driverID2, and then the worker reconnects. If the two ids were
  // equal, the master could not tell status updates of the original driver apart from those
  // of the relaunched one: a DriverStateChanged(driverID1, KILLED) would remove driverID1
  // and the relaunched driver would vanish with it. See SPARK-19900 for details.
  removeDriver(driver.id, DriverState.RELAUNCHING, None)

  val replacement = createDriver(driver.desc)
  persistenceEngine.addDriver(replacement)
  drivers.add(replacement)
  // Queue of drivers awaiting placement (the original note describes it as an ArrayBuffer).
  waitingDrivers += replacement
  schedule()
}
/**
 * Removes the driver with the given id, if the master knows it.
 *
 * A known driver is moved from `drivers` to `completedDrivers` (trimming the oldest entries
 * first when the retained limit is reached), removed from the persistence engine, updated
 * with its final state and exception, detached from its worker, and a scheduling pass is
 * triggered. An unknown id only produces a warning.
 *
 * @param driverId   id of the driver to remove
 * @param finalState state recorded on the removed driver
 * @param exception  optional failure recorded on the removed driver
 */
private def removeDriver(
    driverId: String,
    finalState: DriverState,
    exception: Option[Exception]): Unit = {
  drivers.find(d => d.id == driverId) match {
    case Some(driver) =>
      logInfo(s"Removing driver: $driverId")
      drivers -= driver
      // Keep the completed-driver history bounded: drop a tenth of the limit (at least one).
      if (completedDrivers.size >= RETAINED_DRIVERS) {
        val toRemove = math.max(RETAINED_DRIVERS / 10, 1)
        completedDrivers.trimStart(toRemove)
      }
      completedDrivers += driver
      persistenceEngine.removeDriver(driver)
      driver.state = finalState
      driver.exception = exception
      driver.worker.foreach(w => w.removeDriver(driver))
      schedule()
    case None =>
      logWarning(s"Asked to remove unknown driver: $driverId")
  }
}
/** Removes the application, recording FINISHED as its final state. */
private def finishApplication(app: ApplicationInfo): Unit =
  removeApplication(app, ApplicationState.FINISHED)
/**
 * Removes an application the master is tracking; a no-op for applications not in `apps`.
 *
 * The application is dropped from the lookup tables, appended to the bounded
 * `completedApps` history, its executors are killed, it is marked finished with `state`,
 * removed from the persistence engine, and all workers are told it has finished.
 *
 * @param app   the application to remove
 * @param state final state; the driver is notified only when this is not FINISHED
 */
def removeApplication(app: ApplicationInfo, state: ApplicationState.Value): Unit = {
  if (apps.contains(app)) {
    logInfo("Removing app " + app.id)

    // Forget the application in every lookup table.
    apps -= app
    idToApp -= app.id
    endpointToApp -= app.driver
    addressToApp -= app.driver.address

    // Make room in the history: evict a tenth of the limit (at least one), unregistering
    // the metrics source of each evicted entry first.
    val historyFull = completedApps.size >= RETAINED_APPLICATIONS
    if (historyFull) {
      val evictCount = math.max(RETAINED_APPLICATIONS / 10, 1)
      completedApps.take(evictCount).foreach { evicted =>
        applicationMetricsSystem.removeSource(evicted.appSource)
      }
      completedApps.trimStart(evictCount)
    }
    completedApps += app // Remember it in our history
    waitingApps -= app

    app.executors.values.foreach(running => killExecutor(running))
    app.markFinished(state)
    if (state != ApplicationState.FINISHED) {
      app.driver.send(ApplicationRemoved(state.toString))
    }
    persistenceEngine.removeApplication(app)
    schedule()

    // Tell all workers that the application has finished, so they can clean up any app state.
    workers.foreach(_.endpoint.send(ApplicationFinished(app.id)))
  }
}
(2)注册机制——Worker,Driver,Application
case RegisterApplication(description, driver) =>
  // TODO Prevent repeated registrations from some driver
  // In the STANDBY state the request is ignored and no response is sent.
  if (state != RecoveryState.STANDBY) {
    logInfo("Registering app " + description.name)
    val registered = createApplication(description, driver)
    registerApplication(registered)
    logInfo("Registered app " + description.name + " with ID " + registered.id)
    persistenceEngine.addApplication(registered)
    // Acknowledge to the driver with the assigned application id.
    driver.send(RegisteredApplication(registered.id, self))
    schedule()
  }
/**
 * Builds the `ApplicationInfo` for a newly submitted application.
 *
 * The current wall-clock time is captured once and used both as the start time and, as a
 * `Date`, as the input for generating the application id.
 */
private def createApplication(desc: ApplicationDescription, driver: RpcEndpointRef):
    ApplicationInfo = {
  val submitMillis = System.currentTimeMillis()
  val submitDate = new Date(submitMillis)
  new ApplicationInfo(
    submitMillis, newApplicationId(submitDate), desc, submitDate, driver, defaultCores)
}
/**
 * Records a new application in the master's lookup tables and queues it for scheduling.
 *
 * If an application is already registered at the same driver address, the call only logs
 * the attempt and changes nothing.
 */
private def registerApplication(app: ApplicationInfo): Unit = {
  val driverAddress = app.driver.address
  if (addressToApp.contains(driverAddress)) {
    logInfo("Attempted to re-register application at same address: " + driverAddress)
  } else {
    applicationMetricsSystem.registerSource(app.appSource)

    // Add the app to the in-memory tables (the original note describes `apps` as a HashSet).
    apps += app
    idToApp(app.id) = app
    endpointToApp(app.driver) = app
    addressToApp(driverAddress) = app

    // Add the app to the queue awaiting scheduling (described as an ArrayBuffer in the
    // original note).
    waitingApps += app
  }
}
(3)状态改变机制
case DriverStateChanged(driverId, state, exception) =>
  // Only these four states are accepted here; each leads to the driver being removed.
  val isAcceptedState =
    state == DriverState.ERROR ||
      state == DriverState.FINISHED ||
      state == DriverState.KILLED ||
      state == DriverState.FAILED
  if (isAcceptedState) {
    removeDriver(driverId, state, exception)
  } else {
    throw new Exception(s"Received unexpected state update for driver $driverId: $state ")
  }
/**
 * Removes the driver with the given id, if the master knows it.
 *
 * A known driver is moved from `drivers` to `completedDrivers`, removed from the
 * persistence engine, updated with its final state and exception, detached from its
 * worker, and a scheduling pass is triggered. An unknown id only produces a warning.
 *
 * @param driverId   id of the driver to remove
 * @param finalState state recorded on the removed driver
 * @param exception  optional failure recorded on the removed driver
 */
private def removeDriver(
    driverId: String,
    finalState: DriverState,
    exception: Option[Exception]): Unit = {
  // `find` is a higher-order function: it returns the first driver matching the predicate.
  drivers.find(d => d.id == driverId) match {
    // `Some` is the "value present" case of Option.
    case Some(driver) =>
      logInfo(s"Removing driver: $driverId")
      // Remove from the in-memory collection of active drivers.
      drivers -= driver
      // Keep the completed-driver history bounded: drop a tenth of the limit (at least one).
      if (completedDrivers.size >= RETAINED_DRIVERS) {
        val toRemove = math.max(RETAINED_DRIVERS / 10, 1)
        completedDrivers.trimStart(toRemove)
      }
      // Append to the completed queue.
      completedDrivers += driver
      persistenceEngine.removeDriver(driver)
      driver.state = finalState
      driver.exception = exception
      driver.worker.foreach(w => w.removeDriver(driver))
      schedule()
    case None =>
      logWarning(s"Asked to remove unknown driver: $driverId")
  }
}
case ExecutorStateChanged(appId, execId, state, message, exitStatus) =>
  // Look the executor up through its application; either lookup may come back empty.
  val execOption = idToApp.get(appId).flatMap(app => app.executors.get(execId))
  execOption match {
    case Some(exec) =>
      val appInfo = idToApp(appId)
      val oldState = exec.state
      exec.state = state

      if (state == ExecutorState.RUNNING) {
        assert(oldState == ExecutorState.LAUNCHING,
          s"executor $execId state transfer from $oldState to RUNNING is illegal")
        appInfo.resetRetryCount()
      }

      // Forward the state change to the application's driver as an ExecutorUpdated message.
      exec.application.driver.send(
        ExecutorUpdated(execId, state, message, exitStatus, workerLost = false))

      // Key part: clean-up and retry accounting once the executor has finished.
      if (ExecutorState.isFinished(state)) {
        // Remove this executor from the worker and app
        logInfo(s"Removing executor ${exec.fullId} because it is $state")
        // If an application has already finished, preserve its
        // state to display its information properly on the UI
        if (!appInfo.isFinished) {
          appInfo.removeExecutor(exec)
        }
        exec.worker.removeExecutor(exec)

        val normalExit = exitStatus == Some(0)
        // Only retry a limited number of times so we don't go into an infinite loop.
        // Important note: this code path is not exercised by tests, so be very careful when
        // changing this `if` condition.
        if (!normalExit
            && appInfo.incrementRetryCount() >= MAX_EXECUTOR_RETRIES
            && MAX_EXECUTOR_RETRIES >= 0) { // < 0 disables this application-killing path
          val execs = appInfo.executors.values
          // The application is removed only when none of its executors is still RUNNING.
          if (!execs.exists(_.state == ExecutorState.RUNNING)) {
            logError(s"Application ${appInfo.desc.name} with ID ${appInfo.id} failed " +
              s"${appInfo.retryCount} times; removing it")
            removeApplication(appInfo, ApplicationState.FAILED)
          }
        }
      }
      schedule()
    case None =>
      logWarning(s"Got status update for unknown executor $appId/$execId")
  }
[重要的调度机制schedule()见下篇续]