In the Spark source code, three kinds of entities register with the Master: workers, drivers (when an application is submitted in cluster deploy mode against a standalone master), and applications.
Worker registration is a good entry point into the source, so let's start there.
Worker registration is handled on the Master side, so we go into the spark-core module, to the class org/apache/spark/deploy/master/Master.scala. The Master handles registration in its receive() method, a PartialFunction[Any, Unit]: it takes any incoming message, returns nothing, and pattern-matches on case classes such as RegisterWorker and RegisterApplication.
case RegisterWorker(
    id, workerHost, workerPort, workerRef, cores, memory, workerWebUiUrl, masterAddress) =>
  logInfo("Registering worker %s:%d with %d cores, %s RAM".format(
    workerHost, workerPort, cores, Utils.megabytesToString(memory)))
  // If this master is in STANDBY state, reply MasterInStandby and do not register the worker
  if (state == RecoveryState.STANDBY) {
    workerRef.send(MasterInStandby)
  // If this worker ID is already registered, reply that the worker already exists
  } else if (idToWorker.contains(id)) {
    workerRef.send(RegisterWorkerFailed("Duplicate worker ID"))
  } else {
    // Build a WorkerInfo for the new worker
    val worker = new WorkerInfo(id, workerHost, workerPort, cores, memory,
      workerRef, workerWebUiUrl)
    // registerWorker() decides whether this worker may register; it mainly weeds out
    // DEAD workers and handles re-registration from the same address
    if (registerWorker(worker)) {
      // Persist the worker via the persistence engine and send the master's address
      // back to the worker
      persistenceEngine.addWorker(worker)
      workerRef.send(RegisteredWorker(self, masterWebUiUrl, masterAddress))
      // With the worker registered, trigger a scheduling round
      schedule()
    } else {
      val workerAddress = worker.endpoint.address
      logWarning("Worker registration failed. Attempted to re-register worker at same " +
        "address: " + workerAddress)
      workerRef.send(RegisterWorkerFailed("Attempted to re-register worker at same address: "
        + workerAddress))
    }
  }
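The other half of this handshake lives in org.apache.spark.deploy.worker.Worker, which sends the RegisterWorker message and reacts to whichever reply comes back. Below is a minimal, self-contained sketch of that reply protocol; the case classes are simplified stand-ins for the real Spark messages, not the actual definitions:

// Simplified sketch of the registration replies; not the real Spark classes.
sealed trait RegisterResponse
case object MasterInStandby extends RegisterResponse
case class RegisteredWorker(masterWebUiUrl: String) extends RegisterResponse
case class RegisterWorkerFailed(message: String) extends RegisterResponse

object WorkerSideSketch {
  // The worker side reacts to whatever the Master replied with.
  def handleRegisterResponse(msg: RegisterResponse): Unit = msg match {
    case RegisteredWorker(url)       => println(s"Successfully registered with master at $url")
    case RegisterWorkerFailed(cause) => println(s"Worker registration failed: $cause")
    case MasterInStandby             => println("Master is standby; waiting for the active master")
  }

  def main(args: Array[String]): Unit = {
    handleRegisterResponse(RegisteredWorker("http://master:8080"))
    handleRegisterResponse(RegisterWorkerFailed("Duplicate worker ID"))
  }
}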
registerWorker() validates the worker that is trying to register:
private def registerWorker(worker: WorkerInfo): Boolean = {
  // There may be one or more refs to dead workers on this same node (w/ different ID's),
  // remove them.
  workers.filter { w =>
    (w.host == worker.host && w.port == worker.port) && (w.state == WorkerState.DEAD)
  }.foreach { w =>
    workers -= w
  }
  val workerAddress = worker.endpoint.address
  if (addressToWorker.contains(workerAddress)) {
    val oldWorker = addressToWorker(workerAddress)
    if (oldWorker.state == WorkerState.UNKNOWN) {
      // A worker registering from UNKNOWN implies that the worker was restarted during recovery.
      // The old worker must thus be dead, so we will remove it and accept the new worker.
      removeWorker(oldWorker, "Worker replaced by a new worker with same address")
    } else {
      logInfo("Attempted to re-register worker at same address: " + workerAddress)
      return false
    }
  }
  // Record the accepted worker in all three bookkeeping structures
  workers += worker
  idToWorker(worker.id) = worker
  addressToWorker(workerAddress) = worker
  true
}
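Note that the method keeps three views of the same data in sync: the workers set, the idToWorker map, and the addressToWorker map. A toy model of that invariant (WorkerRecord and the duplicate-address rule below are simplifications for illustration, not the real WorkerInfo):

import scala.collection.mutable

// Toy model of the Master's three synchronized views of registered workers.
case class WorkerRecord(id: String, host: String, port: Int)

object WorkerRegistry {
  val workers = mutable.HashSet[WorkerRecord]()
  val idToWorker = mutable.HashMap[String, WorkerRecord]()
  val addressToWorker = mutable.HashMap[(String, Int), WorkerRecord]()

  def register(w: WorkerRecord): Boolean = {
    val address = (w.host, w.port)
    if (addressToWorker.contains(address)) return false // same address already registered
    // Accepting a worker must update all three structures, or lookups diverge.
    workers += w
    idToWorker(w.id) = w
    addressToWorker(address) = w
    true
  }

  def main(args: Array[String]): Unit = {
    println(register(WorkerRecord("worker-1", "node1", 7078))) // true
    println(register(WorkerRecord("worker-2", "node1", 7078))) // false: duplicate address
  }
}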
Tearing down a stale worker, such as the old UNKNOWN instance above, is done by removeWorker():
private def removeWorker(worker: WorkerInfo, msg: String) {
  logInfo("Removing worker " + worker.id + " on " + worker.host + ":" + worker.port)
  // Mark the worker DEAD and drop it from the id and address indexes
  worker.setState(WorkerState.DEAD)
  idToWorker -= worker.id
  addressToWorker -= worker.endpoint.address
  // Mark every executor on this worker as LOST and remove it from its application
  for (exec <- worker.executors.values) {
    logInfo("Telling app of lost executor: " + exec.id)
    exec.application.driver.send(ExecutorUpdated(
      exec.id, ExecutorState.LOST, Some("worker lost"), None, workerLost = true))
    exec.state = ExecutorState.LOST
    exec.application.removeExecutor(exec)
  }
  // Relaunch each driver on this worker that was submitted with supervise enabled;
  // otherwise remove the driver outright
  for (driver <- worker.drivers.values) {
    if (driver.desc.supervise) {
      logInfo(s"Re-launching ${driver.id}")
      relaunchDriver(driver)
    } else {
      logInfo(s"Not re-launching ${driver.id} because it was not supervised")
      removeDriver(driver.id, DriverState.ERROR, None)
    }
  }
  // Tell the driver of every non-completed app that this worker has been removed
  logInfo(s"Telling app of lost worker: " + worker.id)
  apps.filterNot(completedApps.contains(_)).foreach { app =>
    app.driver.send(WorkerRemoved(worker.id, worker.host, msg))
  }
  // Remove the worker from the persistence engine
  persistenceEngine.removeWorker(worker)
}
Relaunching a driver is handled by relaunchDriver():
private def relaunchDriver(driver: DriverInfo) {
  // We must setup a new driver with a new driver id here, because the original driver may
  // be still running. Consider this scenario: a worker is network partitioned with master,
  // the master then relaunches driver driverID1 with a driver id driverID2, then the worker
  // reconnects to master. From this point on, if driverID2 is equal to driverID1, then master
  // can not distinguish the statusUpdate of the original driver and the newly relaunched one,
  // for example, when DriverStateChanged(driverID1, KILLED) arrives at master, master will
  // remove driverID1, so the newly relaunched driver disappears too. See SPARK-19900 for details.
  // Remove the old driver with state RELAUNCHING, create and persist a new driver from its
  // description, then put the new driver on the waiting queue
  removeDriver(driver.id, DriverState.RELAUNCHING, None)
  val newDriver = createDriver(driver.desc)
  persistenceEngine.addDriver(newDriver)
  drivers.add(newDriver)
  waitingDrivers += newDriver
  // schedule() will place the new driver on a worker
  schedule()
}
Back to worker registration: once a worker registers successfully, the Master can schedule drivers on it. This is done by schedule():
/**
 * Schedule the currently available resources among waiting apps. This method will be called
 * every time a new app joins or resource availability changes.
 */
private def schedule(): Unit = {
  // If this master is not ALIVE, do nothing and return
  if (state != RecoveryState.ALIVE) {
    return
  }
  // Drivers take strict precedence over executors
  // Collect all previously registered workers that are ALIVE and shuffle them
  val shuffledAliveWorkers = Random.shuffle(workers.toSeq.filter(_.state == WorkerState.ALIVE))
  val numWorkersAlive = shuffledAliveWorkers.size
  var curPos = 0
  // Schedule drivers first. Only drivers submitted in standalone cluster deploy mode sit in
  // this queue; in client deploy mode the driver starts locally and never registers with
  // the master.
  for (driver <- waitingDrivers.toList) { // iterate over a copy of waitingDrivers
    // We assign workers to each waiting driver in a round-robin fashion. For each driver, we
    // start from the last worker that was assigned a driver, and continue onwards until we have
    // explored all alive workers.
    var launched = false
    var numWorkersVisited = 0
    // Keep going while some alive worker has not been visited and the driver is not yet launched
    while (numWorkersVisited < numWorkersAlive && !launched) {
      // Take the next alive worker and count it as visited
      val worker = shuffledAliveWorkers(curPos)
      numWorkersVisited += 1
      // If this worker's free memory and free cores both cover what the driver needs,
      // launch the driver there and drop it from the waiting queue
      if (worker.memoryFree >= driver.desc.mem && worker.coresFree >= driver.desc.cores) {
        launchDriver(worker, driver)
        waitingDrivers -= driver
        launched = true
      }
      // Advance the cursor to the next worker
      curPos = (curPos + 1) % numWorkersAlive
    }
  }
  // After placing drivers, launch executors on the workers
  startExecutorsOnWorkers()
}
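The round-robin placement is easier to see in isolation. The sketch below mirrors the driver loop above with made-up Node and DriverReq types and invented resource numbers; it is an illustration, not the real Master code:

import scala.util.Random

// Toy round-robin driver placement, mirroring the loop in schedule().
case class Node(name: String, var memFree: Int, var coresFree: Int)
case class DriverReq(id: String, mem: Int, cores: Int)

object RoundRobinSketch {
  def place(workers: Seq[Node], drivers: Seq[DriverReq]): Unit = {
    val shuffled = Random.shuffle(workers) // shuffle once, like shuffledAliveWorkers
    var curPos = 0
    for (d <- drivers) {
      var launched = false
      var visited = 0
      while (visited < shuffled.size && !launched) {
        val w = shuffled(curPos)
        visited += 1
        if (w.memFree >= d.mem && w.coresFree >= d.cores) {
          w.memFree -= d.mem; w.coresFree -= d.cores
          println(s"Driver ${d.id} -> ${w.name}")
          launched = true
        }
        curPos = (curPos + 1) % shuffled.size // cursor keeps advancing across drivers
      }
      if (!launched) println(s"Driver ${d.id} stays in waitingDrivers")
    }
  }

  def main(args: Array[String]): Unit = {
    val nodes = Seq(Node("w1", 4096, 4), Node("w2", 2048, 2))
    place(nodes, Seq(DriverReq("driver-1", 1024, 1), DriverReq("driver-2", 8192, 8)))
  }
}

Because the cursor curPos is not reset between drivers, consecutive drivers land on different workers instead of piling onto the first one that fits.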
Once the drivers are placed, the Master launches the executors that each application needs on the workers. This is done by startExecutorsOnWorkers():
/**
 * Schedule and launch executors on workers
 */
private def startExecutorsOnWorkers(): Unit = {
  // Right now this is a very simple FIFO scheduler. We keep trying to fit in the first app
  // in the queue, then the second app, etc.
  for (app <- waitingApps) {
    // Cores each executor of this app needs; default to 1 if not configured
    val coresPerExecutor = app.desc.coresPerExecutor.getOrElse(1)
    // If the cores left is less than the coresPerExecutor, the cores left will not be allocated
    if (app.coresLeft >= coresPerExecutor) {
      // Filter out workers that don't have enough resources to launch an executor
      val usableWorkers = workers.toArray.filter(_.state == WorkerState.ALIVE)
        // keep workers whose free memory and free cores cover one executor
        .filter(worker => worker.memoryFree >= app.desc.memoryPerExecutorMB &&
          worker.coresFree >= coresPerExecutor)
        // sort by free cores, descending
        .sortBy(_.coresFree).reverse
      // Decide how many cores each usable worker gets for this app. With the default
      // spreadOutApps algorithm, cores are assigned first, spread across as many
      // workers as possible; executors are launched from those assignments afterwards
      val assignedCores = scheduleExecutorsOnWorkers(app, usableWorkers, spreadOutApps)
      // Now that we've decided how many cores to allocate on each worker, let's allocate them
      for (pos <- 0 until usableWorkers.length if assignedCores(pos) > 0) {
        allocateWorkerResourceToExecutors(
          app, assignedCores(pos), app.desc.coresPerExecutor, usableWorkers(pos))
      }
    }
  }
}
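The usable-worker selection above is a plain filter-and-sort pipeline. A toy version with invented workers (the W case class and all resource numbers are assumptions for illustration):

// Toy version of the usableWorkers pipeline in startExecutorsOnWorkers().
case class W(name: String, alive: Boolean, memFree: Int, coresFree: Int)

object UsableWorkersSketch {
  def main(args: Array[String]): Unit = {
    val memoryPerExecutorMB = 1024 // assumed app requirement
    val coresPerExecutor = 2       // assumed app requirement
    val workers = Seq(
      W("w1", alive = true,  memFree = 4096, coresFree = 8),
      W("w2", alive = true,  memFree = 512,  coresFree = 8),  // too little memory
      W("w3", alive = false, memFree = 8192, coresFree = 16), // not alive
      W("w4", alive = true,  memFree = 2048, coresFree = 4))
    val usable = workers
      .filter(_.alive)
      .filter(w => w.memFree >= memoryPerExecutorMB && w.coresFree >= coresPerExecutor)
      .sortBy(_.coresFree).reverse // most free cores first
    println(usable.map(_.name).mkString(", ")) // w1, w4
  }
}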
Scheduling executors means dividing CPU cores among the usable workers; this is done by scheduleExecutorsOnWorkers():
/**
 * Schedule executors to be launched on the workers.
 * Returns an array containing number of cores assigned to each worker.
 *
 * There are two modes of launching executors. The first attempts to spread out an application's
 * executors on as many workers as possible, while the second does the opposite (i.e. launch them
 * on as few workers as possible). The former is usually better for data locality purposes and is
 * the default.
 *
 * The number of cores assigned to each executor is configurable. When this is explicitly set,
 * multiple executors from the same application may be launched on the same worker if the worker
 * has enough cores and memory. Otherwise, each executor grabs all the cores available on the
 * worker by default, in which case only one executor per application may be launched on each
 * worker during one single schedule iteration.
 *
 * Note that when `spark.executor.cores` is not set, we may still launch multiple executors from
 * the same application on the same worker. Consider appA and appB both have one executor running
 * on worker1, and appA.coresLeft > 0, then appB is finished and release all its cores on worker1,
 * thus for the next schedule iteration, appA launches a new executor that grabs all the free
 * cores on worker1, therefore we get multiple executors from appA running on worker1.
 *
 * It is important to allocate coresPerExecutor on each worker at a time (instead of 1 core
 * at a time). Consider the following example: cluster has 4 workers with 16 cores each.
 * User requests 3 executors (spark.cores.max = 48, spark.executor.cores = 16). If 1 core is
 * allocated at a time, 12 cores from each worker would be assigned to each executor.
 * Since 12 < 16, no executors would launch [SPARK-8881].
 */
// Takes an app, the usable workers, and the scheduling mode; returns an array of
// cores assigned per worker
private def scheduleExecutorsOnWorkers(
    app: ApplicationInfo,
    usableWorkers: Array[WorkerInfo],
    spreadOutApps: Boolean): Array[Int] = {
  val coresPerExecutor = app.desc.coresPerExecutor // cores each executor needs, if configured
  val minCoresPerExecutor = coresPerExecutor.getOrElse(1) // minimum cores per executor; 1 if unset
  // If spark.executor.cores is not set, launch at most one executor per worker,
  // and that executor grabs all the cores assigned to the worker
  val oneExecutorPerWorker = coresPerExecutor.isEmpty
  val memoryPerExecutor = app.desc.memoryPerExecutorMB // memory each executor needs
  val numUsable = usableWorkers.length // number of usable workers
  val assignedCores = new Array[Int](numUsable) // Number of cores to give to each worker
  val assignedExecutors = new Array[Int](numUsable) // Number of new executors on each worker
  // Cores to assign: the smaller of what the app still needs and what the workers offer
  var coresToAssign = math.min(app.coresLeft, usableWorkers.map(_.coresFree).sum)

  /** Return whether the specified worker can launch an executor for this app. */
  // pos is the index of the worker in the usableWorkers array
  def canLaunchExecutor(pos: Int): Boolean = {
    val keepScheduling = coresToAssign >= minCoresPerExecutor
    // The worker's free cores, minus what this round has already assigned to it,
    // must still cover one executor's minimum cores
    val enoughCores = usableWorkers(pos).coresFree - assignedCores(pos) >= minCoresPerExecutor
    // If we allow multiple executors per worker, then we can always launch new executors.
    // Otherwise, if there is already an executor on this worker, just give it more cores.
    val launchingNewExecutor = !oneExecutorPerWorker || assignedExecutors(pos) == 0
    if (launchingNewExecutor) {
      // Memory already assigned = executors assigned to this worker * memory per executor
      val assignedMemory = assignedExecutors(pos) * memoryPerExecutor
      // The worker must still have enough free memory for one more executor
      val enoughMemory = usableWorkers(pos).memoryFree - assignedMemory >= memoryPerExecutor
      // And the app must stay under its executor limit
      val underLimit = assignedExecutors.sum + app.executors.size < app.executorLimit
      keepScheduling && enoughCores && enoughMemory && underLimit
    } else {
      // We're adding cores to an existing executor, so no need
      // to check memory and executor limits
      keepScheduling && enoughCores
    }
  }
  // Keep launching executors until no more workers can accommodate any
  // more executors, or if we have reached this application's limits
  var freeWorkers = (0 until numUsable).filter(canLaunchExecutor)
  while (freeWorkers.nonEmpty) { // while some worker can still take an executor
    freeWorkers.foreach { pos =>
      var keepScheduling = true
      while (keepScheduling && canLaunchExecutor(pos)) {
        // Each round moves minCoresPerExecutor cores from the pool to this worker
        coresToAssign -= minCoresPerExecutor
        assignedCores(pos) += minCoresPerExecutor
        // If we are launching one executor per worker, then every iteration assigns 1 core
        // to the executor. Otherwise, every iteration assigns cores to a new executor.
        if (oneExecutorPerWorker) {
          assignedExecutors(pos) = 1
        } else {
          assignedExecutors(pos) += 1
        }
        // Spreading out an application means spreading out its executors across as
        // many workers as possible. If we are not spreading out, then we should keep
        // scheduling executors on this worker until we use all of its resources.
        // Otherwise, just move on to the next worker.
        // Note where the inner while loop starts and ends: under spreadOutApps we assign
        // once per worker per pass; otherwise we saturate the current worker first
        if (spreadOutApps) {
          keepScheduling = false
        }
      }
    }
    // Re-filter for the workers that can still launch an executor; the set shrinks
    // as resources are consumed
    freeWorkers = freeWorkers.filter(canLaunchExecutor)
  }
  // Return the array of how many cores each worker was assigned
  assignedCores
}
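The SPARK-8881 example from the scaladoc can be replayed with a stripped-down version of this loop: 4 workers with 16 free cores each, spark.cores.max = 48, spark.executor.cores = 16. The sketch below is a simplification of scheduleExecutorsOnWorkers, not the real method; it shows why allocating coresPerExecutor at a time launches three full executors, while allocating 1 core at a time strands 12 cores on every worker so no 16-core executor ever fits:

// Simplified spread-out allocation: each round gives `step` cores to every worker
// that still has at least `coresPerExecutor` headroom, until the pool runs dry.
object Spark8881Sketch {
  def allocate(workerCores: Array[Int], coresToAssign0: Int,
               coresPerExecutor: Int, step: Int): Array[Int] = {
    val assigned = Array.fill(workerCores.length)(0)
    var coresToAssign = coresToAssign0
    var progress = true
    while (coresToAssign >= step && progress) {
      progress = false
      for (pos <- workerCores.indices
           if coresToAssign >= step &&
              workerCores(pos) - assigned(pos) >= coresPerExecutor) {
        assigned(pos) += step
        coresToAssign -= step
        progress = true
      }
    }
    assigned
  }

  def main(args: Array[String]): Unit = {
    val workers = Array(16, 16, 16, 16)
    // Allocate coresPerExecutor (16) at a time: three workers each get a full executor.
    println(allocate(workers, 48, 16, 16).mkString(" ")) // 16 16 16 0
    // Allocate 1 core at a time: 48 cores spread as 12 per worker, and since
    // 12 < 16 no worker ever accumulates enough cores for a single executor.
    println(allocate(workers, 48, 1, 1).mkString(" "))   // 12 12 12 12
  }
}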