Master failover (active/standby switchover) mechanism
The following code is from Master.scala (org.apache.spark.deploy.master.Master):
/**
* Completing the master's failover is, in effect, completing the master's recovery.
*/
def completeRecovery() {
// Ensure "only-once" recovery semantics using a short synchronization period.
synchronized {
if (state != RecoveryState.RECOVERING) { return }
state = RecoveryState.COMPLETING_RECOVERY
}
// Kill off any workers and apps that didn't respond to us.
// Filter out the applications and workers whose state is still UNKNOWN,
// then iterate over them and call removeWorker / finishApplication respectively,
// cleaning up applications and workers that may have failed or already died.
// Cleanup steps: 1. remove from the Master's in-memory caches, 2. remove from the
// in-memory caches of related components, 3. remove from the persistence engine.
workers.filter(_.state == WorkerState.UNKNOWN).foreach(removeWorker)
apps.filter(_.state == ApplicationState.UNKNOWN).foreach(finishApplication)
// Reschedule drivers which were not claimed by any workers
drivers.filter(_.worker.isEmpty).foreach { d =>
logWarning(s"Driver ${d.id} was not found after master recovery")
if (d.desc.supervise) {
logWarning(s"Re-launching ${d.id}")
relaunchDriver(d)
} else {
removeDriver(d.id, DriverState.ERROR, None)
logWarning(s"Did not re-launch ${d.id} because it was not supervised")
}
}
state = RecoveryState.ALIVE
schedule()
logInfo("Recovery complete - resuming operations!")
}
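For reference, the three cleanup steps described above map onto removeWorker roughly as follows. This is a paraphrased sketch of the Spark 1.x (Akka-era) method, not a verbatim copy, so field and helper names may differ slightly from the exact source:

def removeWorker(worker: WorkerInfo) {
  logInfo("Removing worker " + worker.id + " on " + worker.host + ":" + worker.port)
  worker.setState(WorkerState.DEAD)
  // 1. remove from the Master's in-memory caches
  idToWorker -= worker.id
  addressToWorker -= worker.actor.path.address
  // 2. remove from related components' in-memory caches: tell each affected
  //    application that its executors on this worker are lost, and drop them
  for (exec <- worker.executors.values) {
    exec.application.driver ! ExecutorUpdated(exec.id, ExecutorState.LOST, Some("worker lost"), None)
    exec.application.removeExecutor(exec)
  }
  // drivers that ran on the lost worker are relaunched (if supervised) or removed
  for (driver <- worker.drivers.values) {
    if (driver.desc.supervise) relaunchDriver(driver)
    else removeDriver(driver.id, DriverState.ERROR, None)
  }
  // 3. remove from the persistence engine
  persistenceEngine.removeWorker(worker)
}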
Master registration mechanism
/**
* Handle an application registration request.
*/
case RegisterApplication(description) => {
if (state == RecoveryState.STANDBY) {
// ignore, don't send response
} else {
logInfo("Registering app " + description.name)
//Build an ApplicationInfo from the submitted application description
val app = createApplication(description, sender)
//Register the application: add the ApplicationInfo to the in-memory caches
//and append it to waitingApps, the queue of applications awaiting scheduling
registerApplication(app)
logInfo("Registered app " + description.name + " with ID " + app.id)
//Persist the ApplicationInfo with the persistence engine
persistenceEngine.addApplication(app)
// sender: actors message each other; here the Master replies to the ClientActor
// inside SparkDeploySchedulerBackend's AppClient with a RegisteredApplication message
sender ! RegisteredApplication(app.id, masterUrl)
schedule()
}
}
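What registerApplication does with the ApplicationInfo, roughly: it drops duplicate registrations from the same driver address, adds the app to the Master's in-memory indexes, and appends it to waitingApps so that schedule() can later pick it up. A paraphrased sketch of the Spark 1.x helper (exact field names may differ slightly):

def registerApplication(app: ApplicationInfo): Unit = {
  val appAddress = app.driver.path.address
  if (addressToApp.contains(appAddress)) {
    logInfo("Attempted to re-register application at same address: " + appAddress)
    return
  }
  // in-memory caches / indexes
  apps += app
  idToApp(app.id) = app
  actorToApp(app.driver) = app
  addressToApp(appAddress) = app
  // queue consumed by schedule()
  waitingApps += app
}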
Note:
// AppClient to Master
case class RegisterApplication(appDescription: ApplicationDescription) extends DeployMessage
// Master to AppClient
case class RegisteredApplication(appId: String, masterUrl: String) extends DeployMessage
These two messages are the request/response pair exchanged between the AppClient's ClientActor and the Master during application registration.
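On the other side of this exchange sits the ClientActor inside AppClient (created by SparkDeploySchedulerBackend). A paraphrased sketch of the Spark 1.x (Akka-based) client side, showing how it sends RegisterApplication and consumes the RegisteredApplication reply (method and field names quoted from memory, so details may differ):

// inside AppClient.ClientActor (paraphrased)
def tryRegisterAllMasters(): Unit = {
  for (masterAkkaUrl <- masterAkkaUrls) {
    logInfo("Connecting to master " + masterAkkaUrl + "...")
    // request: AppClient -> Master
    context.actorSelection(masterAkkaUrl) ! RegisterApplication(appDescription)
  }
}

override def receive = {
  // response: Master -> AppClient
  case RegisteredApplication(appId_, masterUrl) =>
    appId = appId_
    registered = true
    changeMaster(masterUrl)
    // notify SparkDeploySchedulerBackend that registration succeeded
    listener.connected(appId)
  // (other messages such as ExecutorAdded / ExecutorUpdated omitted)
}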
schedule() source: the resource scheduling algorithm
Random.shuffle: the workers registered so far are filtered down to those whose state is ALIVE and then randomly shuffled with Random.shuffle.
Driver scheduling
Application scheduling (a toy comparison of the two strategies appears after the source below):
- spreadOutApps algorithm: spread cores evenly across as many workers as possible (the default)
- non-spreadOutApps algorithm: assign to as few workers as possible, i.e. consolidate
private def schedule() {
if (state != RecoveryState.ALIVE) { return }
/**
* First schedule drivers, they take strict precedence over applications
* Randomization helps balance drivers
* Random.shuffle randomly permutes the elements of the collection passed in.
* Take all workers that registered earlier, keep only those whose state is ALIVE,
* and shuffle them with Random.shuffle.
*/
val shuffledAliveWorkers = Random.shuffle(workers.toSeq.filter(_.state == WorkerState.ALIVE))
val numWorkersAlive = shuffledAliveWorkers.size
var curPos = 0
/**
* Schedule drivers. When is a driver registered with the Master and therefore scheduled here?
* Only when an application is submitted in standalone cluster mode (--deploy-mode cluster).
* In client mode (standalone client or yarn-client) the driver is launched directly on the
* submitting machine, so it is never registered with the Master and never scheduled by it.
*
* Driver scheduling: iterate over the waitingDrivers ArrayBuffer.
*/
for (driver <- waitingDrivers.toList) { // iterate over a copy of waitingDrivers
// We assign workers to each waiting driver in a round-robin fashion. For each driver, we
// start from the last worker that was assigned a driver, and continue onwards until we have
// explored all alive workers.
var launched = false
var numWorkersVisited = 0
// The driver has not been launched yet (launched == false); keep going while there are alive workers we have not visited
while (numWorkersVisited < numWorkersAlive && !launched) {
val worker = shuffledAliveWorkers(curPos)
numWorkersVisited += 1
// This worker's free memory >= the driver's required memory AND its free cores >= the driver's required cores
if (worker.memoryFree >= driver.desc.mem && worker.coresFree >= driver.desc.cores) {
// Launch the driver on this worker
launchDriver(worker, driver)
// Remove the driver from the waitingDrivers queue
waitingDrivers -= driver
// Set launched to true so the while loop exits
launched = true
}
// Move the pointer to the next worker
curPos = (curPos + 1) % numWorkersAlive
}
}
// Right now this is a very simple FIFO scheduler. We keep trying to fit in the first app
// in the queue, then the second app, etc.
/**
* Application scheduling:
* 1) spreadOutApps algorithm: spread cores evenly, across as many workers as possible
* 2) non-spreadOutApps algorithm: assign to as few workers as possible, i.e. consolidate
*/
if (spreadOutApps) {
// Try to spread out each app among all the nodes, until it has all its cores
// Iterate over the ApplicationInfos in waitingApps, keeping only apps that still have cores left to schedule
for (app <- waitingApps if app.coresLeft > 0) {
val usableWorkers = workers.toArray.filter(_.state == WorkerState.ALIVE)
.filter(canUse(app, _)).sortBy(_.coresFree).reverse
val numUsable = usableWorkers.length
val assigned = new Array[Int](numUsable) // Number of cores to give on each node
// Cores to assign: the minimum of the app's remaining cores and the total free cores across usable workers
var toAssign = math.min(app.coresLeft, usableWorkers.map(_.coresFree).sum)
var pos = 0
/**
* Spread the executors each application needs evenly over the usable workers.
* For example, with 20 cores to assign and 10 workers, each worker gets 2 cores.
*/
while (toAssign > 0) {
// This worker still has free cores beyond what has already been assigned to it in this pass
if (usableWorkers(pos).coresFree - assigned(pos) > 0) {
// Decrement the total still to assign, since one core has just been placed on this worker
toAssign -= 1
// Increment the number of cores assigned to this worker
assigned(pos) += 1
}
// Move the pointer to the next worker and repeat
pos = (pos + 1) % numUsable
}
// Now that we've decided how many cores to give on each node, let's actually give them
// Having decided how many cores each worker receives for this application,
// iterate over the workers
for (pos <- 0 until numUsable) {
// Only workers that were actually assigned cores get an executor
if (assigned(pos) > 0) {
/**
* Note: in Spark 1.3.0, even though spark-submit lets you request a number of executors
* plus per-executor cores and memory, the final number of executors, and the cores each
* one gets, may differ from what was requested, because this assignment is driven purely
* by the total core count. For example, if 3 executors with 3 cores each are requested
* (9 cores in total) but there are 9 workers with 1 free core each, this algorithm gives
* every worker 1 core and launches one executor on each of them: the result is
* 9 executors, each with a single core.
*/
val exec = app.addExecutor(usableWorkers(pos), assigned(pos))
// The master sends a message telling the worker to launch the executor
launchExecutor(usableWorkers(pos), exec)
app.state = ApplicationState.RUNNING
}
}
}
} else {
/**
* Pack each app into as few nodes as possible until we've assigned all its cores.
* Non-spreadOutApps algorithm: use as few workers as possible, i.e. consolidate.
* For example, with 10 workers of 10 cores each and an app that needs 20 cores, only
* 2 workers are used, each fully occupied with 10 cores; other apps move on to the
* remaining workers. Under this algorithm the app ends up with just 2 executors,
* each with 10 cores.
*/
for (worker <- workers if worker.coresFree > 0 && worker.state == WorkerState.ALIVE) {
// Iterate over the applications that still have cores left to assign
for (app <- waitingApps if app.coresLeft > 0) {
if (canUse(app, worker)) {
// Use the minimum of the worker's free cores and the cores the app still needs
val coresToUse = math.min(worker.coresFree, app.coresLeft)
if (coresToUse > 0) {
val exec = app.addExecutor(worker, coresToUse)
launchExecutor(worker, exec)
app.state = ApplicationState.RUNNING
}
}
}
}
}
}
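To make the difference between the two strategies concrete, here is a small, self-contained toy sketch (illustrative code only, not Spark source) that reproduces the two core-assignment loops above on example inputs:

object AllocationDemo {
  // Spread-out: hand out one core at a time, round-robin across all usable workers
  def spreadOut(freeCores: Array[Int], coresLeft: Int): Array[Int] = {
    val assigned = new Array[Int](freeCores.length)
    var toAssign = math.min(coresLeft, freeCores.sum)
    var pos = 0
    while (toAssign > 0) {
      if (freeCores(pos) - assigned(pos) > 0) {
        toAssign -= 1
        assigned(pos) += 1
      }
      pos = (pos + 1) % freeCores.length
    }
    assigned
  }

  // Consolidate: fill up each worker completely before moving on to the next one
  def consolidate(freeCores: Array[Int], coresLeft: Int): Array[Int] = {
    val assigned = new Array[Int](freeCores.length)
    var toAssign = coresLeft
    for (pos <- freeCores.indices if toAssign > 0) {
      val coresToUse = math.min(freeCores(pos), toAssign)
      assigned(pos) = coresToUse
      toAssign -= coresToUse
    }
    assigned
  }

  def main(args: Array[String]): Unit = {
    val workers = Array.fill(10)(10)                 // 10 workers, 10 free cores each
    println(spreadOut(workers, 20).mkString(","))    // 2,2,2,2,2,2,2,2,2,2  -> 10 executors, 2 cores each
    println(consolidate(workers, 20).mkString(","))  // 10,10,0,0,0,0,0,0,0,0 -> 2 executors, 10 cores each
    // The Spark 1.3.0 note above: 9 cores requested, 9 workers with 1 free core each
    println(spreadOut(Array.fill(9)(1), 9).mkString(",")) // 1,1,1,1,1,1,1,1,1 -> 9 single-core executors
  }
}

In the real Master this choice is controlled by the spreadOutApps flag (the spark.deploy.spreadOut configuration); spreading out is the default because distributing an application's executors across more nodes is usually better for data locality.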