Master
Active Master挂掉后会切换到Standby Master;完成Master主备切换,也就是完成Master的恢复(completeRecovery)
/**
 * Finishes master recovery (excerpt). Does nothing unless the master is
 * currently in the RECOVERING state; otherwise moves it to
 * COMPLETING_RECOVERY and cleans up every worker and application whose state
 * is still UNKNOWN.
 */
private def completeRecovery(){
  // Only proceed from RECOVERING; any other state leaves everything untouched.
  if (state == RecoveryState.RECOVERING) {
    state = RecoveryState.COMPLETING_RECOVERY

    // Workers still marked UNKNOWN are removed with the reason string below.
    val unknownWorkers = workers.filter(worker => worker.state == WorkerState.UNKNOWN)
    unknownWorkers.foreach { worker =>
      removeWorker(worker, "Not responding for recovery")
    }

    // Applications still marked UNKNOWN are finished.
    val unknownApps = apps.filter(app => app.state == ApplicationState.UNKNOWN)
    unknownApps.foreach(finishApplication)
  }
}
清理出故障或者甚至已经死掉的worker
//依次处理该worker上的executor和driver,通知各application,并从持久化存储中去掉该worker
/**
 * Removes a worker: marks it DEAD, drops it from the lookup tables, reports
 * its executors as lost, relaunches or removes its drivers, notifies the
 * applications that are not in completedApps, and deletes the worker from the
 * persistence engine.
 *
 * @param worker the worker being removed
 * @param msg    reason text forwarded in the WorkerRemoved message
 */
private def removeWorker(worker: WorkerInfo, msg: String) {
  // Mark the worker dead and drop it from the id and address lookup tables.
  worker.setState(WorkerState.DEAD)
  idToWorker -= worker.id
  addressToWorker -= worker.endpoint.address

  // For every executor on this worker: tell the owning application's driver
  // that it was lost, flag it LOST, then detach it from the application.
  worker.executors.values.foreach { exec =>
    logInfo("Telling app of lost executor: " + exec.id)
    exec.application.driver.send(
      ExecutorUpdated(exec.id, ExecutorState.LOST, Some("worker lost"), None, workerLost = true))
    exec.state = ExecutorState.LOST
    exec.application.removeExecutor(exec)
  }

  // For every driver on this worker: relaunch it when its description has
  // supervise set, otherwise remove it with state ERROR.
  worker.drivers.values.foreach { driver =>
    if (!driver.desc.supervise) {
      logInfo(s"Not re-launching ${driver.id} because it was not supervised")
      removeDriver(driver.id, DriverState.ERROR, None)
    } else {
      logInfo(s"Re-launching ${driver.id}")
      relaunchDriver(driver)
    }
  }

  // Send WorkerRemoved (id, host, reason) to the driver of each application
  // that is not contained in completedApps.
  for (app <- apps.filterNot(completedApps.contains(_))) {
    app.driver.send(WorkerRemoved(worker.id, worker.host, msg))
  }

  // Finally delete the worker from the persistence engine.
  persistenceEngine.removeWorker(worker)
}
//重启Driver
/**
 * Relaunches a driver: removes the existing one under the RELAUNCHING state,
 * creates a new driver from the same description, registers it, and triggers
 * scheduling.
 *
 * @param driver the driver to replace
 */
private def relaunchDriver(driver: DriverInfo) {
  // Remove the current driver first, recording RELAUNCHING as its state.
  removeDriver(driver.id, DriverState.RELAUNCHING, None)

  // Build a new driver from the same description, then persist it, add it to
  // the drivers collection and append it to waitingDrivers.
  val replacement = createDriver(driver.desc)
  persistenceEngine.addDriver(replacement)
  drivers.add(replacement)
  waitingDrivers += replacement

  schedule()
}
清理出故障或者甚至已经死掉的Application
//finishApplication即removeApplication
/**
 * Finishes an application by delegating to removeApplication with the
 * FINISHED state.
 *
 * @param app the application to finish
 */
private def finishApplication(app: ApplicationInfo) {
  removeApplication(app = app, state = ApplicationState.FINISHED)
}
//移除Application:把该Application的信息从缓存和持久化存储中移除
/**
 * Removes an application: sends ApplicationRemoved to its driver (once per
 * executor as written), deletes the application from the persistence engine,
 * calls schedule(), and sends ApplicationFinished to every worker.
 *
 * NOTE(review): this block looks like an abridged excerpt; see the inline
 * notes on the empty retention check and the executor loop before relying on it.
 *
 * @param app   the application being removed
 * @param state the state reported to the driver, sent as state.toString
 */
def removeApplication(app: ApplicationInfo, state: ApplicationState.Value) {
// Retention check against RETAINED_APPLICATIONS.
// NOTE(review): the body is empty, so nothing happens when the limit is
// reached; the trimming of completedApps appears to have been elided. Confirm.
if (completedApps.size >= RETAINED_APPLICATIONS) {}
// Iterates over the application's executors; exec itself is never used in the body.
for (exec <- app.executors.values) {
// Sends ApplicationRemoved with the state name to the application's driver.
// NOTE(review): as written this is sent once per executor, and not at all
// when the application has no executors; presumably per-executor cleanup
// was elided here. Verify against the full source.
app.driver.send(ApplicationRemoved(state.toString))
}
// Delete the application from the persistence engine, then call schedule().
persistenceEngine.removeApplication(app)
schedule()
// Send ApplicationFinished (with this application's id) to every worker.
workers.foreach { w =>
w.endpoint.send(ApplicationFinished(app.id))
}
}
更新等待中的application状态
// Switch every application that is currently WAITING over to RUNNING.
apps
  .filter(app => app.state == ApplicationState.WAITING)
  .foreach { app => app.state = ApplicationState.RUNNING }
清理不属于任何worker的driver
//与清理worker上的driver类似,只是这里先过滤出不属于任何worker(_.worker.isEmpty)的driver,再执行清理
// Handle every driver that has no worker: relaunch it when its description
// has supervise set, otherwise remove it with state ERROR.
for (d <- drivers.filter(_.worker.isEmpty)) {
  logWarning(s"Driver ${d.id} was not found after master recovery")
  if (!d.desc.supervise) {
    // Unsupervised: remove first, then log, in that order.
    removeDriver(d.id, DriverState.ERROR, None)
    logWarning(s"Did not re-launch ${d.id} because it was not supervised")
  } else {
    logWarning(s"Re-launching ${d.id}")
    relaunchDriver(d)
  }
}
最后调用schedule()
总结清理机制:
1、从内存缓存中移除;
2、从相关的组件的内存缓存中移除;
3、从持久化存储中移除