Worker在启动之后,就会主动向Master进行注册
用spark-submit提交Spark Application的时候,首先会向Master注册Driver
Driver启动好之后,执行我们编写的Application代码,进行SparkContext初始化;底层的SparkDeploySchedulerBackend会通过AppClient内部的线程ClientActor,向Master发送RegisterApplication消息,进行Application的注册
流程图
注册机制原理剖析.png
源码
Driver注册
// Handles a driver submission request. The submission is accepted only while this
// master's recovery state is ALIVE; in any other state the sender receives a failed
// SubmitDriverResponse carrying the current state in its message.
case RequestSubmitDriver(description) => {
  if (state == RecoveryState.ALIVE) {
    logInfo("Driver submitted " + description.command.mainClass)
    // Build the driver record from the submitted description.
    val driver = createDriver(description)
    // Record the driver with the persistence engine.
    persistenceEngine.addDriver(driver)
    // Queue the driver so that it is waiting to be scheduled.
    waitingDrivers += driver
    // Track the driver in the in-memory driver collection.
    drivers.add(driver)
    // Run a scheduling pass now that a new driver is waiting.
    schedule()

    // TODO: It might be good to instead have the submission client poll the master to determine
    //       the current status of the driver. For now it's simply "fire and forget".
    sender ! SubmitDriverResponse(true, Some(driver.id),
      s"Driver successfully submitted as ${driver.id}")
  } else {
    val rejection = s"Can only accept driver submissions in ALIVE state. Current state: $state."
    sender ! SubmitDriverResponse(false, None, rejection)
  }
}
Application注册
// Handles an application registration request. A master whose recovery state is
// STANDBY ignores the message entirely and sends no response.
case RegisterApplication(description) => {
  if (state != RecoveryState.STANDBY) {
    logInfo("Registering app " + description.name)
    // Build the application record from the description and the message sender.
    val app = createApplication(description, sender)
    // Add the application to the master's in-memory bookkeeping.
    registerApplication(app)
    logInfo("Registered app " + description.name + " with ID " + app.id)
    // Record the application with the persistence engine.
    persistenceEngine.addApplication(app)
    // Acknowledge the registration to the sender.
    sender ! RegisteredApplication(app.id, masterUrl)
    // Run a scheduling pass now that a new application is waiting.
    schedule()
  }
}
看下 registerApplication()方法
// 注册Application
/**
 * Adds an application to the master's in-memory bookkeeping and to the queue of
 * applications waiting to be scheduled.
 *
 * If an application is already registered under the same driver address, the call
 * only logs the attempt and leaves all state untouched.
 *
 * @param app the application to register
 */
def registerApplication(app: ApplicationInfo): Unit = {
  val driverAddress = app.driver.path.address
  if (addressToApp.contains(driverAddress)) {
    logInfo("Attempted to re-register application at same address: " + driverAddress)
  } else {
    // Register the application's source with the application metrics system.
    applicationMetricsSystem.registerSource(app.appSource)
    // Index the application by id, by driver actor and by driver address.
    apps += app
    idToApp(app.id) = app
    actorToApp(app.driver) = app
    addressToApp(driverAddress) = app
    // Queue the application so that it is waiting to be scheduled.
    waitingApps += app
  }
}
Worker注册
// Handles a worker registration request. The attempt is always logged; a master in
// the STANDBY recovery state then ignores it without replying. Otherwise the sender
// receives either RegisteredWorker or RegisterWorkerFailed.
case RegisterWorker(id, workerHost, workerPort, cores, memory, workerUiPort, publicAddress) =>
{
  logInfo("Registering worker %s:%d with %d cores, %s RAM".format(
    workerHost, workerPort, cores, Utils.megabytesToString(memory)))
  if (state != RecoveryState.STANDBY) {
    if (idToWorker.contains(id)) {
      // A worker with this id is already registered.
      sender ! RegisterWorkerFailed("Duplicate worker ID")
    } else {
      // Build the worker record; the message sender is stored as the worker's actor.
      val candidate = new WorkerInfo(id, workerHost, workerPort, cores, memory,
        sender, workerUiPort, publicAddress)
      if (registerWorker(candidate)) {
        // Record the worker with the persistence engine, acknowledge, then reschedule.
        persistenceEngine.addWorker(candidate)
        sender ! RegisteredWorker(masterUrl, masterWebUiUrl)
        schedule()
      } else {
        // registerWorker refused the worker; report the conflicting address.
        val workerAddress = candidate.actor.path.address
        logWarning("Worker registration failed. Attempted to re-register worker at same " +
          "address: " + workerAddress)
        sender ! RegisterWorkerFailed("Attempted to re-register worker at same address: "
          + workerAddress)
      }
    }
  }
}
看下registerWorker()
/**
 * Adds a worker to the master's in-memory bookkeeping.
 *
 * DEAD workers recorded with the same host and port are dropped from `workers` first.
 * If another worker is already registered at the same actor address, the new worker
 * is accepted only when that existing worker is in the UNKNOWN state, in which case
 * the existing one is removed.
 *
 * @param worker the worker attempting to register
 * @return true if the worker was registered, false if its address is already taken
 *         by a worker that is not in the UNKNOWN state
 */
def registerWorker(worker: WorkerInfo): Boolean = {
  // There may be one or more refs to dead workers on this same node (w/ different ID's),
  // remove them.
  val deadOnSameNode = workers.filter { w =>
    (w.host == worker.host && w.port == worker.port) && (w.state == WorkerState.DEAD)
  }
  deadOnSameNode.foreach(workers -= _)

  val workerAddress = worker.actor.path.address
  val previous = addressToWorker.get(workerAddress)
  if (previous.exists(_.state != WorkerState.UNKNOWN)) {
    logInfo("Attempted to re-register worker at same address: " + workerAddress)
    false
  } else {
    // A worker registering from UNKNOWN implies that the worker was restarted during recovery.
    // The old worker must thus be dead, so we will remove it and accept the new worker.
    previous.foreach(stale => removeWorker(stale))
    // Track the worker in the workers collection and index it by id and by address.
    workers += worker
    idToWorker(worker.id) = worker
    addressToWorker(workerAddress) = worker
    true
  }
}
看下removeWorker方法
/**
 * Removes a worker from the master's bookkeeping and deals with everything that
 * was recorded as running on it.
 *
 * The worker is marked DEAD and dropped from the id and address indexes. For each of
 * its executors, the owning application's driver is sent an ExecutorUpdated message
 * with state LOST and the executor is removed from the application. Each of its
 * drivers is relaunched if it was submitted with `supervise`, otherwise removed with
 * state ERROR. Finally the worker is removed from the persistence engine.
 *
 * @param worker the worker to remove
 */
def removeWorker(worker: WorkerInfo): Unit = {
  logInfo("Removing worker " + worker.id + " on " + worker.host + ":" + worker.port)
  // Mark the worker as dead.
  worker.setState(WorkerState.DEAD)
  // Drop the worker from the id index and the address index.
  idToWorker -= worker.id
  addressToWorker -= worker.actor.path.address
  for (exec <- worker.executors.values) {
    logInfo("Telling app of lost executor: " + exec.id)
    // Tell the owning application's driver that this executor is lost.
    exec.application.driver ! ExecutorUpdated(
      exec.id, ExecutorState.LOST, Some("worker lost"), None)
    // Remove the executor from its application.
    exec.application.removeExecutor(exec)
  }
  for (driver <- worker.drivers.values) {
    if (driver.desc.supervise) {
      logInfo(s"Re-launching ${driver.id}")
      // Supervised drivers are put back into the scheduling queue.
      relaunchDriver(driver)
    } else {
      logInfo(s"Not re-launching ${driver.id} because it was not supervised")
      // Unsupervised drivers are removed with an ERROR final state.
      removeDriver(driver.id, DriverState.ERROR, None)
    }
  }
  // Remove the worker from the persistence engine.
  persistenceEngine.removeWorker(worker)
}
接着看下removeDriver()和relaunchDriver()方法
// 重新启动driver
/**
 * Puts a driver back into the scheduling queue so that it can be launched again.
 *
 * @param driver the driver to relaunch
 */
def relaunchDriver(driver: DriverInfo): Unit = {
  // Detach the driver from its worker.
  driver.worker = None
  // Mark the driver as RELAUNCHING.
  driver.state = DriverState.RELAUNCHING
  // Queue the driver again so that it is waiting to be scheduled.
  waitingDrivers += driver
  // Run a scheduling pass.
  schedule()
}
// 删除driver
/**
 * Removes a driver from the active set and records it among the completed drivers.
 *
 * If no driver with the given id is known, a warning is logged and nothing else happens.
 *
 * @param driverId   id of the driver to remove
 * @param finalState state assigned to the driver once it has been removed
 * @param exception  optional exception stored on the driver
 */
def removeDriver(driverId: String, finalState: DriverState, exception: Option[Exception]): Unit = {
  // Look the driver up by id among the tracked drivers.
  drivers.find(_.id == driverId) match {
    case Some(driver) =>
      logInfo(s"Removing driver: $driverId")
      // Drop the driver from the in-memory driver collection.
      drivers -= driver
      // Once the completed list has reached RETAINED_DRIVERS entries, trim a tenth of
      // that limit (at least one entry) from its front before appending.
      if (completedDrivers.size >= RETAINED_DRIVERS) {
        val toRemove = math.max(RETAINED_DRIVERS / 10, 1)
        completedDrivers.trimStart(toRemove)
      }
      // Record the driver among the completed drivers.
      completedDrivers += driver
      // Remove the driver from the persistence engine.
      persistenceEngine.removeDriver(driver)
      // Store the caller-supplied final state and optional exception on the driver.
      driver.state = finalState
      driver.exception = exception
      // If the driver is attached to a worker, remove it from that worker.
      driver.worker.foreach(w => w.removeDriver(driver))
      // Run a scheduling pass.
      schedule()
    case None =>
      logWarning(s"Asked to remove unknown driver: $driverId")
  }
}