Spark-HeartbeatReceiver 源码解析
HeartbeatReceiver这个类是一个 endPoint,在driver端才有其对象。它的主要作用是 定时监测 注册到 本driver的所有的executor 是否存活。
下面来看看源码:
class HeartbeatReceiver
private[spark] class HeartbeatReceiver(sc: SparkContext, clock: Clock)
extends SparkListener with ThreadSafeRpcEndpoint with Logging {
def this(sc: SparkContext) {
this(sc, new SystemClock)
}
sc.listenerBus.addToManagementQueue(this)
override val rpcEnv: RpcEnv = sc.env.rpcEnv
private[spark] var scheduler: TaskScheduler = null
// executor ID -> timestamp of when the last heartbeat from this executor was received
private val executorLastSeen = new mutable.HashMap[String, Long] //保存 executor 的 最新 心跳时间
// "spark.network.timeout" uses "seconds", while `spark.storage.blockManagerSlaveTimeoutMs` uses
// "milliseconds"
private val slaveTimeoutMs =
sc.conf.getTimeAsMs("spark.storage.blockManagerSlaveTimeoutMs", "120s") //blockManager endPoint 超时时间
private val executorTimeoutMs =
sc.conf.getTimeAsSeconds("spark.network.timeout", s"${slaveTimeoutMs}ms") * 1000 //executor 超时时间
// "spark.network.timeoutInterval" uses "seconds", while
// "spark.storage.blockManagerTimeoutIntervalMs" uses "milliseconds"
private val timeoutIntervalMs =
sc.conf.getTimeAsMs("spark.storage.blockManagerTimeoutIntervalMs", "60s") //blockManager 超时时间
private val checkTimeoutIntervalMs =
sc.conf.getTimeAsSeconds("spark.network.timeoutInterval", s"${timeoutIntervalMs}ms") * 1000
private var timeoutCheckingTask: ScheduledFuture[_] = null
// "eventLoopThread" is used to run some pretty fast actions. The actions running in it should not
// block the thread for a long time.
private val eventLoopThread =
ThreadUtils.newDaemonSingleThreadScheduledExecutor("heartbeat-receiver-event-loop-thread") //driver 检查 executor心跳 线程
private val killExecutorThread = ThreadUtils.newDaemonSingleThreadExecutor("kill-executor-thread") //处理 exector 心跳失败 kill 的线程
override def onStart(): Unit = { //启动 ExpireDeadHosts 定时监测
timeoutCheckingTask = eventLoopThread.scheduleAtFixedRate(new Runnable {
override def run(): Unit = Utils.tryLogNonFatalError {
Option(self).foreach(_.ask[Boolean](ExpireDeadHosts)) //处理在 receiveAndReply 的 case ExpireDeadHosts中
}
}, 0, checkTimeoutIntervalMs, TimeUnit.MILLISECONDS)
}
override def receiveAndReply(context: RpcCallContext): PartialFunction[Any, Unit] = {
// Messages sent and received locally
case ExecutorRegistered(executorId) => //增加一个executor
executorLastSeen(executorId) = clock.getTimeMillis() //更新 executor 的 最新 心跳时间
context.reply(true)
case ExecutorRemoved(executorId) => //溢出一个executor
executorLastSeen.remove(executorId) //更新 executor 的 最新 心跳时间
context.reply(true)
case TaskSchedulerIsSet =>
scheduler = sc.taskScheduler //为本对象 设置 taskscheduler属性
context.reply(true)
case ExpireDeadHosts => //检查 executor 是否超时
expireDeadHosts() //检查 executor 是否超时 ,和超时后的处理方法
context.reply(true)
// Messages received from executors 处理 executor 的 心跳信息
case heartbeat @ Heartbeat(executorId, accumUpdates, blockManagerId) =>
if (scheduler != null) { //taskscheduler 会在 SparkContext 后面 创建 _heartbeatReceiver.ask[Boolean](TaskSchedulerIsSet) 这句 就是对应的操作
if (executorLastSeen.contains(executorId)) { //原来已经汇报过了
executorLastSeen(executorId) = clock.getTimeMillis() //更新 executor 的 最新 心跳时间
eventLoopThread.submit(new Runnable {
override def run(): Unit = Utils.tryLogNonFatalError {
val unknownExecutor: Boolean = !scheduler.executorHeartbeatReceived( //unknownExecutor这个如果是 true的话,则会在 executor 中 重新注册blockManager
// 可以看org.apache.spark.executor.Executor的reportHeartBeat方法
executorId, accumUpdates, blockManagerId)
//如果 driver的BlockManagerMasterEndPoint 中已经注册过了 这个 blockManagerId,则返回false,则不需要executor 再次 重新注册 BlockManager
val response = HeartbeatResponse(reregisterBlockManager = unknownExecutor)
context.reply(response)
}
})
} else {//还没有 在 driver 注册过
// This may happen if we get an executor's in-flight heartbeat immediately
// after we just removed it. It's not really an error condition so we should
// not log warning here. Otherwise there may be a lot of noise especially if
// we explicitly remove executors (SPARK-4134).
logDebug(s"Received heartbeat from unknown executor $executorId")
context.reply(HeartbeatResponse(reregisterBlockManager = true)) //需要 executor 重新注册 BlockManager
}
} else {// TaskScheduler 还没有 在 此类中还没有完成 持有化
// Because Executor will sleep several seconds before sending the first "Heartbeat", this
// case rarely happens. However, if it really happens, log it and ask the executor to
// register itself again.
logWarning(s"Dropping $heartbeat because TaskScheduler is not ready yet")
context.reply(HeartbeatResponse(reregisterBlockManager = true))
}
}
/**
* Send ExecutorRegistered to the event loop to add a new executor. Only for test.
*
* @return if HeartbeatReceiver is stopped, return None. Otherwise, return a Some(Future) that
* indicate if this operation is successful.
*/
//add 一个executor
def addExecutor(executorId: String): Option[Future[Boolean]] = {
Option(self).map(_.ask[Boolean](ExecutorRegistered(executorId)))
}
/**
* If the heartbeat receiver is not stopped, notify it of executor registrations.
*/
//add 一个executor
override def onExecutorAdded(executorAdded: SparkListenerExecutorAdded): Unit = {
addExecutor(executorAdded.executorId)
}
/**
* Send ExecutorRemoved to the event loop to remove an executor. Only for test.
*
* @return if HeartbeatReceiver is stopped, return None. Otherwise, return a Some(Future) that
* indicate if this operation is successful.
*/
//移除一个executor
def removeExecutor(executorId: String): Option[Future[Boolean]] = {
Option(self).map(_.ask[Boolean](ExecutorRemoved(executorId)))
}
/**
* If the heartbeat receiver is not stopped, notify it of executor removals so it doesn't
* log superfluous errors.
*
* Note that we must do this after the executor is actually removed to guard against the
* following race condition: if we remove an executor's metadata from our data structure
* prematurely, we may get an in-flight heartbeat from the executor before the executor is
* actually removed, in which case we will still mark the executor as a dead host later
* and expire it with loud error messages.
*/
//移除一个executor
override def onExecutorRemoved(executorRemoved: SparkListenerExecutorRemoved): Unit = {
removeExecutor(executorRemoved.executorId)
}
//检查 executor 是否超时 ,和超时后的处理方法
private def expireDeadHosts(): Unit = {
logTrace("Checking for hosts with no recent heartbeats in HeartbeatReceiver.")
val now = clock.getTimeMillis()
for ((executorId, lastSeenMs) <- executorLastSeen) { // executorLastSeen 是保存 executor 的 最新 心跳时间
if (now - lastSeenMs > executorTimeoutMs) {//executor 超过 超时时间设置
logWarning(s"Removing executor $executorId with no recent heartbeats: " +
s"${now - lastSeenMs} ms exceeds timeout $executorTimeoutMs ms")
//taskScheduler 执行 executor lost 相应的 操作
scheduler.executorLost(executorId, SlaveLost("Executor heartbeat " +
s"timed out after ${now - lastSeenMs} ms"))
// Asynchronously kill the executor to avoid blocking the current thread
killExecutorThread.submit(new Runnable {// kill executor 线程 单独处理 kill 和 replace 这个 executor
override def run(): Unit = Utils.tryLogNonFatalError {
// Note: we want to get an executor back after expiring this one,
// so do not simply call `sc.killExecutor` here (SPARK-8119)
sc.killAndReplaceExecutor(executorId) //schedulerBackend 的处理方法
}
})
executorLastSeen.remove(executorId) //executorLastSeen 是保存 executor 的 最新 心跳时间 移除这个 executor
}
}
}
override def onStop(): Unit = {
if (timeoutCheckingTask != null) {
timeoutCheckingTask.cancel(true)
}
eventLoopThread.shutdownNow()
killExecutorThread.shutdownNow()
}
}
object HeartbeatReceiver
/**
 * Companion object holding the name under which the [[HeartbeatReceiver]]
 * endpoint is registered with the driver's RpcEnv.
 */
private[spark] object HeartbeatReceiver {
  // Explicit type on a public member, per Scala style for non-private API.
  val ENDPOINT_NAME: String = "HeartbeatReceiver"
}