YarnClusterSchedulerBackend
Mainly responsible for communicating with executors; it runs in the driver's user thread. Tasks from TaskSchedulerImpl are dispatched to executors through this class and its parent classes.
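As a minimal sketch of that startup chain (toy classes under assumed names, not Spark's real signatures): SparkContext starts the TaskSchedulerImpl, whose start() calls the backend's start(), which is how the override below is reached.
object StartupChainSketch extends App {
  trait SchedulerBackendSketch { def start(): Unit }
  class FakeYarnBackend extends SchedulerBackendSketch {
    def start(): Unit = println("backend.start(): bind to YARN, register endpoints")
  }
  class FakeTaskScheduler(backend: SchedulerBackendSketch) {
    def start(): Unit = backend.start() // TaskSchedulerImpl.start() delegates to the backend
  }
  new FakeTaskScheduler(new FakeYarnBackend).start()
}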
private[spark] class YarnClusterSchedulerBackend(
scheduler: TaskSchedulerImpl, // holds the TaskSchedulerImpl instance
sc: SparkContext)
extends YarnSchedulerBackend(scheduler, sc) {
// start() is invoked from TaskSchedulerImpl.start()
override def start() {
val attemptId = ApplicationMaster.getAttemptId // fetch the YARN attempt ID
bindToYarn(attemptId.getApplicationId(), Some(attemptId))
super.start()
totalExpectedExecutors = SchedulerBackendUtils.getInitialTargetExecutorNumber(sc.conf) // number of executors from spark-submit --num-executors; defaults to 2 if unset
}
// URLs of the driver's logs on the YARN NodeManager
override def getDriverLogUrls: Option[Map[String, String]] = {
var driverLogs: Option[Map[String, String]] = None
try {
val yarnConf = new YarnConfiguration(sc.hadoopConfiguration)
val containerId = YarnSparkHadoopUtil.getContainerId
val httpAddress = System.getenv(Environment.NM_HOST.name()) +
":" + System.getenv(Environment.NM_HTTP_PORT.name())
// lookup appropriate http scheme for container log urls
val yarnHttpPolicy = yarnConf.get(
YarnConfiguration.YARN_HTTP_POLICY_KEY,
YarnConfiguration.YARN_HTTP_POLICY_DEFAULT
)
val user = Utils.getCurrentUserName()
val httpScheme = if (yarnHttpPolicy == "HTTPS_ONLY") "https://" else "http://"
val baseUrl = s"$httpScheme$httpAddress/node/containerlogs/$containerId/$user"
logDebug(s"Base URL for logs: $baseUrl")
driverLogs = Some(Map(
"stdout" -> s"$baseUrl/stdout?start=-4096",
"stderr" -> s"$baseUrl/stderr?start=-4096"))
} catch {
case e: Exception =>
logInfo("Error while building AM log links, so AM" +
" logs link will not appear in application UI", e)
}
driverLogs
}
}
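For illustration, a self-contained sketch of the URL shape getDriverLogUrls produces; the host, port, container ID, and user below are made-up values:
object DriverLogUrlSketch extends App {
  val httpScheme  = "http://"                                // yarn.http.policy != HTTPS_ONLY
  val httpAddress = "nm-host.example.com:8042"               // NM_HOST + ":" + NM_HTTP_PORT
  val containerId = "container_1555555555555_0001_01_000001" // YarnSparkHadoopUtil.getContainerId
  val user        = "spark"                                  // Utils.getCurrentUserName()
  val baseUrl = s"$httpScheme$httpAddress/node/containerlogs/$containerId/$user"
  println(s"$baseUrl/stdout?start=-4096") // last 4 KB of driver stdout
  println(s"$baseUrl/stderr?start=-4096") // last 4 KB of driver stderr
}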
YarnSchedulerBackend
/**
 * Note:
 * The subclasses of this class are YarnClusterSchedulerBackend and YarnClientSchedulerBackend,
 * so constructing either of them also initializes YarnSchedulerBackend.
 * The driver registers the yarnSchedulerEndpointRef endpoint here.
 * @param scheduler
 * @param sc
 */
private[spark] abstract class YarnSchedulerBackend(
scheduler: TaskSchedulerImpl,
sc: SparkContext)
extends CoarseGrainedSchedulerBackend(scheduler, sc.env.rpcEnv) {
private val stopped = new AtomicBoolean(false)
override val minRegisteredRatio = // minimum ratio of executors that must register with the driver; defaults to 0.8 here
if (conf.getOption("spark.scheduler.minRegisteredResourcesRatio").isEmpty) {
0.8
} else {
super.minRegisteredRatio
}
protected var totalExpectedExecutors = 0 // number of executors from spark-submit --num-executors; defaults to 2 if unset
private val yarnSchedulerEndpoint = new YarnSchedulerEndpoint(rpcEnv) // mediates between the driver's user thread and the ApplicationMaster (Spark AM) thread
// the driver registers the YarnSchedulerBackend endpoint
private val yarnSchedulerEndpointRef = rpcEnv.setupEndpoint(
YarnSchedulerBackend.ENDPOINT_NAME, yarnSchedulerEndpoint)
private implicit val askTimeout = RpcUtils.askRpcTimeout(sc.conf)
/** Application ID. */
protected var appId: Option[ApplicationId] = None
/** Attempt ID. This is unset for client-mode schedulers */
private var attemptId: Option[ApplicationAttemptId] = None
/** Scheduler extension services. */
private val services: SchedulerExtensionServices = new SchedulerExtensionServices() // SchedulerExtensionService hooks; users can plug in custom services
/**
* Bind to YARN. This *must* be done before calling [[start()]].
*
* @param appId YARN application ID
* @param attemptId Optional YARN attempt ID
*/
protected def bindToYarn(appId: ApplicationId, attemptId: Option[ApplicationAttemptId]): Unit = {
this.appId = Some(appId)
this.attemptId = attemptId
}
override def start() {
require(appId.isDefined, "application ID unset")
val binding = SchedulerExtensionServiceBinding(sc, appId.get, attemptId)
services.start(binding) // start the user-defined SchedulerExtensionServices
super.start()
}
override def stop(): Unit = {
try {
// SPARK-12009: To prevent Yarn allocator from requesting backup for the executors which
// were stopped by SchedulerBackend.
requestTotalExecutors(0, 0, Map.empty)
super.stop()
} finally {
stopped.set(true)
services.stop()
}
}
/**
* Get the attempt ID for this run, if the cluster manager supports multiple
* attempts. Applications run in client mode will not have attempt IDs.
* This attempt ID only includes attempt counter, like "1", "2".
*
* @return The application attempt id, if available.
*/
override def applicationAttemptId(): Option[String] = {
attemptId.map(_.getAttemptId.toString)
}
/**
* Get an application ID associated with the job.
* This returns the string value of [[appId]] if set, otherwise
* the locally-generated ID from the superclass.
* @return The application ID
*/
override def applicationId(): String = {
appId.map(_.toString).getOrElse {
logWarning("Application ID is not initialized yet.")
super.applicationId
}
}
// builds a request for a target number of executors; used by dynamic resource allocation
private[cluster] def prepareRequestExecutors(requestedTotal: Int): RequestExecutors = {
val nodeBlacklist: Set[String] = scheduler.nodeBlacklist() // Spark's node blacklist mechanism
// For locality preferences, ignore preferences for nodes that are blacklisted
val filteredHostToLocalTaskCount =
hostToLocalTaskCount.filter { case (k, v) => !nodeBlacklist.contains(k) }
RequestExecutors(requestedTotal, localityAwareTasks, filteredHostToLocalTaskCount,
nodeBlacklist)
}
/**
* Request executors from the ApplicationMaster by specifying the total number desired.
* This includes executors already pending or running.
*/
// asks the AM, via the YarnSchedulerEndpoint, to request new executor containers from YARN
override def doRequestTotalExecutors(requestedTotal: Int): Future[Boolean] = {
yarnSchedulerEndpointRef.ask[Boolean](prepareRequestExecutors(requestedTotal))
}
/**
* Request that the ApplicationMaster kill the specified executors.
*/
// asks YARN (via the AM) to kill the given executors
override def doKillExecutors(executorIds: Seq[String]): Future[Boolean] = {
yarnSchedulerEndpointRef.ask[Boolean](KillExecutors(executorIds))
}
override def sufficientResourcesRegistered(): Boolean = {
//totalRegisteredExecutors = the number of executors currently registered with the driver
totalRegisteredExecutors.get() >= totalExpectedExecutors * minRegisteredRatio // i.e. spark-submit --num-executors * 0.8
}
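// Worked example (assuming defaults): with spark-submit --num-executors 10 and
// minRegisteredRatio = 0.8, this returns true once totalRegisteredExecutors >= 10 * 0.8 = 8.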
/**
* Add filters to the SparkUI.
*/
private def addWebUIFilter(
filterName: String,
filterParams: Map[String, String],
proxyBase: String): Unit = {
if (proxyBase != null && proxyBase.nonEmpty) {
System.setProperty("spark.ui.proxyBase", proxyBase)
}
val hasFilter =
filterName != null && filterName.nonEmpty &&
filterParams != null && filterParams.nonEmpty
if (hasFilter) {
logInfo(s"Add WebUI Filter. $filterName, $filterParams, $proxyBase")
conf.set("spark.ui.filters", filterName)
filterParams.foreach { case (k, v) => conf.set(s"spark.$filterName.param.$k", v) }
scheduler.sc.ui.foreach { ui => JettyUtils.addFilters(ui.getHandlers, conf) }
}
}
// overridden to create a YarnDriverEndpoint instead of the base DriverEndpoint
override def createDriverEndpoint(properties: Seq[(String, String)]): DriverEndpoint = {
new YarnDriverEndpoint(rpcEnv, properties)
}
/**
* Reset the state of SchedulerBackend to the initial state. This happens when the AM fails
* and re-registers itself with the driver after a failure. The stale state in the driver should be
* cleaned.
*/
override protected def reset(): Unit = {
super.reset()
sc.executorAllocationManager.foreach(_.reset())
}
/**
* Override the DriverEndpoint to add extra logic for the case when an executor is disconnected.
* This endpoint communicates with the executors and queries the AM for an executor's exit
* status when the executor is disconnected.
*/
// subclasses the inner DriverEndpoint of the parent class CoarseGrainedSchedulerBackend
private class YarnDriverEndpoint(rpcEnv: RpcEnv, sparkProperties: Seq[(String, String)])
extends DriverEndpoint(rpcEnv, sparkProperties) {
/**
* When onDisconnected is received at the driver endpoint, the superclass DriverEndpoint
* handles it by assuming the Executor was lost for a bad reason and removes the executor
* immediately.
*
* In YARN's case however it is crucial to talk to the application master and ask why the
* executor had exited. If the executor exited for some reason unrelated to the running tasks
* (e.g., preemption), according to the application master, then we pass that information down
* to the TaskSetManager to inform the TaskSetManager that tasks on that lost executor should
* not count towards a job failure.
*/
override def onDisconnected(rpcAddress: RpcAddress): Unit = {
addressToExecutorId.get(rpcAddress).foreach { executorId =>
if (!stopped.get) {
if (disableExecutor(executorId)) {
yarnSchedulerEndpoint.handleExecutorDisconnectedFromDriver(executorId, rpcAddress)
}
}
}
}
}
/**
* An [[RpcEndpoint]] that communicates with the ApplicationMaster.
*/
// mediates between the driver's user thread and the ApplicationMaster (Spark AM) thread
private class YarnSchedulerEndpoint(override val rpcEnv: RpcEnv)
extends ThreadSafeRpcEndpoint with Logging {
private var amEndpoint: Option[RpcEndpointRef] = None
private[YarnSchedulerBackend] def handleExecutorDisconnectedFromDriver(
executorId: String,
executorRpcAddress: RpcAddress): Unit = {
// ask the AM why the executor was lost
val removeExecutorMessage = amEndpoint match {
case Some(am) =>
val lossReasonRequest = GetExecutorLossReason(executorId)
am.ask[ExecutorLossReason](lossReasonRequest, askTimeout)
.map { reason => RemoveExecutor(executorId, reason) }(ThreadUtils.sameThread)
.recover {
case NonFatal(e) =>
logWarning(s"Attempted to get executor loss reason" +
s" for executor id ${executorId} at RPC address ${executorRpcAddress}," +
s" but got no response. Marking as slave lost.", e)
RemoveExecutor(executorId, SlaveLost())
}(ThreadUtils.sameThread)
case None =>
logWarning("Attempted to check for an executor loss reason" +
" before the AM has registered!")
Future.successful(RemoveExecutor(executorId, SlaveLost("AM is not yet registered.")))
}
removeExecutorMessage.foreach { message => driverEndpoint.send(message) }
}
override def receive: PartialFunction[Any, Unit] = {
case RegisterClusterManager(am) => // sent from ApplicationMaster's AMEndpoint.onStart(), which calls driver.send(RegisterClusterManager(self))
logInfo(s"ApplicationMaster registered as $am")
amEndpoint = Option(am) // hold a reference to the AM's AMEndpoint from here on
reset()
case AddWebUIFilter(filterName, filterParams, proxyBase) =>
addWebUIFilter(filterName, filterParams, proxyBase)
case r @ RemoveExecutor(executorId, reason) =>
if (!stopped.get) {
logWarning(s"Requesting driver to remove executor $executorId for reason $reason")
driverEndpoint.send(r)
}
}
override def receiveAndReply(context: RpcCallContext): PartialFunction[Any, Unit] = {
case r: RequestExecutors => // container request destined for YARN; handled by the AM-side AMEndpoint. Sent by doRequestTotalExecutors in this class.
amEndpoint match {
case Some(am) =>
am.ask[Boolean](r).andThen {
case Success(b) => context.reply(b)
case Failure(NonFatal(e)) =>
logError(s"Sending $r to AM was unsuccessful", e)
context.sendFailure(e)
}(ThreadUtils.sameThread)
case None =>
logWarning("Attempted to request executors before the AM has registered!")
context.reply(false)
}
case k: KillExecutors =>
amEndpoint match {
case Some(am) =>
am.ask[Boolean](k).andThen { // forward the KillExecutors message to the AM; handled by the AM-side AMEndpoint
case Success(b) => context.reply(b)
case Failure(NonFatal(e)) =>
logError(s"Sending $k to AM was unsuccessful", e)
context.sendFailure(e)
}(ThreadUtils.sameThread)
case None =>
logWarning("Attempted to kill executors before the AM has registered!")
context.reply(false)
}
// answers YarnAllocator's query for the last allocated executor ID
case RetrieveLastAllocatedExecutorId =>
context.reply(currentExecutorIdCounter) // currentExecutorIdCounter starts at 0
}
override def onDisconnected(remoteAddress: RpcAddress): Unit = {
if (amEndpoint.exists(_.address == remoteAddress)) {
logWarning(s"ApplicationMaster has disassociated: $remoteAddress")
amEndpoint = None
}
}
}
}
private[spark] object YarnSchedulerBackend {
val ENDPOINT_NAME = "YarnScheduler"
}
CoarseGrainedSchedulerBackend
This class is the parent of YarnSchedulerBackend, which is why YarnSchedulerBackend's start() calls this class's start() method.
It contains an inner DriverEndpoint responsible for the communication between the driver-side backend and the executors: dispatching tasks, receiving task status updates from executors, and registering executors.
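The send (fire-and-forget) versus ask (reply expected) pattern the DriverEndpoint relies on can be sketched in plain Scala. The toy loop below only mimics the shape of the protocol; it is not Spark's RpcEnv, and the names are made up:
object DriverEndpointSketch extends App {
  case class RegisterExecutor(id: String, cores: Int) // ask: the sender awaits a Boolean reply
  case class LaunchTask(taskId: Long)                 // send: fire-and-forget

  var freeCores = Map.empty[String, Int]

  def receiveAndReply(msg: Any): Any = msg match {
    case RegisterExecutor(id, cores) =>
      freeCores += id -> cores
      true // stands in for context.reply(true)
  }

  def receive(msg: Any): Unit = msg match {
    case LaunchTask(taskId) => println(s"executor asked to run task $taskId")
  }

  println(receiveAndReply(RegisterExecutor("1", 4))) // prints: true
  receive(LaunchTask(0L))                            // prints: executor asked to run task 0
}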
private[spark]
class CoarseGrainedSchedulerBackend(scheduler: TaskSchedulerImpl, val rpcEnv: RpcEnv)
extends ExecutorAllocationClient with SchedulerBackend with Logging {
// Use an atomic variable to track total number of cores in the cluster for simplicity and speed
protected val totalCoreCount = new AtomicInteger(0) // total core count; updated on executor registration and removal
// Total number of executors that are currently registered
protected val totalRegisteredExecutors = new AtomicInteger(0) // number of registered executors; updated on registration and removal
protected val conf = scheduler.sc.conf
private val maxRpcMessageSize = RpcUtils.maxMessageSizeBytes(conf) // 128 MB by default
private val defaultAskTimeout = RpcUtils.askRpcTimeout(conf) // spark.rpc.askTimeout, 120s by default
// Submit tasks only after (registered resources / total expected resources)
// is equal to at least this value, which is a double between 0 and 1.
private val _minRegisteredRatio =
math.min(1, conf.getDouble("spark.scheduler.minRegisteredResourcesRatio", 0))
// Submit tasks after maxRegisteredWaitingTime milliseconds
// if minRegisteredRatio has not yet been reached
private val maxRegisteredWaitingTimeMs = // maximum time to wait for executors to register
conf.getTimeAsMs("spark.scheduler.maxRegisteredResourcesWaitingTime", "30s")
private val createTime = System.currentTimeMillis()
// Accessing `executorDataMap` in `DriverEndpoint.receive/receiveAndReply` doesn't need any
// protection. But accessing `executorDataMap` out of `DriverEndpoint.receive/receiveAndReply`
// must be protected by `CoarseGrainedSchedulerBackend.this`. Besides, `executorDataMap` should
// only be modified in `DriverEndpoint.receive/receiveAndReply` with protection by
// `CoarseGrainedSchedulerBackend.this`.
// maps the executor IDs already registered with the driver to their ExecutorData
private val executorDataMap = new HashMap[String, ExecutorData] // ExecutorData holds the RPC ref for talking to that executor
// Number of executors requested by the cluster manager, [[ExecutorAllocationManager]]
@GuardedBy("CoarseGrainedSchedulerBackend.this")
private var requestedTotalExecutors = 0
// Number of executors requested from the cluster manager that have not registered yet
@GuardedBy("CoarseGrainedSchedulerBackend.this")
private var numPendingExecutors = 0
private val listenerBus = scheduler.sc.listenerBus
// Executors we have requested the cluster manager to kill that have not died yet; maps
// the executor ID to whether it was explicitly killed by the driver (and thus shouldn't
// be considered an app-related failure).
@GuardedBy("CoarseGrainedSchedulerBackend.this")
private val executorsPendingToRemove = new HashMap[String, Boolean]
// A map to store hostname with its possible task number running on it
@GuardedBy("CoarseGrainedSchedulerBackend.this")
protected var hostToLocalTaskCount: Map[String, Int] = Map.empty
// The number of pending tasks which is locality required
@GuardedBy("CoarseGrainedSchedulerBackend.this")
protected var localityAwareTasks = 0
// The num of current max ExecutorId used to re-register appMaster
@volatile protected var currentExecutorIdCounter = 0
private val reviveThread = // single-threaded scheduler that periodically sends ReviveOffers to this backend's DriverEndpoint
ThreadUtils.newDaemonSingleThreadScheduledExecutor("driver-revive-thread")
// the driver's RPC endpoint
class DriverEndpoint(override val rpcEnv: RpcEnv, sparkProperties: Seq[(String, String)])
extends ThreadSafeRpcEndpoint with Logging {
// Executors that have been lost, but for which we don't yet know the real exit reason.
protected val executorsPendingLossReason = new HashSet[String]
// maps each registered executor's RpcAddress to its executor ID
protected val addressToExecutorId = new HashMap[RpcAddress, String]
override def onStart() {
// Periodically revive offers to allow delay scheduling to work
val reviveIntervalMs = conf.getTimeAsMs("spark.scheduler.revive.interval", "1s")
// periodically send ReviveOffers to self; handled by case ReviveOffers in receive below
reviveThread.scheduleAtFixedRate(new Runnable {
override def run(): Unit = Utils.tryLogNonFatalError {
Option(self).foreach(_.send(ReviveOffers))
}
}, 0, reviveIntervalMs, TimeUnit.MILLISECONDS)
}
// one-way messages (no reply expected)
override def receive: PartialFunction[Any, Unit] = {
case StatusUpdate(executorId, taskId, state, data) => // an executor reports a task state change (e.g. RUNNING, FINISHED)
scheduler.statusUpdate(taskId, state, data.value) // update the task state on the driver via TaskSchedulerImpl.statusUpdate
if (TaskState.isFinished(state)) {
executorDataMap.get(executorId) match {
case Some(executorInfo) =>
executorInfo.freeCores += scheduler.CPUS_PER_TASK
makeOffers(executorId)
case None =>
// Ignoring the update since we don't know about the executor.
logWarning(s"Ignored task status update ($taskId state $state) " +
s"from unknown executor with ID $executorId")
}
}
case ReviveOffers => // handles the periodic ReviveOffers sent by reviveThread
makeOffers() // i.e. periodically check for runnable tasks and launch them
// the driver kills a given taskId on a given executorId; this responds to the
// KillTask message sent by the killTask method further down in this class
case KillTask(taskId, executorId, interruptThread, reason) =>
executorDataMap.get(executorId) match {
case Some(executorInfo) =>
executorInfo.executorEndpoint.send(
KillTask(taskId, executorId, interruptThread, reason))
case None =>
// Ignoring the task kill since the executor is not registered.
logWarning(s"Attempted to kill task $taskId for unknown executor $executorId.")
}
case KillExecutorsOnHost(host) => // request to kill all executors on a given host
scheduler.getExecutorsAliveOnHost(host).foreach { exec =>
killExecutors(exec.toSeq, adjustTargetNumExecutors = false, countFailures = false,
force = true) // killExecutors calls doKillExecutors, which YarnSchedulerBackend overrides
// to forward a KillExecutors message to the AM's AMEndpoint, asking YARN to kill those containers
}
case UpdateDelegationTokens(newDelegationTokens) =>
executorDataMap.values.foreach { ed =>
ed.executorEndpoint.send(UpdateDelegationTokens(newDelegationTokens))
}
case RemoveExecutor(executorId, reason) =>
// We will remove the executor's state and cannot restore it. However, the connection
// between the driver and the executor may be still alive so that the executor won't exit
// automatically, so try to tell the executor to stop itself. See SPARK-13519.
executorDataMap.get(executorId).foreach(_.executorEndpoint.send(StopExecutor))
removeExecutor(executorId, reason)
}
// messages that require a reply
override def receiveAndReply(context: RpcCallContext): PartialFunction[Any, Unit] = {
/**
 * Handles the RegisterExecutor message sent from the executor side
 * (CoarseGrainedExecutorBackend's onStart method); note how executorRef
 * (a NettyRpcEndpointRef) gets deserialized on the driver.
 */
case RegisterExecutor(executorId, executorRef, hostname, cores, logUrls) =>
if (executorDataMap.contains(executorId)) { // check whether this executor has already registered
// executorRef can already talk to the executor: during deserialization its NettyRpcEnv
// was swapped for the driver-side one, so the ref is directly usable on the driver
// send a registration-failure message to this executor; handled by case RegisterExecutorFailed
// in CoarseGrainedExecutorBackend.receive (fire-and-forget)
executorRef.send(RegisterExecutorFailed("Duplicate executor ID: " + executorId))
// finally reply to the executor's ask, since this handler is receiveAndReply
context.reply(true)
} else if (scheduler.nodeBlacklist != null &&
scheduler.nodeBlacklist.contains(hostname)) { // node blacklist check
// If the cluster manager gives us an executor on a blacklisted node (because it
// already started allocating those resources before we informed it of our blacklist,
// or if it ignored our blacklist), then we reject that executor immediately.
logInfo(s"Rejecting $executorId as it has been blacklisted.")
executorRef.send(RegisterExecutorFailed(s"Executor is blacklisted: $executorId"))
context.reply(true)
} else {
// If the executor's rpc env is not listening for incoming connections, `hostPort`
// will be null, and the client connection should be used to contact the executor.
val executorAddress = if (executorRef.address != null) {
executorRef.address
} else {
context.senderAddress
}
logInfo(s"Registered executor $executorRef ($executorAddress) with ID $executorId")
addressToExecutorId(executorAddress) = executorId // record the executor's RpcAddress -> ID mapping
totalCoreCount.addAndGet(cores) // update the total core count
totalRegisteredExecutors.addAndGet(1) // update the registered executor count
val data = new ExecutorData(executorRef, executorAddress, hostname,
cores, cores, logUrls) // build the ExecutorData for this executor
// This must be synchronized because variables mutated
// in this block are read when requesting executors
CoarseGrainedSchedulerBackend.this.synchronized {
executorDataMap.put(executorId, data)
if (currentExecutorIdCounter < executorId.toInt) {
currentExecutorIdCounter = executorId.toInt
}
if (numPendingExecutors > 0) {
numPendingExecutors -= 1
logDebug(s"Decremented number of pending executors ($numPendingExecutors left)")
}
}
executorRef.send(RegisteredExecutor) // one-way registration-success message; handled in CoarseGrainedExecutorBackend.receive
// Note: some tests expect the reply to come after we put the executor in the map
// finally reply to the executor's ask, since this handler is receiveAndReply
context.reply(true)
listenerBus.post(
SparkListenerExecutorAdded(System.currentTimeMillis(), executorId, data))
makeOffers()
}
case StopDriver =>
context.reply(true)
stop()
case StopExecutors =>
logInfo("Asking each executor to shut down")
for ((_, executorData) <- executorDataMap) {
executorData.executorEndpoint.send(StopExecutor) // tell each executor to stop
}
context.reply(true)
case RemoveWorker(workerId, host, message) =>
removeWorker(workerId, host, message)
context.reply(true)
// answers the SparkAppConfig fetch in object CoarseGrainedExecutorBackend's run method at executor startup
case RetrieveSparkAppConfig =>
val reply: SparkAppConfig = SparkAppConfig(
sparkProperties,
SparkEnv.get.securityManager.getIOEncryptionKey(),
fetchHadoopDelegationTokens())
context.reply(reply)
}
// Make fake resource offers on all executors
// collects the free cores of all live executors; this is the entry point for task dispatch
private def makeOffers() {
// Make sure no executor is killed while some task is launching on it
val taskDescs: Seq[Seq[TaskDescription]] = withLock {
// Filter out executors under killing
val activeExecutors: collection.Map[String, ExecutorData] = executorDataMap.filterKeys(executorIsAlive) // keep only live executors
val workOffers = activeExecutors.map { // build WorkerOffers from the live executors' free cores
case (id, executorData) =>
new WorkerOffer(id, executorData.executorHost, executorData.freeCores)
}.toIndexedSeq
scheduler.resourceOffers(workOffers) // hand the offers to TaskSchedulerImpl.resourceOffers, which returns the TaskDescriptions to run
}
if (!taskDescs.isEmpty) { // if any tasks were scheduled, launch them
launchTasks(taskDescs) // tell the target executors to run these TaskDescriptions
}
}
override def onDisconnected(remoteAddress: RpcAddress): Unit = {
addressToExecutorId
.get(remoteAddress)
.foreach(removeExecutor(_, SlaveLost("Remote RPC client disassociated. Likely due to " +
"containers exceeding thresholds, or network issues. Check driver logs for WARN " +
"messages.")))
}
// Make fake resource offers on just one executor
private def makeOffers(executorId: String) {
// Make sure no executor is killed while some task is launching on it
val taskDescs = withLock {
// Filter out executors under killing
if (executorIsAlive(executorId)) {
val executorData = executorDataMap(executorId)
val workOffers = IndexedSeq(
new WorkerOffer(executorId, executorData.executorHost, executorData.freeCores))
scheduler.resourceOffers(workOffers)
} else {
Seq.empty
}
}
if (!taskDescs.isEmpty) {
launchTasks(taskDescs)
}
}
private def executorIsAlive(executorId: String): Boolean = synchronized {
!executorsPendingToRemove.contains(executorId) &&
!executorsPendingLossReason.contains(executorId)
}
// Launch tasks returned by a set of resource offers
// tell the target executors to run these TaskDescriptions
private def launchTasks(tasks: Seq[Seq[TaskDescription]]) {
for (task <- tasks.flatten) { // flatten the Seq[Seq[TaskDescription]]
val serializedTask = TaskDescription.encode(task) // serialize the TaskDescription
if (serializedTask.limit() >= maxRpcMessageSize) { // if it exceeds the RPC message size limit, abort the task set
Option(scheduler.taskIdToTaskSetManager.get(task.taskId)).foreach { taskSetMgr =>
try {
var msg = "Serialized task %s:%d was %d bytes, which exceeds max allowed: " +
"spark.rpc.message.maxSize (%d bytes). Consider increasing " +
"spark.rpc.message.maxSize or using broadcast variables for large values."
msg = msg.format(task.taskId, task.index, serializedTask.limit(), maxRpcMessageSize)
taskSetMgr.abort(msg)
} catch {
case e: Exception => logError("Exception in error callback", e)
}
}
}
else {
val executorData = executorDataMap(task.executorId) // look up the target executor's ExecutorData
executorData.freeCores -= scheduler.CPUS_PER_TASK // decrement the target executor's free cores
logDebug(s"Launching task ${task.taskId} on executor id: ${task.executorId} hostname: " +
s"${executorData.executorHost}.")
executorData.executorEndpoint.send(LaunchTask(new SerializableBuffer(serializedTask))) // tell this executor to run the task via a LaunchTask message
}
}
}
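// Worked example: with the default spark.rpc.message.maxSize = 128 (MB), a task whose
// encoded TaskDescription reaches 128 * 1024 * 1024 bytes aborts its TaskSetManager;
// anything smaller is shipped to the executor inside a LaunchTask message.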
// Remove a disconnected slave from the cluster
private def removeExecutor(executorId: String, reason: ExecutorLossReason): Unit = {
logDebug(s"Asked to remove executor $executorId with reason $reason")
executorDataMap.get(executorId) match {
case Some(executorInfo) =>
// This must be synchronized because variables mutated
// in this block are read when requesting executors
val killed = CoarseGrainedSchedulerBackend.this.synchronized {
addressToExecutorId -= executorInfo.executorAddress
executorDataMap -= executorId
executorsPendingLossReason -= executorId
executorsPendingToRemove.remove(executorId).getOrElse(false)
}
totalCoreCount.addAndGet(-executorInfo.totalCores)
totalRegisteredExecutors.addAndGet(-1)
scheduler.executorLost(executorId, if (killed) ExecutorKilled else reason)
listenerBus.post(
SparkListenerExecutorRemoved(System.currentTimeMillis(), executorId, reason.toString))
case None =>
// SPARK-15262: If an executor is still alive even after the scheduler has removed
// its metadata, we may receive a heartbeat from that executor and tell its block
// manager to reregister itself. If that happens, the block manager master will know
// about the executor, but the scheduler will not. Therefore, we should remove the
// executor from the block manager when we hit this case.
scheduler.sc.env.blockManager.master.removeExecutorAsync(executorId)
logInfo(s"Asked to remove non-existent executor $executorId")
}
}
// Remove a lost worker from the cluster
private def removeWorker(workerId: String, host: String, message: String): Unit = {
logDebug(s"Asked to remove worker $workerId with reason $message")
scheduler.workerRemoved(workerId, host, message)
}
/**
* Stop making resource offers for the given executor. The executor is marked as lost with
* the loss reason still pending.
*
* @return Whether executor should be disabled
*/
protected def disableExecutor(executorId: String): Boolean = {
val shouldDisable = CoarseGrainedSchedulerBackend.this.synchronized {
if (executorIsAlive(executorId)) {
executorsPendingLossReason += executorId
true
} else {
// Returns true for explicitly killed executors, we also need to get pending loss reasons;
// For others return false.
executorsPendingToRemove.contains(executorId)
}
}
if (shouldDisable) {
logInfo(s"Disabling executor $executorId.")
scheduler.executorLost(executorId, LossReasonPending)
}
shouldDisable
}
}
var driverEndpoint: RpcEndpointRef = null // the endpoint registered under the name "CoarseGrainedScheduler"
protected def minRegisteredRatio: Double = _minRegisteredRatio // defaults to 0
override def start() { // ultimately invoked from SparkContext via _taskScheduler.start()
val properties = new ArrayBuffer[(String, String)]
for ((key, value) <- scheduler.sc.conf.getAll) {
if (key.startsWith("spark.")) {
properties += ((key, value))
}
}
// TODO (prashant) send conf instead of properties
// the driver registers the DriverEndpoint under the name "CoarseGrainedScheduler"
driverEndpoint = createDriverEndpointRef(properties)
}
// the driver registers the DriverEndpoint under the name "CoarseGrainedScheduler"
protected def createDriverEndpointRef(
properties: ArrayBuffer[(String, String)]): RpcEndpointRef = {
//ENDPOINT_NAME = CoarseGrainedScheduler
// executors look up this ref in object CoarseGrainedExecutorBackend's run method
rpcEnv.setupEndpoint(ENDPOINT_NAME, createDriverEndpoint(properties))
}
// instantiate the inner DriverEndpoint class
protected def createDriverEndpoint(properties: Seq[(String, String)]): DriverEndpoint = {
new DriverEndpoint(rpcEnv, properties)
}
// stop all executors via the driver's registered driverEndpoint (DriverEndpoint)
def stopExecutors() {
try {
if (driverEndpoint != null) { // send StopExecutors to the driverEndpoint
logInfo("Shutting down all executors")
driverEndpoint.askSync[Boolean](StopExecutors) // handled in the inner DriverEndpoint's receiveAndReply
}
} catch {
case e: Exception =>
throw new SparkException("Error asking standalone scheduler to shut down executors", e)
}
}
override def stop() {
reviveThread.shutdownNow()
stopExecutors()
try {
if (driverEndpoint != null) {
driverEndpoint.askSync[Boolean](StopDriver)
}
} catch {
case e: Exception =>
throw new SparkException("Error stopping standalone scheduler's driver endpoint", e)
}
}
/**
* Reset the state of CoarseGrainedSchedulerBackend to the initial state. Currently it will only
* be called in the yarn-client mode when AM re-registers after a failure.
* */
protected def reset(): Unit = {
val executors: Set[String] = synchronized {
requestedTotalExecutors = 0
numPendingExecutors = 0
executorsPendingToRemove.clear()
executorDataMap.keys.toSet
}
// Remove all the lingering executors that should be removed but not yet. The reason might be
// because (1) disconnected event is not yet received; (2) executors die silently.
executors.foreach { eid =>
removeExecutor(eid, SlaveLost("Stale executor after cluster manager re-registered."))
}
}
// called by TaskScheduler's periodic job when it finds speculatable tasks to run,
// and also after TaskScheduler submits tasks
override def reviveOffers() {
driverEndpoint.send(ReviveOffers) // the DriverEndpoint then resolves the TaskDescriptions and tells executors to run them
}
// the driver sends a KillTask message, handled by the DriverEndpoint's receive
// called from TaskSchedulerImpl's cancelTasks and killTaskAttempt methods to kill a task on an executor
override def killTask(
taskId: Long, executorId: String, interruptThread: Boolean, reason: String) {
driverEndpoint.send(KillTask(taskId, executorId, interruptThread, reason))
}
override def defaultParallelism(): Int = {
conf.getInt("spark.default.parallelism", math.max(totalCoreCount.get(), 2))
}
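// Worked example: with spark.default.parallelism unset and 3 registered executors of
// 4 cores each, totalCoreCount = 12, so defaultParallelism = max(12, 2) = 12.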
/**
* Called by subclasses when notified of a lost worker. It just fires the message and returns
* at once.
*/
// used by the reset method in this class
protected def removeExecutor(executorId: String, reason: ExecutorLossReason): Unit = {
driverEndpoint.send(RemoveExecutor(executorId, reason))
}
protected def removeWorker(workerId: String, host: String, message: String): Unit = {
driverEndpoint.ask[Boolean](RemoveWorker(workerId, host, message)).failed.foreach(t =>
logError(t.getMessage, t))(ThreadUtils.sameThread)
}
def sufficientResourcesRegistered(): Boolean = true
// ready once the number of registered executors reaches spark-submit --num-executors * 0.8,
// or once maxRegisteredWaitingTimeMs has elapsed; used by TaskSchedulerImpl to decide whether this backend is ready
override def isReady(): Boolean = {
if (sufficientResourcesRegistered) { // overridden in the subclass YarnSchedulerBackend: checks whether registered executors have reached --num-executors * 0.8
logInfo("SchedulerBackend is ready for scheduling beginning after " +
s"reached minRegisteredResourcesRatio: $minRegisteredRatio")
return true
}
if ((System.currentTimeMillis() - createTime) >= maxRegisteredWaitingTimeMs) {
logInfo("SchedulerBackend is ready for scheduling beginning after waiting " +
s"maxRegisteredResourcesWaitingTime: $maxRegisteredWaitingTimeMs(ms)")
return true
}
false
}
/**
* Return the number of executors currently registered with this backend.
*/
private def numExistingExecutors: Int = executorDataMap.size
override def getExecutorIds(): Seq[String] = {
executorDataMap.keySet.toSeq
}
/**
* Request an additional number of executors from the cluster manager.
* @return whether the request is acknowledged.
*/
// request additional executors; used by SparkContext
final override def requestExecutors(numAdditionalExecutors: Int): Boolean = {
if (numAdditionalExecutors < 0) {
throw new IllegalArgumentException(
"Attempted to request a negative number of additional executor(s) " +
s"$numAdditionalExecutors from the cluster manager. Please specify a positive number!")
}
logInfo(s"Requesting $numAdditionalExecutors additional executor(s) from the cluster manager")
val response = synchronized {
requestedTotalExecutors += numAdditionalExecutors
numPendingExecutors += numAdditionalExecutors
logDebug(s"Number of pending executors is now $numPendingExecutors")
if (requestedTotalExecutors !=
(numExistingExecutors + numPendingExecutors - executorsPendingToRemove.size)) {
logDebug(
s"""requestExecutors($numAdditionalExecutors): Executor request doesn't match:
|requestedTotalExecutors = $requestedTotalExecutors
|numExistingExecutors = $numExistingExecutors
|numPendingExecutors = $numPendingExecutors
|executorsPendingToRemove = ${executorsPendingToRemove.size}""".stripMargin)
}
// Account for executors pending to be added or removed
doRequestTotalExecutors(requestedTotalExecutors)
}
defaultAskTimeout.awaitResult(response)
}
/**
* Update the cluster manager on our scheduling needs. Three bits of information are included
* to help it make decisions.
* @param numExecutors The total number of executors we'd like to have. The cluster manager
* shouldn't kill any running executor to reach this number, but,
* if all existing executors were to die, this is the number of executors
* we'd want to be allocated.
* @param localityAwareTasks The number of tasks in all active stages that have a locality
* preferences. This includes running, pending, and completed tasks.
* @param hostToLocalTaskCount A map of hosts to the number of tasks from all active stages
* that would like to run on that host.
* This includes running, pending, and completed tasks.
* @return whether the request is acknowledged by the cluster manager.
*/
// request a total number of executors from YARN
final override def requestTotalExecutors(
numExecutors: Int,
localityAwareTasks: Int,
hostToLocalTaskCount: Map[String, Int]
): Boolean = {
if (numExecutors < 0) {
throw new IllegalArgumentException(
"Attempted to request a negative number of executor(s) " +
s"$numExecutors from the cluster manager. Please specify a positive number!")
}
val response = synchronized {
this.requestedTotalExecutors = numExecutors
this.localityAwareTasks = localityAwareTasks
this.hostToLocalTaskCount = hostToLocalTaskCount
numPendingExecutors =
math.max(numExecutors - numExistingExecutors + executorsPendingToRemove.size, 0)
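// Worked example: asking for numExecutors = 10 with 6 registered executors and 1 executor
// pending removal gives numPendingExecutors = max(10 - 6 + 1, 0) = 5.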
doRequestTotalExecutors(numExecutors)
}
defaultAskTimeout.awaitResult(response)
}
/**
* Request executors from the cluster manager by specifying the total number desired,
* including existing pending and running executors.
*
* The semantics here guarantee that we do not over-allocate executors for this application,
* since a later request overrides the value of any prior request. The alternative interface
* of requesting a delta of executors risks double counting new executors when there are
* insufficient resources to satisfy the first request. We make the assumption here that the
* cluster manager will eventually fulfill all requests when resources free up.
*
* @return a future whose evaluation indicates whether the request is acknowledged.
*/
protected def doRequestTotalExecutors(requestedTotal: Int): Future[Boolean] =
Future.successful(false)
/**
* Request that the cluster manager kill the specified executors.
*
* @param executorIds identifiers of executors to kill
* @param adjustTargetNumExecutors whether the target number of executors be adjusted down
* after these executors have been killed
* @param countFailures if there are tasks running on the executors when they are killed, whether
* those failures be counted to task failure limits?
* @param force whether to force kill busy executors, default false
* @return the ids of the executors acknowledged by the cluster manager to be removed.
*/
final override def killExecutors(
executorIds: Seq[String],
adjustTargetNumExecutors: Boolean,
countFailures: Boolean,
force: Boolean): Seq[String] = {
logInfo(s"Requesting to kill executor(s) ${executorIds.mkString(", ")}")
val response = withLock {
val (knownExecutors, unknownExecutors) = executorIds.partition(executorDataMap.contains)
unknownExecutors.foreach { id =>
logWarning(s"Executor to kill $id does not exist!")
}
// If an executor is already pending to be removed, do not kill it again (SPARK-9795)
// If this executor is busy, do not kill it unless we are told to force kill it (SPARK-9552)
val executorsToKill = knownExecutors
.filter { id => !executorsPendingToRemove.contains(id) }
.filter { id => force || !scheduler.isExecutorBusy(id) }
executorsToKill.foreach { id => executorsPendingToRemove(id) = !countFailures }
logInfo(s"Actual list of executor(s) to be killed is ${executorsToKill.mkString(", ")}")
// If we do not wish to replace the executors we kill, sync the target number of executors
// with the cluster manager to avoid allocating new ones. When computing the new target,
// take into account executors that are pending to be added or removed.
val adjustTotalExecutors =
if (adjustTargetNumExecutors) {
requestedTotalExecutors = math.max(requestedTotalExecutors - executorsToKill.size, 0)
if (requestedTotalExecutors !=
(numExistingExecutors + numPendingExecutors - executorsPendingToRemove.size)) {
logDebug(
s"""killExecutors($executorIds, $adjustTargetNumExecutors, $countFailures, $force):
|Executor counts do not match:
|requestedTotalExecutors = $requestedTotalExecutors
|numExistingExecutors = $numExistingExecutors
|numPendingExecutors = $numPendingExecutors
|executorsPendingToRemove = ${executorsPendingToRemove.size}""".stripMargin)
}
doRequestTotalExecutors(requestedTotalExecutors)
} else {
numPendingExecutors += knownExecutors.size
Future.successful(true)
}
val killExecutors: Boolean => Future[Boolean] =
if (!executorsToKill.isEmpty) {
_ => doKillExecutors(executorsToKill)
} else {
_ => Future.successful(false)
}
val killResponse = adjustTotalExecutors.flatMap(killExecutors)(ThreadUtils.sameThread)
killResponse.flatMap(killSuccessful =>
Future.successful (if (killSuccessful) executorsToKill else Seq.empty[String])
)(ThreadUtils.sameThread)
}
defaultAskTimeout.awaitResult(response)
}
/**
* Kill the given list of executors through the cluster manager.
* @return whether the kill request is acknowledged.
*/
protected def doKillExecutors(executorIds: Seq[String]): Future[Boolean] =
Future.successful(false)
/**
* Request that the cluster manager kill all executors on a given host.
* @return whether the kill request is acknowledged.
*/
final override def killExecutorsOnHost(host: String): Boolean = {
logInfo(s"Requesting to kill any and all executors on host ${host}")
// A potential race exists if a new executor attempts to register on a host
// that is on the blacklist and is no longer valid. To avoid this race,
// all executor registration and killing happens in the event loop. This way, either
// an executor will fail to register, or will be killed when all executors on a host
// are killed.
// Kill all the executors on this host in an event loop to ensure serialization.
driverEndpoint.send(KillExecutorsOnHost(host))
true
}
protected def fetchHadoopDelegationTokens(): Option[Array[Byte]] = { None }
// SPARK-27112: We need to ensure that there is ordering of lock acquisition
// between TaskSchedulerImpl and CoarseGrainedSchedulerBackend objects in order to fix
// the deadlock issue exposed in SPARK-27112
private def withLock[T](fn: => T): T = scheduler.synchronized {
CoarseGrainedSchedulerBackend.this.synchronized { fn }
}
}
private[spark] object CoarseGrainedSchedulerBackend {
val ENDPOINT_NAME = "CoarseGrainedScheduler"
}
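As a closing reference, a minimal SparkConf sketch collecting the configuration keys this walkthrough touched; the values are illustrative examples, not recommendations, and SchedulerConfSketch is a made-up name:
import org.apache.spark.SparkConf

object SchedulerConfSketch extends App {
  val conf = new SparkConf()
    .set("spark.scheduler.minRegisteredResourcesRatio", "0.8")       // executor-ready threshold
    .set("spark.scheduler.maxRegisteredResourcesWaitingTime", "30s") // or start after this wait
    .set("spark.rpc.message.maxSize", "128")                         // MB; cap on serialized tasks
    .set("spark.default.parallelism", "8")                           // fallback parallelism
  conf.getAll.foreach { case (k, v) => println(s"$k = $v") }
}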