概述
本文基于spark-yarn 2.4与yarn3.2分析registerApplicationMaster的整个流程
spark-yarn部分
spark-yarn运行driver
private def runDriver(): Unit = {
addAmIpFilter(None)
userClassThread = startUserApplication()
// This a bit hacky, but we need to wait until the spark.driver.port property has
// been set by the Thread executing the user class.
logInfo("Waiting for spark context initialization...")
val totalWaitTime = sparkConf.get(AM_MAX_WAIT_TIME)
try {
val sc = ThreadUtils.awaitResult(sparkContextPromise.future,
Duration(totalWaitTime, TimeUnit.MILLISECONDS))
if (sc != null) {
rpcEnv = sc.env.rpcEnv
val userConf = sc.getConf
val host = userConf.get("spark.driver.host")
val port = userConf.get("spark.driver.port").toInt
//注册ApplicationMaster
registerAM(host, port, userConf, sc.ui.map(_.webUrl))
val driverRef = rpcEnv.setupEndpointRef(
RpcAddress(host, port),
YarnSchedulerBackend.ENDPOINT_NAME)
createAllocator(driverRef, userConf)
} else {
// Sanity check; should never happen in normal operation, since sc should only be null
// if the user app did not create a SparkContext.
throw new IllegalStateException("User did not initialize spark context!")
}
resumeDriver()
userClassThread.join()
} catch {
case e: SparkException if e.getCause().isInstanceOf[TimeoutException] =>
logError(
s"SparkContext did not initialize after waiting for $totalWaitTime ms. " +
"Please check earlier log output for errors. Failing the application.")
finish(FinalApplicationStatus.FAILED,
ApplicationMaster.EXIT_SC_NOT_INITED,
"Timed out waiting for SparkContext.")
} finally {
resumeDriver()
}
}
driver注册ApplicationMaster
private val client = doAsUser { new YarnRMClient() }
private def registerAM(
host: String,
port: Int,
_sparkConf: SparkConf,
uiAddress: Option[String]): Unit = {
val appId = client.getAttemptId().getApplicationId().toString()
val attemptId = client.getAttemptId().getAttemptId().toString()
val historyAddress = ApplicationMaster
.getHistoryServerAddress(_sparkConf, yarnConf, appId, attemptId)
client.register(host, port, yarnConf, _sparkConf, uiAddress, historyAddress)
registered = true
}
YarnRMClient注册ApplicationMaster
/**
* Registers the application master with the RM.
*
* @param driverHost Host name where driver is running.
* @param driverPort Port where driver is listening.
* @param conf The Yarn configuration.
* @param sparkConf The Spark configuration.
* @param uiAddress Address of the SparkUI.
* @param uiHistoryAddress Address of the application on the History Server.
*/
def register(
driverHost: String,
driverPort: Int,
conf: YarnConfiguration,
sparkConf: SparkConf,
uiAddress: Option[String],
uiHistoryAddress: String): Unit = {
//amClient是AMRMClientImpl的实例
amClient = AMRMClient.createAMRMClient()
amClient.init(conf)
amClient.start()
this.uiHistoryAddress = uiHistoryAddress
val trackingUrl = uiAddress.getOrElse {
if (sparkConf.get(ALLOW_HISTORY_SERVER_TRACKING_URL)) uiHistoryAddress else ""
}
logInfo("Registering the ApplicationMaster")
synchronized {
amClient.registerApplicationMaster(driverHost, driverPort, trackingUrl)
registered = true
}
}
hadoop-yarn部分
AMRMClientImpl生成ApplicationMasterProtocol的实例对象 —— ApplicationMasterProtocolPBClientImpl
protected ApplicationMasterProtocol rmClient;
@Override
protected void serviceStart() throws Exception {
final YarnConfiguration conf = new YarnConfiguration(getConfig());
try {
if (rmClient == null) {
//获取conf中的yarn.resourcemanager.scheduler.address
//用以生成ApplicationMasterProtocolPBClientImpl的实例对象
rmClient = ClientRMProxy.createRMProxy(
conf, ApplicationMasterProtocol.class);
}
} catch (IOException e) {
throw new YarnRuntimeException(e);
}
super.serviceStart();
}
AMRMClientImpl根据hostName,hostPort,trackingUrl生成RegisterApplicationMasterRequest,并调用ApplicationMasterProtocolPBClientImpl注册ApplicationMaster
private RegisterApplicationMasterResponse registerApplicationMaster()
throws YarnException, IOException {
//根据hostName,hostPort,trackingUrl生成RegisterApplicationMasterRequest
RegisterApplicationMasterRequest request =
RegisterApplicationMasterRequest.newInstance(this.appHostName,
this.appHostPort, this.appTrackingUrl);
if (!this.placementConstraints.isEmpty()) {
request.setPlacementConstraints(this.placementConstraints);
}
//调用ApplicationMasterProtocolPBClientImpl注册ApplicationMaster
RegisterApplicationMasterResponse response =
rmClient.registerApplicationMaster(request);
synchronized (this) {
lastResponseId = 0;
if (!response.getNMTokensFromPreviousAttempts().isEmpty()) {
populateNMTokens(response.getNMTokensFromPreviousAttempts());
}
this.resourceProfilesMap = response.getResourceProfiles();
List<Container> prevContainers =
response.getContainersFromPreviousAttempts();
AMRMClientUtils.removeFromOutstandingSchedulingRequests(prevContainers,
this.outstandingSchedRequests);
}
return response;
}
ApplicationMasterProtocolPBClientImpl通过ProtobufRpcEngine发起rpc通信
略,此处可转到hadoop-common2.7源码分析之ProtobufRpcEngine(RPC实现)
最终会调用到ResourceManager端的ApplicationMasterService
ApplicationMasterService处理ApplicationMaster的注册
private final AMSProcessingChain amsProcessingChain;
@Override
public RegisterApplicationMasterResponse registerApplicationMaster(
RegisterApplicationMasterRequest request) throws YarnException,
IOException {
AMRMTokenIdentifier amrmTokenIdentifier =
YarnServerSecurityUtils.authorizeRequest();
ApplicationAttemptId applicationAttemptId =
amrmTokenIdentifier.getApplicationAttemptId();
ApplicationId appID = applicationAttemptId.getApplicationId();
AllocateResponseLock lock = responseMap.get(applicationAttemptId);
if (lock == null) {
RMAuditLogger.logFailure(this.rmContext.getRMApps().get(appID).getUser(),
AuditConstants.REGISTER_AM, "Application doesn't exist in cache "
+ applicationAttemptId, "ApplicationMasterService",
"Error in registering application master", appID,
applicationAttemptId);
throwApplicationDoesNotExistInCacheException(applicationAttemptId);
}
// Allow only one thread in AM to do registerApp at a time.
synchronized (lock) {
AllocateResponse lastResponse = lock.getAllocateResponse();
if (hasApplicationMasterRegistered(applicationAttemptId)) {
// allow UAM re-register if work preservation is enabled
ApplicationSubmissionContext appContext =
rmContext.getRMApps().get(appID).getApplicationSubmissionContext();
if (!(appContext.getUnmanagedAM()
&& appContext.getKeepContainersAcrossApplicationAttempts())) {
String message =
AMRMClientUtils.APP_ALREADY_REGISTERED_MESSAGE + appID;
LOG.warn(message);
RMAuditLogger.logFailure(
this.rmContext.getRMApps().get(appID).getUser(),
AuditConstants.REGISTER_AM, "", "ApplicationMasterService",
message, appID, applicationAttemptId);
throw new InvalidApplicationMasterRequestException(message);
}
}
this.amLivelinessMonitor.receivedPing(applicationAttemptId);
// Setting the response id to 0 to identify if the
// application master is register for the respective attemptid
lastResponse.setResponseId(0);
lock.setAllocateResponse(lastResponse);
RegisterApplicationMasterResponse response =
recordFactory.newRecordInstance(
RegisterApplicationMasterResponse.class);
this.amsProcessingChain.registerApplicationMaster(
amrmTokenIdentifier.getApplicationAttemptId(), request, response);
return response;
}
}
AMSProcessingChain通过责任链模式处理ApplicationMaster的注册
责任链上的processor的头节点目前是DefaultAMSProcessor
@Override
public void registerApplicationMaster(
ApplicationAttemptId applicationAttemptId,
RegisterApplicationMasterRequest request,
RegisterApplicationMasterResponse response)
throws IOException, YarnException {
RMApp app = getRmContext().getRMApps().get(
applicationAttemptId.getApplicationId());
LOG.info("AM registration " + applicationAttemptId);
getRmContext().getDispatcher().getEventHandler()
.handle(
new RMAppAttemptRegistrationEvent(applicationAttemptId, request
.getHost(), request.getRpcPort(), request.getTrackingUrl()));
RMAuditLogger.logSuccess(app.getUser(),
RMAuditLogger.AuditConstants.REGISTER_AM,
"ApplicationMasterService", app.getApplicationId(),
applicationAttemptId);
//设置最大可使用资源数量
response.setMaximumResourceCapability(getScheduler()
.getMaximumResourceCapability(app.getQueue()));
//设置资源可访问列表
response.setApplicationACLs(app.getRMAppAttempt(applicationAttemptId)
.getSubmissionContext().getAMContainerSpec().getApplicationACLs());
//设置队列
response.setQueue(app.getQueue());
if (UserGroupInformation.isSecurityEnabled()) {
LOG.info("Setting client token master key");
response.setClientToAMTokenMasterKey(java.nio.ByteBuffer.wrap(
getRmContext().getClientToAMTokenSecretManager()
.getMasterKey(applicationAttemptId).getEncoded()));
}
// For work-preserving AM restart, retrieve previous attempts' containers
// and corresponding NM tokens.
if (app.getApplicationSubmissionContext()
.getKeepContainersAcrossApplicationAttempts()) {
List<Container> transferredContainers = getScheduler()
.getTransferredContainers(applicationAttemptId);
if (!transferredContainers.isEmpty()) {
response.setContainersFromPreviousAttempts(transferredContainers);
// Clear the node set remembered by the secret manager. Necessary
// for UAM restart because we use the same attemptId.
rmContext.getNMTokenSecretManager()
.clearNodeSetForAttempt(applicationAttemptId);
List<NMToken> nmTokens = new ArrayList<NMToken>();
for (Container container : transferredContainers) {
try {
NMToken token = getRmContext().getNMTokenSecretManager()
.createAndGetNMToken(app.getUser(), applicationAttemptId,
container);
if (null != token) {
nmTokens.add(token);
}
} catch (IllegalArgumentException e) {
// if it's a DNS issue, throw UnknowHostException directly and
// that
// will be automatically retried by RMProxy in RPC layer.
if (e.getCause() instanceof UnknownHostException) {
throw (UnknownHostException) e.getCause();
}
}
}
response.setNMTokensFromPreviousAttempts(nmTokens);
LOG.info("Application " + app.getApplicationId() + " retrieved "
+ transferredContainers.size() + " containers from previous"
+ " attempts and " + nmTokens.size() + " NM tokens.");
}
}
response.setSchedulerResourceTypes(getScheduler()
.getSchedulingResourceTypes());
response.setResourceTypes(ResourceUtils.getResourcesTypeInfo());
if (getRmContext().getYarnConfiguration().getBoolean(
YarnConfiguration.RM_RESOURCE_PROFILES_ENABLED,
YarnConfiguration.DEFAULT_RM_RESOURCE_PROFILES_ENABLED)) {
response.setResourceProfiles(
resourceProfilesManager.getResourceProfiles());
}
}