上一篇作业的提交最终流转到RMAppManager手中,RM节点上的ClientRMService对象相当于接待站,而RMAppManager对象则专门管理与作业的申请和运行相关的事务。两个对象均由ResourceManager创建,都在同一个JVM上。ClientRMService是通过调用rmAppManager.submitApplication()把作业申请交到RMAppManager手里的。我们接着从这里切入。
1. 作业状态调度
hadoop-yarn-server-resourcemanager\src\main\java\org\apache\hadoop\yarn\server\resourcemanager\RMAppManager.java
/**
 * Accepts a newly submitted application and kicks off its state machine.
 *
 * <p>An RMAppImpl is created first. With security enabled the application is
 * handed to the delegation-token renewer asynchronously; otherwise a START
 * event is dispatched directly. Any failure while doing so rejects the
 * application and is rethrown to the caller as a remote exception.
 *
 * @param submissionContext the submission context supplied by the client
 * @param submitTime        the submit time passed through to the new RMAppImpl
 * @param user              the submitting user
 * @throws YarnException declared by this method for submission failures
 */
protected void submitApplication(
    ApplicationSubmissionContext submissionContext, long submitTime,
    String user) throws YarnException {
  final ApplicationId appId = submissionContext.getApplicationId();

  // Step 1: create the RMAppImpl that represents this application.
  final RMAppImpl rmApp =
      createAndPopulateNewRMApp(submissionContext, submitTime, user, false);

  // Step 2: security handling, then start the application.
  try {
    final Credentials tokens = parseCredentials(submissionContext);
    if (!UserGroupInformation.isSecurityEnabled()) {
      // The dispatcher has not been started at this point, so a START event
      // enqueued here is guaranteed to be the first one processed once the
      // dispatcher starts. This event drives the RMAppImpl state machine.
      this.rmContext.getDispatcher().getEventHandler()
          .handle(new RMAppEvent(appId, RMAppEventType.START));
    } else {
      this.rmContext.getDelegationTokenRenewer()
          .addApplicationAsync(
              appId,
              tokens,
              submissionContext.getCancelTokensWhenComplete(),
              rmApp.getUser());
    }
  } catch (Exception e) {
    // Something went wrong: reject the application.
    assert rmApp.getState() == RMAppState.NEW;
    this.rmContext.getDispatcher().getEventHandler()
        .handle(new RMAppEvent(
            appId, RMAppEventType.APP_REJECTED, e.getMessage()));
    throw RPCUtil.getRemoteException(e);
  }
}
hadoop-yarn-server-resourcemanager\src\main\java\org\apache\hadoop\yarn\server\resourcemanager\ResourceManager.java
由ApplicationEventDispatcher 转发给RMAppImpl处理
// Excerpt of ResourceManager showing how RMAppEvent instances are routed.
// Only the pieces relevant to RMAppEventType dispatching are reproduced here.
public class ResourceManager extends CompositeService implements Recoverable {
private Dispatcher rmDispatcher; // AsyncDispatcher
// Register the handler that receives events of type RMAppEventType.
rmDispatcher.register(RMAppEventType.class, new ApplicationEventDispatcher(rmContext));
// Although named "Dispatcher", this class is actually an EventHandler:
// it looks up the target RMApp by application id and forwards the event to it.
public static final class ApplicationEventDispatcher implements EventHandler<RMAppEvent> {
// Context used to look up the RMApp an event belongs to.
private final RMContext rmContext;
public ApplicationEventDispatcher(RMContext rmContext) {
this.rmContext = rmContext;
}
// Forwards the event to the RMApp registered under the event's application id.
// If no such RMApp is registered, nothing is done.
@Override
public void handle(RMAppEvent event) {
ApplicationId appID = event.getApplicationId();
RMApp rmApp = this.rmContext.getRMApps().get(appID);
if (rmApp != null) {
try {
rmApp.handle(event); // delegate to the concrete RMApp (RMAppImpl) for processing
} catch (Throwable t) {
......
}
}
}
}
}
hadoop-yarn-server-resourcemanager\src\main\java\org\apache\hadoop\yarn\server\resourcemanager\rmapp\RMAppImpl.java
事件RMAppEventType.START的实际处理者
// RMApp is itself an EventHandler for RMAppEvent, which is why the dispatcher
// can call rmApp.handle(event) directly (shown here with an empty body).
public interface RMApp extends EventHandler<RMAppEvent> {}
// Excerpt of RMAppImpl: the transition rule for START, the event entry point,
// and the transition that runs when START is processed.
public class RMAppImpl implements RMApp, Recoverable {
// State-machine transition rule: on START, move NEW -> NEW_SAVING and run
// RMAppNewlySavingTransition.
.addTransition(RMAppState.NEW, RMAppState.NEW_SAVING,
RMAppEventType.START, new RMAppNewlySavingTransition())
// Entry point for RMAppEvent: feeds the event to the state machine while
// holding the write lock, which is always released in the finally block.
@Override
public void handle(RMAppEvent event) {
this.writeLock.lock();
try {
ApplicationId appID = event.getApplicationId();
// Remember the state before the transition so an invalid transition can be reported against it.
final RMAppState oldState = getState();
try {
// Drive the state machine with this event.
this.stateMachine.doTransition(event.getType(), event);
} catch (InvalidStateTransitionException e) {
......
onInvalidStateTransition(event.getType(), oldState);
}
} finally {
this.writeLock.unlock();
}
}
// The actual work performed for the NEW -> NEW_SAVING transition.
private static final class RMAppNewlySavingTransition extends RMAppTransition {
@Override
public void transition(RMAppImpl app, RMAppEvent event) {
app.rmContext.getStateStore().storeNewApplication(app); // persist the application info via the RMStateStore
}
}
}
resourcemanager\src\main\java\org\apache\hadoop\yarn\server\resourcemanager\recovery\RMStateStore.java
/**
 * Persists application state information. According to the original note,
 * the derived classes include FileSystemRMStateStore, MemoryRMStateStore,
 * NullRMStateStore and ZKRMStateStore.
 */
public abstract class RMStateStore extends AbstractService {

  /**
   * Captures the state of a newly submitted application and hands it to the
   * store's own dispatcher as an RMStateStoreAppEvent.
   *
   * @param app the application whose initial state should be stored
   */
  public void storeNewApplication(RMApp app) {
    final ApplicationSubmissionContext submissionCtx =
        app.getApplicationSubmissionContext();
    assert submissionCtx instanceof ApplicationSubmissionContextPBImpl;

    final ApplicationStateData snapshot = ApplicationStateData.newInstance(
        app.getSubmitTime(),
        app.getStartTime(),
        submissionCtx,
        app.getUser(),
        app.getCallerContext());

    // Per the original note, the resulting event type is
    // RMStateStoreEventType.STORE_APP.
    dispatcher.getEventHandler().handle(new RMStateStoreAppEvent(snapshot));
  }
}
对应的事务处理
// Transition that handles the "store application" event inside RMStateStore:
// it writes the application state and then notifies the application.
// NOTE(review): the try block below has no catch/finally in this excerpt; the
// error-handling part (which presumably may set isFenced) appears to have been
// elided - confirm against the full source.
private static class StoreAppTransition
implements MultipleArcTransition<RMStateStore, RMStateStoreEvent,
RMStateStoreState> {
@Override
public RMStateStoreState transition(RMStateStore store,
RMStateStoreEvent event) {
boolean isFenced = false;
// The event carries the application state captured at submission time.
ApplicationStateData appState = ((RMStateStoreAppEvent) event).getAppState();
ApplicationId appId = appState.getApplicationSubmissionContext().getApplicationId();
try {
store.storeApplicationStateInternal(appId, appState);
// Once the state has been stored, notify the application with APP_NEW_SAVED.
store.notifyApplication(new RMAppEvent(appId, RMAppEventType.APP_NEW_SAVED));
}
// The resulting store state is derived from the isFenced flag.
return finalState(isFenced);
};
}
hadoop-yarn-server-resourcemanager\src\main\java\org\apache\hadoop\yarn\server\resourcemanager\rmapp\RMAppImpl.java
APP_NEW_SAVED事件将驱动状态机由当前的NEW_SAVING状态变为SUBMITTED状态
// Transition rule: on APP_NEW_SAVED, move NEW_SAVING -> SUBMITTED and run
// AddApplicationToSchedulerTransition.
.addTransition(RMAppState.NEW_SAVING, RMAppState.SUBMITTED,
    RMAppEventType.APP_NEW_SAVED, new AddApplicationToSchedulerTransition())
/**
 * Transition action that announces the application to the scheduler and then
 * emits the ATS create event.
 */
private static final class AddApplicationToSchedulerTransition
    extends RMAppTransition {

  @Override
  public void transition(RMAppImpl app, RMAppEvent event) {
    // Build the scheduling event from the app's user and submission context
    // (the trailing flag is passed as false) and hand it to the app's handler.
    final AppAddedSchedulerEvent schedulerEvent =
        new AppAddedSchedulerEvent(app.user, app.submissionContext, false);
    app.handler.handle(schedulerEvent);

    // Send the ATS create event.
    app.sendATSCreateEvent();
  }
}
AppAddedSchedulerEvent事件的事件类型
/**
 * Scheduler event announcing that an application has been added. Its event
 * type is always SchedulerEventType.APP_ADDED.
 */
public class AppAddedSchedulerEvent extends SchedulerEvent {

  /**
   * Creates the event and records all supplied attributes on the instance.
   *
   * @param applicationId   id of the application being added
   * @param queue           queue name carried by the event
   * @param user            user carried by the event
   * @param isAppRecovering recovery flag carried by the event
   * @param reservationID   reservation id carried by the event
   * @param appPriority     priority carried by the event
   */
  public AppAddedSchedulerEvent(ApplicationId applicationId, String queue,
      String user, boolean isAppRecovering, ReservationId reservationID,
      Priority appPriority) {
    // The event type is fixed for this class.
    super(SchedulerEventType.APP_ADDED);

    // Identity of the application and who submitted it.
    this.applicationId = applicationId;
    this.user = user;

    // Placement-related attributes.
    this.queue = queue;
    this.appPriority = appPriority;
    this.reservationID = reservationID;

    // Recovery flag.
    this.isAppRecovering = isAppRecovering;
  }
}
resourcemanager\src\main\java\org\apache\hadoop\yarn\server\resourcemanager\ResourceManager.java
调度转发器 SchedulerEventDispatcher
/**
 * Forwards scheduler events to the ResourceScheduler.
 *
 * <p>{@link #handle(SchedulerEvent)} only enqueues the event; a single thread
 * named "ResourceManager Event Processor" takes events off the queue one at a
 * time and passes them to {@code scheduler.handle(event)}.
 */
@Private
public static class SchedulerEventDispatcher extends AbstractService
    implements EventHandler<SchedulerEvent> {

  /** The scheduler that finally processes every event. */
  private final ResourceScheduler scheduler;

  /** Queue of events waiting to be processed. */
  private final BlockingQueue<SchedulerEvent> eventQueue =
      new LinkedBlockingQueue<SchedulerEvent>();

  /** Last queue size that was logged, used to avoid repeating the same log line. */
  private volatile int lastEventQueueSizeLogged = 0;

  /** The thread that drains {@link #eventQueue}. */
  private final Thread eventProcessor;

  /** Set by {@link #serviceStop()} to make the processing loop exit. */
  private volatile boolean stopped = false;

  /** Read from configuration in {@link #serviceInit(Configuration)}. */
  private boolean shouldExitOnError = false;

  /**
   * Creates the dispatcher and its (not yet started) processing thread.
   *
   * @param scheduler the scheduler that will receive the events
   */
  public SchedulerEventDispatcher(ResourceScheduler scheduler) {
    super(SchedulerEventDispatcher.class.getName());
    this.scheduler = scheduler;
    this.eventProcessor = new Thread(new EventProcessor());
    this.eventProcessor.setName("ResourceManager Event Processor");
  }

  @Override
  protected void serviceInit(Configuration conf) throws Exception {
    this.shouldExitOnError =
        conf.getBoolean(Dispatcher.DISPATCHER_EXIT_ON_ERROR_KEY,
            Dispatcher.DEFAULT_DISPATCHER_EXIT_ON_ERROR);
    super.serviceInit(conf);
  }

  @Override
  protected void serviceStart() throws Exception {
    this.eventProcessor.start();
    super.serviceStart();
  }

  /** Loop body of the processing thread: take one event, hand it to the scheduler. */
  private final class EventProcessor implements Runnable {
    @Override
    public void run() {
      SchedulerEvent event;
      while (!stopped && !Thread.currentThread().isInterrupted()) {
        try {
          event = eventQueue.take(); // take the next event off the queue
        } catch (InterruptedException e) {
          LOG.error("Returning, interrupted : " + e);
          return; // TODO: Kill RM.
        }
        try {
          scheduler.handle(event); // let the scheduler process it
        } catch (Throwable t) {
          ......
        }
      }
    }
  }

  /**
   * Stops the processing thread and waits for it to finish.
   *
   * @throws YarnRuntimeException if the calling thread is interrupted while
   *         waiting; the interrupt flag is restored before throwing
   */
  @Override
  protected void serviceStop() throws Exception {
    this.stopped = true;
    this.eventProcessor.interrupt();
    try {
      this.eventProcessor.join();
    } catch (InterruptedException e) {
      // Keep the interrupt visible to callers further up the stack.
      Thread.currentThread().interrupt();
      throw new YarnRuntimeException(e);
    }
    super.serviceStop();
  }

  /**
   * Enqueues the event for asynchronous processing by the processing thread.
   *
   * <p>If the calling thread is interrupted the event is not enqueued; the
   * interrupt flag is restored so the caller can notice and shut down.
   *
   * @param event the scheduler event to enqueue
   */
  @Override
  public void handle(SchedulerEvent event) {
    try {
      int qSize = eventQueue.size();
      // Log the backlog once each time it reaches a new multiple of 1000.
      if (qSize != 0 && qSize % 1000 == 0
          && lastEventQueueSizeLogged != qSize) {
        lastEventQueueSizeLogged = qSize;
        LOG.info("Size of scheduler event-queue is " + qSize);
      }
      int remCapacity = eventQueue.remainingCapacity();
      if (remCapacity < 1000) {
        LOG.info("Very low remaining capacity on scheduler event queue: "
            + remCapacity);
      }
      this.eventQueue.put(event); // append to the queue
    } catch (InterruptedException e) {
      LOG.info("Interrupted. Trying to exit gracefully.");
      // Do not swallow the interrupt: restore the flag for the calling thread.
      Thread.currentThread().interrupt();
    }
  }
}
下面以FifoScheduler为例,来考察调度器对调度事件的处理
// Excerpt of FifoScheduler: the event entry point and the handling of APP_ADDED.
// NOTE(review): the closing braces of handle() and of the class are not part of
// this excerpt, and several case bodies are elided.
public class FifoScheduler extends AbstractYarnScheduler<FiCaSchedulerApp, FiCaSchedulerNode> implements Configurable {
// Dispatches a scheduler event according to its type.
@Override
public void handle(SchedulerEvent event) {
switch(event.getType()) {
case NODE_ADDED:
......
break;
case NODE_REMOVED:
......
break;
case NODE_RESOURCE_UPDATE:
......
break;
case NODE_UPDATE:
......
break;
case APP_ADDED:
{
// An application was added: unpack the event and register the application.
AppAddedSchedulerEvent appAddedEvent = (AppAddedSchedulerEvent) event;
addApplication(appAddedEvent.getApplicationId(), appAddedEvent.getQueue(), appAddedEvent.getUser(),
appAddedEvent.getIsAppRecovering());
}
break;
}
// Registers the application with the scheduler, records the submission in the
// metrics and, when the application is not recovering, reports APP_ACCEPTED
// back to the RMApp.
@VisibleForTesting
public synchronized void addApplication(ApplicationId applicationId,
String queue, String user, boolean isAppRecovering) {
// In the visible code the queue argument is not used; the application is
// created with DEFAULT_QUEUE.
SchedulerApplication<FiCaSchedulerApp> application = new SchedulerApplication<FiCaSchedulerApp>(DEFAULT_QUEUE, user);
applications.put(applicationId, application);
metrics.submitApp(user);
if (isAppRecovering) {
......
} else {
// Not recovering: tell the RMApp that the application has been accepted.
rmContext.getDispatcher().getEventHandler()
.handle(new RMAppEvent(applicationId, RMAppEventType.APP_ACCEPTED));
}
}
2. 作业发起运行调度
resourcemanager\src\main\java\org\apache\hadoop\yarn\server\resourcemanager\rmapp\RMAppImpl.java
.addTransition(RMAppState.SUBMITTED, RMAppState.ACCEPTED,
RMAppEventType.APP_ACCEPTED, new StartAppAttemptTransition())
/**
 * Transition action that begins a new attempt to launch and run the
 * application.
 */
private static final class StartAppAttemptTransition extends RMAppTransition {

  @Override
  public void transition(RMAppImpl app, RMAppEvent event) {
    // The new attempt is started without transferring state from a previous attempt.
    final boolean transferStateFromPreviousAttempt = false;
    app.createAndStartNewAttempt(transferStateFromPreviousAttempt);
  }
}
/**
 * Creates a new attempt and fires the event that starts it.
 *
 * @param transferStateFromPreviousAttempt flag carried by the start event
 */
private void createAndStartNewAttempt(boolean transferStateFromPreviousAttempt) {
  // Per the original note, this creates a new RMAppAttemptImpl and makes it
  // the current attempt.
  createNewAttempt();

  // Fire the start event for the current attempt.
  final RMAppStartAttemptEvent startEvent = new RMAppStartAttemptEvent(
      currentAttempt.getAppAttemptId(), transferStateFromPreviousAttempt);
  handler.handle(startEvent);
}
resourcemanager\src\main\java\org\apache\hadoop\yarn\server\resourcemanager\ResourceManager.java
Attempt事件转发
// Routes an RMAppAttemptEvent to the RMAppAttempt it belongs to: first the
// owning RMApp is looked up, then the attempt inside that RMApp. If either
// lookup returns null, nothing is done.
@Private
public static final class ApplicationAttemptEventDispatcher implements
EventHandler<RMAppAttemptEvent> {
// Context used to look up the RMApp that owns the attempt.
private final RMContext rmContext;
public ApplicationAttemptEventDispatcher(RMContext rmContext) {
this.rmContext = rmContext;
}
@Override
public void handle(RMAppAttemptEvent event) {
ApplicationAttemptId appAttemptID = event.getApplicationAttemptId();
// Despite its name, appAttemptId holds the ApplicationId of the owning application.
ApplicationId appAttemptId = appAttemptID.getApplicationId();
RMApp rmApp = this.rmContext.getRMApps().get(appAttemptId);
if (rmApp != null) {
RMAppAttempt rmAppAttempt = rmApp.getRMAppAttempt(appAttemptID);
if (rmAppAttempt != null) {
try {
// Hand the event to the attempt itself.
rmAppAttempt.handle(event);
} catch (Throwable t) {
......
}
}
}
}
}
驱动状态机
public class RMAppAttemptImpl implements RMAppAttempt, Recoverable {
//跳变规则
addTransition ( RMAppAttemptState.NEW , RMAppAttemptState.SUBMITTED ,
RMAppAttemptEventType. START , newAttemptStartedTransition ())
public void handle(RMAppAttemptEvent event) {
this.stateMachine.doTransition(event.getType(), event);
}
}
尝试事件调度
/**
 * Transition action executed when an attempt is started: registers the
 * attempt with the ApplicationMasterService, creates the client-to-AM master
 * key when security is enabled, and finally informs the scheduler.
 */
private static final class AttemptStartedTransition extends BaseTransition {

  @Override
  public void transition(RMAppAttemptImpl appAttempt,
      RMAppAttemptEvent event) {

    // Register this attempt with the ApplicationMasterService.
    appAttempt.masterService.registerAppAttempt(
        appAttempt.applicationAttemptId);

    // A client-to-AM master key is only created in secure mode.
    if (UserGroupInformation.isSecurityEnabled()) {
      appAttempt.clientTokenMasterKey = appAttempt.rmContext
          .getClientToAMTokenSecretManager()
          .createMasterKey(appAttempt.applicationAttemptId);
    }

    // Add the attempt to the scheduler and tell it whether state should be
    // transferred from the previous attempt. Per the original note, the event
    // type is SchedulerEventType.APP_ATTEMPT_ADDED and it is handled by
    // FifoScheduler.
    appAttempt.eventHandler.handle(
        new AppAttemptAddedSchedulerEvent(
            appAttempt.applicationAttemptId,
            transferStateFromPreviousAttempt));
  }
}
resourcemanager\src\main\java\org\apache\hadoop\yarn\server\resourcemanager\scheduler\fifo\FifoScheduler.java
// Creates the scheduler-side object (FiCaSchedulerApp) for an application
// attempt, makes it the application's current attempt and records the
// submission in the metrics.
// NOTE(review): the "if" that matches the "else" below is missing from this
// excerpt (presumably a check on isAttemptRecovering) - confirm against the
// full source.
public synchronized void
addApplicationAttempt(ApplicationAttemptId appAttemptId,
boolean transferStateFromPreviousAttempt,
boolean isAttemptRecovering) {
// Look up the scheduler-side application that owns this attempt.
SchedulerApplication<FiCaSchedulerApp> application =
applications.get(appAttemptId.getApplicationId());
String user = application.getUser();
// TODO: Fix store
FiCaSchedulerApp schedulerApp =
new FiCaSchedulerApp(appAttemptId, user, DEFAULT_QUEUE,
activeUsersManager, this.rmContext);
// Optionally carry over state from the application's current (previous) attempt.
if (transferStateFromPreviousAttempt) {
schedulerApp.transferStateFromPreviousAttempt(application
.getCurrentAppAttempt());
}
application.setCurrentAppAttempt(schedulerApp);
metrics.submitAppAttempt(user);
} else {
// Newly added attempt: notify the RMAppAttempt with ATTEMPT_ADDED.
rmContext.getDispatcher().getEventHandler().handle(
new RMAppAttemptEvent(appAttemptId,
RMAppAttemptEventType.ATTEMPT_ADDED));
}
}
hadoop\yarn\server\resourcemanager\rmapp\attempt\RMAppAttemptImpl.java
//跳变规则
.addTransition(RMAppAttemptState.SUBMITTED,
EnumSet.of(RMAppAttemptState.LAUNCHED_UNMANAGED_SAVING,
RMAppAttemptState.SCHEDULED),
RMAppAttemptEventType.ATTEMPT_ADDED,
new ScheduleTransition())
/**
 * Transition taken when ATTEMPT_ADDED arrives.
 *
 * <p>For an unmanaged AM the attempt is stored and the next state is
 * LAUNCHED_UNMANAGED_SAVING. Otherwise a request for exactly one container
 * (for the AM) is submitted to the scheduler and the next state is SCHEDULED.
 */
public static final class ScheduleTransition implements
    MultipleArcTransition<RMAppAttemptImpl, RMAppAttemptEvent, RMAppAttemptState> {

  @Override
  public RMAppAttemptState transition(RMAppAttemptImpl appAttempt,
      RMAppAttemptEvent event) {
    ApplicationSubmissionContext subCtx = appAttempt.submissionContext;

    // Unmanaged AM: nothing to allocate; save state and then go to the
    // LAUNCHED state.
    if (subCtx.getUnmanagedAM()) {
      appAttempt.storeAttempt();
      return RMAppAttemptState.LAUNCHED_UNMANAGED_SAVING;
    }

    // Managed AM: an AM has to be created, so ask for a single container.
    appAttempt.amReq.setNumContainers(1);
    appAttempt.amReq.setPriority(AM_CONTAINER_PRIORITY);
    appAttempt.amReq.setResourceName(ResourceRequest.ANY);
    appAttempt.amReq.setRelaxLocality(true);

    // Refresh the AM blacklist with the current cluster size and fetch the
    // pending blacklist updates.
    appAttempt.getAMBlacklistManager().refreshNodeHostCount(
        appAttempt.scheduler.getNumClusterNodes());
    ResourceBlacklistRequest blacklist =
        appAttempt.getAMBlacklistManager().getBlacklistUpdates();
    if (LOG.isDebugEnabled()) {
      LOG.debug("Using blacklist for AM: additions(" +
          blacklist.getBlacklistAdditions() + ") and removals(" +
          blacklist.getBlacklistRemovals() + ")");
    }

    // Submit the resource request to the scheduler.
    Allocation allocation = appAttempt.scheduler.allocate(
        appAttempt.applicationAttemptId,
        Collections.singletonList(appAttempt.amReq),
        EMPTY_CONTAINER_RELEASE_LIST,
        blacklist.getBlacklistAdditions(),
        blacklist.getBlacklistRemovals(),
        null,
        null);

    // No container is expected to come back from this first call.
    if (allocation != null && allocation.getContainers() != null) {
      assert (allocation.getContainers().size() == 0);
    }

    // New state of the RMAppAttemptImpl state machine.
    return RMAppAttemptState.SCHEDULED;
  }
}
hadoop\yarn\server\resourcemanager\scheduler\fifo\FifoScheduler.java
开始分配容器
/**
 * Records the resource requests of an application attempt and returns what
 * has been allocated to it so far.
 *
 * @param applicationAttemptId the attempt asking for resources
 * @param ask                  the resource requests
 * @param release              containers to be released
 * @param blacklistAdditions   entries to add to the attempt's blacklist
 * @param blacklistRemovals    entries to remove from the attempt's blacklist
 * @param increaseRequests     not used in the visible code
 * @param decreaseRequests     not used in the visible code
 * @return an Allocation holding the newly allocated containers, the headroom
 *         and the updated NM tokens of the attempt
 */
@Override
public Allocation allocate(ApplicationAttemptId applicationAttemptId,
    List<ResourceRequest> ask, List<ContainerId> release,
    List<String> blacklistAdditions, List<String> blacklistRemovals,
    List<UpdateContainerRequest> increaseRequests,
    List<UpdateContainerRequest> decreaseRequests) {

  // The scheduler-side object of the attempt that is asking for resources.
  FiCaSchedulerApp attempt = getApplicationAttempt(applicationAttemptId);

  // Sanity-check and normalize the requests.
  SchedulerUtils.normalizeRequests(
      ask,
      resourceCalculator,
      clusterResource,
      minimumAllocation,
      getMaximumResourceCapability());

  // Release whatever should be released.
  releaseContainers(release, attempt);

  synchronized (attempt) {
    // Only touch the attempt's requests when something was actually asked for.
    if (!ask.isEmpty()) {
      attempt.showRequests();
      // Update application requests
      attempt.updateResourceRequests(ask);
      attempt.showRequests();
    }

    // Update the blacklist.
    attempt.updateBlacklist(blacklistAdditions, blacklistRemovals);

    Resource availableHeadroom = attempt.getHeadroom();
    attempt.setApplicationHeadroomForMetrics(availableHeadroom);

    // The Allocation built here is the result the caller of allocate() receives.
    return new Allocation(
        attempt.pullNewlyAllocatedContainers(),
        availableHeadroom,
        null,
        null,
        null,
        attempt.pullUpdatedNMTokens());
  }
}
RM管理着事件的转发,本篇主要考察了RMAppImpl、RMAppAttemptImpl、FifoScheduler、RMStateStore几者之间的状态机事件转发,最终走到FifoScheduler的容器分配。容器的分配是一个大主题,下次再考察。