ResourceTrackerService 定义了资源的交互
service ResourceTrackerService {
rpc registerNodeManager(RegisterNodeManagerRequestProto) returns (RegisterNodeManagerResponseProto);
rpc nodeHeartbeat(NodeHeartbeatRequestProto) returns (NodeHeartbeatResponseProto);
rpc unRegisterNodeManager(UnRegisterNodeManagerRequestProto) returns (UnRegisterNodeManagerResponseProto);
}
ResourceTrackerService#registerNodeManager
public RegisterNodeManagerResponse registerNodeManager(
RegisterNodeManagerRequest request) throws YarnException,
NodeId nodeId = request.getNodeId();
String host = nodeId.getHost();
int cmPort = nodeId.getPort();
int httpPort = request.getHttpPort();
Resource capability = request.getResource();
String nodeManagerVersion = request.getNMVersion();
Resource physicalResource = request.getPhysicalResource();
RMNode rmNode = new RMNodeImpl(nodeId, rmContext, host, cmPort, httpPort,
resolve(host), capability, nodeManagerVersion, physicalResource);
RMNode oldNode = this.rmContext.getRMNodes().putIfAbsent(nodeId, rmNode);
if (oldNode == null) {
RMNodeStartedEvent startEvent = new RMNodeStartedEvent(nodeId,
request.getNMContainerStatuses(),
request.getRunningApplications());
if (request.getLogAggregationReportsForApps() != null
&& !request.getLogAggregationReportsForApps().isEmpty()) {
if (LOG.isDebugEnabled()) {
LOG.debug("Found the number of previous cached log aggregation "
+ "status from nodemanager:" + nodeId + " is :"
+ request.getLogAggregationReportsForApps().size());
}
startEvent.setLogAggregationReportsForApps(request
.getLogAggregationReportsForApps());
}
this.rmContext.getDispatcher().getEventHandler().handle(
startEvent);
} else {
LOG.info("Reconnect from the node at: " + host);
this.nmLivelinessMonitor.unregister(nodeId);
if (CollectionUtils.isEmpty(request.getRunningApplications())
&& rmNode.getState() != NodeState.DECOMMISSIONING
&& rmNode.getHttpPort() != oldNode.getHttpPort()) {
// Reconnected node differs, so replace old node and start new node
switch (rmNode.getState()) {
case RUNNING:
ClusterMetrics.getMetrics().decrNumActiveNodes();
break;
case UNHEALTHY:
ClusterMetrics.getMetrics().decrNumUnhealthyNMs();
break;
default:
LOG.debug("Unexpected Rmnode state");
}
this.rmContext.getDispatcher().getEventHandler()
.handle(new NodeRemovedSchedulerEvent(rmNode));
this.rmContext.getRMNodes().put(nodeId, rmNode);
this.rmContext.getDispatcher().getEventHandler()
.handle(new RMNodeStartedEvent(nodeId, null, null));
} else {
// Reset heartbeat ID since node just restarted.
oldNode.resetLastNodeHeartBeatResponse();
this.rmContext.getDispatcher().getEventHandler()
.handle(new RMNodeReconnectEvent(nodeId, rmNode,
request.getRunningApplications(),
request.getNMContainerStatuses()));
}
}
// On every node manager register we will be clearing NMToken keys if
// present for any running application.
this.nmTokenSecretManager.removeNodeKey(nodeId);
this.nmLivelinessMonitor.register(nodeId);
// Handle received container status, this should be processed after new
// RMNode inserted
if (!rmContext.isWorkPreservingRecoveryEnabled()) {
if (!request.getNMContainerStatuses().isEmpty()) {
LOG.info("received container statuses on node manager register :"
+ request.getNMContainerStatuses());
for (NMContainerStatus status : request.getNMContainerStatuses()) {
handleNMContainerStatus(status, nodeId);
}
}
}
// Update node's labels to RM's NodeLabelManager.
Set<String> nodeLabels = NodeLabelsUtils.convertToStringSet(
request.getNodeLabels());
if (isDistributedNodeLabelsConf && nodeLabels != null) {
try {
updateNodeLabelsFromNMReport(nodeLabels, nodeId);
response.setAreNodeLabelsAcceptedByRM(true);
} catch (IOException ex) {
// Ensure the exception is captured in the response
response.setDiagnosticsMessage(ex.getMessage());
response.setAreNodeLabelsAcceptedByRM(false);
}
} else if (isDelegatedCentralizedNodeLabelsConf) {
this.rmContext.getRMDelegatedNodeLabelsUpdater().updateNodeLabels(nodeId);
}
AddNodeTransition
当 node 第一次注册时,把 node 放到 rmContext 里,并触发事件 RMNodeStartedEvent
RMNodeStartedEvent startEvent = new RMNodeStartedEvent(nodeId,
request.getNMContainerStatuses(),
request.getRunningApplications());
RMNodeStartedEvent 事件的处理方法是 RMNodeImpl 的静态内部类 AddNodeTransition。
public static class AddNodeTransition implements
SingleArcTransition<RMNodeImpl, RMNodeEvent> {
@Override
public void transition(RMNodeImpl rmNode, RMNodeEvent event) {
// Inform the scheduler
RMNodeStartedEvent startEvent = (RMNodeStartedEvent) event;
List<NMContainerStatus> containers = null;
NodeId nodeId = rmNode.nodeId;
RMNode previousRMNode =
rmNode.context.getInactiveRMNodes().remove(nodeId);
if (previousRMNode != null) {
rmNode.updateMetricsForRejoinedNode(previousRMNode.getState());
} else {
NodeId unknownNodeId =
NodesListManager.createUnknownNodeId(nodeId.getHost());
previousRMNode =
rmNode.context.getInactiveRMNodes().remove(unknownNodeId);
if (previousRMNode != null) {
ClusterMetrics.getMetrics().decrDecommisionedNMs();
}
// Increment activeNodes explicitly because this is a new node.
ClusterMetrics.getMetrics().incrNumActiveNodes();
containers = startEvent.getNMContainerStatuses();
if (containers != null && !containers.isEmpty()) {
for (NMContainerStatus container : containers) {
if (container.getContainerState() == ContainerState.RUNNING) {
rmNode.launchedContainers.add(container.getContainerId());
}
}
}
}
if (null != startEvent.getRunningApplications()) {
for (ApplicationId appId : startEvent.getRunningApplications()) {
handleRunningAppOnNode(rmNode, rmNode.context, appId, rmNode.nodeId);
}
}
rmNode.context.getDispatcher().getEventHandler()
.handle(new NodeAddedSchedulerEvent(rmNode, containers));
rmNode.context.getDispatcher().getEventHandler().handle(
new NodesListManagerEvent(
NodesListManagerEventType.NODE_USABLE, rmNode));
List<LogAggregationReport> logAggregationReportsForApps =
startEvent.getLogAggregationReportsForApps();
if (logAggregationReportsForApps != null
&& !logAggregationReportsForApps.isEmpty()) {
rmNode.handleLogAggregationStatus(logAggregationReportsForApps);
}
}
}
CapacityScheduler
如果当前调度器是 CapacityScheduler,CapacityScheduler 本身是一个 handler。
public void handle(SchedulerEvent event) {
switch(event.getType()) {
case NODE_ADDED:
{
NodeAddedSchedulerEvent nodeAddedEvent = (NodeAddedSchedulerEvent)event;
addNode(nodeAddedEvent.getAddedRMNode());
recoverContainersOnNode(nodeAddedEvent.getContainerReports(),
nodeAddedEvent.getAddedRMNode());
}
break;
// Omit other lines
}
}
addNode
private void addNode(RMNode nodeManager) {
try {
writeLock.lock();
FiCaSchedulerNode schedulerNode = new FiCaSchedulerNode(nodeManager,
usePortForNodeName, nodeManager.getNodeLabels());
nodeTracker.addNode(schedulerNode);
// update this node to node label manager
if (labelManager != null) {
labelManager.activateNode(nodeManager.getNodeID(),
schedulerNode.getTotalResource());
}
Resource clusterResource = getClusterResource();
getRootQueue().updateClusterResource(clusterResource,
new ResourceLimits(clusterResource));
LOG.info(
"Added node " + nodeManager.getNodeAddress() + " clusterResource: "
+ clusterResource);
if (scheduleAsynchronously && getNumClusterNodes() == 1) {
for (AsyncScheduleThread t : asyncSchedulerThreads) {
t.beginSchedule();
}
}
} finally {
writeLock.unlock();
}
}
Heart beat 的流程
ResourceTrackerService#nodeHeartbeat
@Override
public NodeHeartbeatResponse nodeHeartbeat(NodeHeartbeatRequest request)
throws YarnException, IOException {
NodeStatus remoteNodeStatus = request.getNodeStatus();
/**
* Here is the node heartbeat sequence...
* 1. Check if it's a valid (i.e. not excluded) node
* 2. Check if it's a registered node
* 3. Check if it's a 'fresh' heartbeat i.e. not duplicate heartbeat
* 4. Send healthStatus to RMNode
* 5. Update node's labels if distributed Node Labels configuration is enabled
*/
NodeId nodeId = remoteNodeStatus.getNodeId();
// 1. Check if it's a valid (i.e. not excluded) node, if not, see if it is
// in decommissioning.
if (!this.nodesListManager.isValidNode(nodeId.getHost())
&& !isNodeInDecommissioning(nodeId)) {
String message =
"Disallowed NodeManager nodeId: " + nodeId + " hostname: "
+ nodeId.getHost();
LOG.info(message);
return YarnServerBuilderUtils.newNodeHeartbeatResponse(
NodeAction.SHUTDOWN, message);
}
// 2. Check if it's a registered node
RMNode rmNode = this.rmContext.getRMNodes().get(nodeId);
if (rmNode == null) {
/* node does not exist */
String message = "Node not found resyncing " + remoteNodeStatus.getNodeId();
LOG.info(message);
return YarnServerBuilderUtils.newNodeHeartbeatResponse(NodeAction.RESYNC,
message);
}
// Send ping
this.nmLivelinessMonitor.receivedPing(nodeId);
this.decommissioningWatcher.update(rmNode, remoteNodeStatus);
// 3. Check if it's a 'fresh' heartbeat i.e. not duplicate heartbeat
NodeHeartbeatResponse lastNodeHeartbeatResponse = rmNode.getLastNodeHeartBeatResponse();
if (getNextResponseId(
remoteNodeStatus.getResponseId()) == lastNodeHeartbeatResponse
.getResponseId()) {
LOG.info("Received duplicate heartbeat from node "
+ rmNode.getNodeAddress()+ " responseId=" + remoteNodeStatus.getResponseId());
return lastNodeHeartbeatResponse;
} else if (remoteNodeStatus.getResponseId() != lastNodeHeartbeatResponse
.getResponseId()) {
String message =
"Too far behind rm response id:"
+ lastNodeHeartbeatResponse.getResponseId() + " nm response id:"
+ remoteNodeStatus.getResponseId();
LOG.info(message);
// TODO: Just sending reboot is not enough. Think more.
this.rmContext.getDispatcher().getEventHandler().handle(
new RMNodeEvent(nodeId, RMNodeEventType.REBOOTING));
return YarnServerBuilderUtils.newNodeHeartbeatResponse(NodeAction.RESYNC,
message);
}
// Evaluate whether a DECOMMISSIONING node is ready to be DECOMMISSIONED.
if (rmNode.getState() == NodeState.DECOMMISSIONING &&
decommissioningWatcher.checkReadyToBeDecommissioned(
rmNode.getNodeID())) {
String message = "DECOMMISSIONING " + nodeId +
" is ready to be decommissioned";
LOG.info(message);
this.rmContext.getDispatcher().getEventHandler().handle(
new RMNodeEvent(nodeId, RMNodeEventType.DECOMMISSION));
this.nmLivelinessMonitor.unregister(nodeId);
return YarnServerBuilderUtils.newNodeHeartbeatResponse(
NodeAction.SHUTDOWN, message);
}
boolean timelineV2Enabled =
YarnConfiguration.timelineServiceV2Enabled(getConfig());
if (timelineV2Enabled) {
// Check & update collectors info from request.
updateAppCollectorsMap(request);
}
// Heartbeat response
NodeHeartbeatResponse nodeHeartBeatResponse =
YarnServerBuilderUtils.newNodeHeartbeatResponse(
getNextResponseId(lastNodeHeartbeatResponse.getResponseId()),
NodeAction.NORMAL, null, null, null, null, nextHeartBeatInterval);
rmNode.setAndUpdateNodeHeartbeatResponse(nodeHeartBeatResponse);
populateKeys(request, nodeHeartBeatResponse);
ConcurrentMap<ApplicationId, ByteBuffer> systemCredentials =
rmContext.getSystemCredentialsForApps();
if (!systemCredentials.isEmpty()) {
nodeHeartBeatResponse.setSystemCredentialsForApps(systemCredentials);
}
if (timelineV2Enabled) {
// Return collectors' map that NM needs to know
setAppCollectorsMapToResponse(rmNode.getRunningApps(),
nodeHeartBeatResponse);
}
// 4. Send status to RMNode, saving the latest response.
RMNodeStatusEvent nodeStatusEvent =
new RMNodeStatusEvent(nodeId, remoteNodeStatus);
if (request.getLogAggregationReportsForApps() != null
&& !request.getLogAggregationReportsForApps().isEmpty()) {
nodeStatusEvent.setLogAggregationReportsForApps(request
.getLogAggregationReportsForApps());
}
this.rmContext.getDispatcher().getEventHandler().handle(nodeStatusEvent);
// 5. Update node's labels to RM's NodeLabelManager.
if (isDistributedNodeLabelsConf && request.getNodeLabels() != null) {
try {
updateNodeLabelsFromNMReport(
NodeLabelsUtils.convertToStringSet(request.getNodeLabels()),
nodeId);
nodeHeartBeatResponse.setAreNodeLabelsAcceptedByRM(true);
} catch (IOException ex) {
//ensure the error message is captured and sent across in response
nodeHeartBeatResponse.setDiagnosticsMessage(ex.getMessage());
nodeHeartBeatResponse.setAreNodeLabelsAcceptedByRM(false);
}
}
// 6. check if node's capacity is load from dynamic-resources.xml
// if so, send updated resource back to NM.
String nid = nodeId.toString();
Resource capability = loadNodeResourceFromDRConfiguration(nid);
// sync back with new resource if not null.
if (capability != null) {
nodeHeartBeatResponse.setResource(capability);
}
// 7. Send Container Queuing Limits back to the Node. This will be used by
// the node to truncate the number of Containers queued for execution.
if (this.rmContext.getNodeManagerQueueLimitCalculator() != null) {
nodeHeartBeatResponse.setContainerQueuingLimit(
this.rmContext.getNodeManagerQueueLimitCalculator()
.createContainerQueuingLimit());
}
return nodeHeartBeatResponse;
}
RMNodeImpl
@Override
public void setAndUpdateNodeHeartbeatResponse(
NodeHeartbeatResponse response) {
this.writeLock.lock();
try {
response.addAllContainersToCleanup(
new ArrayList<ContainerId>(this.containersToClean));
response.addAllApplicationsToCleanup(this.finishedApplications);
response.addContainersToBeRemovedFromNM(
new ArrayList<ContainerId>(this.containersToBeRemovedFromNM));
response.addAllContainersToSignal(this.containersToSignal);
this.completedContainers.removeAll(this.containersToBeRemovedFromNM);
this.containersToClean.clear();
this.finishedApplications.clear();
this.containersToSignal.clear();
this.containersToBeRemovedFromNM.clear();
response.addAllContainersToUpdate(toBeUpdatedContainers.values());
toBeUpdatedContainers.clear();
// NOTE: This is required for backward compatibility.
response.addAllContainersToDecrease(toBeDecreasedContainers.values());
toBeDecreasedContainers.clear();
// Synchronously update the last response in rmNode with updated
// responseId
this.latestNodeHeartBeatResponse = response;
} finally {
this.writeLock.unlock();
}
};
StatusUpdateWhenHealthyTransition 处理 RMNodeStatusEvent
/**
* Status update transition when node is healthy.
*/
public static class StatusUpdateWhenHealthyTransition implements
MultipleArcTransition<RMNodeImpl, RMNodeEvent, NodeState> {
@Override
public NodeState transition(RMNodeImpl rmNode, RMNodeEvent event) {
RMNodeStatusEvent statusEvent = (RMNodeStatusEvent) event;
rmNode.setOpportunisticContainersStatus(
statusEvent.getOpportunisticContainersStatus());
NodeHealthStatus remoteNodeHealthStatus = updateRMNodeFromStatusEvents(
rmNode, statusEvent);
NodeState initialState = rmNode.getState();
boolean isNodeDecommissioning =
initialState.equals(NodeState.DECOMMISSIONING);
if (isNodeDecommissioning) {
List<ApplicationId> keepAliveApps = statusEvent.getKeepAliveAppIds();
if (rmNode.runningApplications.isEmpty() &&
(keepAliveApps == null || keepAliveApps.isEmpty())) {
RMNodeImpl.deactivateNode(rmNode, NodeState.DECOMMISSIONED);
return NodeState.DECOMMISSIONED;
}
}
if (!remoteNodeHealthStatus.getIsNodeHealthy()) {
LOG.info("Node " + rmNode.nodeId +
" reported UNHEALTHY with details: " +
remoteNodeHealthStatus.getHealthReport());
// if a node in decommissioning receives an unhealthy report,
// it will stay in decommissioning.
if (isNodeDecommissioning) {
return NodeState.DECOMMISSIONING;
} else {
reportNodeUnusable(rmNode, NodeState.UNHEALTHY);
return NodeState.UNHEALTHY;
}
}
rmNode.handleContainerStatus(statusEvent.getContainers());
rmNode.handleReportedIncreasedContainers(
statusEvent.getNMReportedIncreasedContainers());
List<LogAggregationReport> logAggregationReportsForApps =
statusEvent.getLogAggregationReportsForApps();
if (logAggregationReportsForApps != null
&& !logAggregationReportsForApps.isEmpty()) {
rmNode.handleLogAggregationStatus(logAggregationReportsForApps);
}
if(rmNode.nextHeartBeat) {
rmNode.nextHeartBeat = false;
rmNode.context.getDispatcher().getEventHandler().handle(
new NodeUpdateSchedulerEvent(rmNode));
}
// Update DTRenewer in secure mode to keep these apps alive. Today this is
// needed for log-aggregation to finish long after the apps are gone.
if (UserGroupInformation.isSecurityEnabled()) {
rmNode.context.getDelegationTokenRenewer().updateKeepAliveApplications(
statusEvent.getKeepAliveAppIds());
}
return initialState;
}
}
private void handleContainerStatus(List<ContainerStatus> containerStatuses) {
// Filter the map to only obtain just launched containers and finished
// containers.
List<ContainerStatus> newlyLaunchedContainers =
new ArrayList<ContainerStatus>();
List<ContainerStatus> newlyCompletedContainers =
new ArrayList<ContainerStatus>();
int numRemoteRunningContainers = 0;
for (ContainerStatus remoteContainer : containerStatuses) {
ContainerId containerId = remoteContainer.getContainerId();
// Don't bother with containers already scheduled for cleanup, or for
// applications already killed. The scheduler doens't need to know any
// more about this container
if (containersToClean.contains(containerId)) {
LOG.info("Container " + containerId + " already scheduled for "
+ "cleanup, no further processing");
continue;
}
ApplicationId containerAppId =
containerId.getApplicationAttemptId().getApplicationId();
if (finishedApplications.contains(containerAppId)) {
LOG.info("Container " + containerId
+ " belongs to an application that is already killed,"
+ " no further processing");
continue;
} else if (!runningApplications.contains(containerAppId)) {
if (LOG.isDebugEnabled()) {
LOG.debug("Container " + containerId
+ " is the first container get launched for application "
+ containerAppId);
}
handleRunningAppOnNode(this, context, containerAppId, nodeId);
}
// Process running containers
if (remoteContainer.getState() == ContainerState.RUNNING) {
++numRemoteRunningContainers;
if (!launchedContainers.contains(containerId)) {
// Just launched container. RM knows about it the first time.
launchedContainers.add(containerId);
newlyLaunchedContainers.add(remoteContainer);
// Unregister from containerAllocationExpirer.
containerAllocationExpirer
.unregister(new AllocationExpirationInfo(containerId));
}
} else {
// A finished container
launchedContainers.remove(containerId);
if (completedContainers.add(containerId)) {
newlyCompletedContainers.add(remoteContainer);
}
// Unregister from containerAllocationExpirer.
containerAllocationExpirer
.unregister(new AllocationExpirationInfo(containerId));
}
}
List<ContainerStatus> lostContainers =
findLostContainers(numRemoteRunningContainers, containerStatuses);
for (ContainerStatus remoteContainer : lostContainers) {
ContainerId containerId = remoteContainer.getContainerId();
if (completedContainers.add(containerId)) {
newlyCompletedContainers.add(remoteContainer);
}
}
if (newlyLaunchedContainers.size() != 0
|| newlyCompletedContainers.size() != 0) {
nodeUpdateQueue.add(new UpdatedContainerInfo(newlyLaunchedContainers,
newlyCompletedContainers));
}
}
if(rmNode.nextHeartBeat) {
rmNode.nextHeartBeat = false;
rmNode.context.getDispatcher().getEventHandler().handle(
new NodeUpdateSchedulerEvent(rmNode));
}