存储类的属性
// Key layout of the per-container records in the state store. The store
// methods in this file build each key as
// CONTAINERS_KEY_PREFIX + <container id> + <one of the suffixes below>.
private static final String CONTAINERS_KEY_PREFIX =
"ContainerManager/containers/";
// Record holding the serialized StartContainerRequest.
private static final String CONTAINER_REQUEST_KEY_SUFFIX = "/request";
// Record holding the container's diagnostics text.
private static final String CONTAINER_DIAGS_KEY_SUFFIX = "/diagnostics";
// Marker record (empty value) written when the container is launched.
private static final String CONTAINER_LAUNCHED_KEY_SUFFIX = "/launched";
// Marker record (empty value) written when the container is killed.
private static final String CONTAINER_KILLED_KEY_SUFFIX = "/killed";
// Record holding the exit code, stored as decimal text.
private static final String CONTAINER_EXIT_CODE_KEY_SUFFIX = "/exitcode";
// Prefix for NM token records; its use is not shown in this excerpt.
private static final String NM_TOKENS_KEY_PREFIX = "NMTokens/";
加载的方法NMLeveldbStateStoreService
loadContainersState —— 目的是返回一个 RecoveredContainerState(rcs)列表:每个 rcs 对象都由之前存入 db 的信息填充而成;缺少 startRequest 的容器不会被返回,而是从 db 中移除
/**
 * Rebuilds the recovered state of every container recorded in the store.
 *
 * <p>Scans the keys starting at {@code CONTAINERS_KEY_PREFIX}; the container
 * id is the key segment between that prefix and the next {@code '/'}. All
 * records of one container are consumed by {@code loadContainerState}.
 * Containers whose records contain no start request are not returned;
 * they are removed from the store after the scan.
 *
 * @return the recovered states that have a start request
 * @throws IOException if a key has no container-id segment or the database
 *         reports an error
 */
@Override
public List<RecoveredContainerState> loadContainersState()
    throws IOException {
  // Recoverable containers, and containers to purge because their records
  // lack a start request.
  List<RecoveredContainerState> recovered =
      new ArrayList<RecoveredContainerState>();
  List<ContainerId> incomplete = new ArrayList<ContainerId>();
  final int idStart = CONTAINERS_KEY_PREFIX.length();

  LeveldbIterator iter = null;
  try {
    iter = new LeveldbIterator(db);
    iter.seek(bytes(CONTAINERS_KEY_PREFIX));
    while (iter.hasNext()) {
      // Only peek: the per-container loader advances the iterator itself.
      String key = asString(iter.peekNext().getKey());
      if (!key.startsWith(CONTAINERS_KEY_PREFIX)) {
        break;
      }

      int idEnd = key.indexOf('/', idStart);
      if (idEnd < 0) {
        throw new IOException("Unable to determine container in key: " + key);
      }

      ContainerId containerId =
          ConverterUtils.toContainerId(key.substring(idStart, idEnd));
      // Prefix shared by all keys of this container, trailing '/' included.
      String containerKeyPrefix = key.substring(0, idEnd + 1);
      RecoveredContainerState rcs =
          loadContainerState(containerId, iter, containerKeyPrefix);

      // Don't load container without StartContainerRequest
      if (rcs.startRequest == null) {
        incomplete.add(containerId);
      } else {
        recovered.add(rcs);
      }
    }
  } catch (DBException e) {
    throw new IOException(e);
  } finally {
    if (iter != null) {
      iter.close();
    }
  }

  // remove container without StartContainerRequest
  for (ContainerId staleId : incomplete) {
    LOG.warn("Remove container " + staleId +
        " with incomplete records");
    try {
      removeContainer(staleId);
      // TODO: kill and cleanup the leaked container
    } catch (IOException e) {
      LOG.error("Unable to remove container " + staleId +
          " in store", e);
    }
  }

  return recovered;
}
loadContainerState
/**
 * Reads the consecutive records whose keys start with {@code keyPrefix} and
 * folds them into a single {@link RecoveredContainerState}.
 *
 * <p>The state starts as {@code REQUESTED}. A "/launched" record promotes it
 * to {@code LAUNCHED} only while it is still {@code REQUESTED}; an
 * "/exitcode" record always sets {@code COMPLETED}. (Presumably the guard
 * exists because the exit-code key can be visited before the launched key;
 * confirm against the iterator's ordering.)
 *
 * @param containerId id of the container being loaded; not read here
 * @param iter iterator positioned at the container's first record; on return
 *        it points at the first key that does not start with
 *        {@code keyPrefix}
 * @param keyPrefix common prefix of the container's keys, ending with '/'
 * @return the populated recovered state
 * @throws IOException if a key has an unknown suffix, the stored start
 *         request cannot be parsed, or the stored exit code is not a valid
 *         integer
 */
private RecoveredContainerState loadContainerState(ContainerId containerId,
    LeveldbIterator iter, String keyPrefix) throws IOException {
  RecoveredContainerState rcs = new RecoveredContainerState();
  rcs.status = RecoveredContainerStatus.REQUESTED;
  while (iter.hasNext()) {
    Entry<byte[],byte[]> entry = iter.peekNext();
    String key = asString(entry.getKey());
    if (!key.startsWith(keyPrefix)) {
      // First key of the next container (or of another section): leave it
      // unconsumed for the caller.
      break;
    }
    iter.next();

    // One container id maps to several keys that differ only in suffix;
    // keep the prefix's trailing '/' so the suffix matches the constants.
    String suffix = key.substring(keyPrefix.length()-1); // start with '/'
    if (suffix.equals(CONTAINER_REQUEST_KEY_SUFFIX)) {
      rcs.startRequest = new StartContainerRequestPBImpl(
          StartContainerRequestProto.parseFrom(entry.getValue()));
    } else if (suffix.equals(CONTAINER_DIAGS_KEY_SUFFIX)) {
      rcs.diagnostics = asString(entry.getValue());
    } else if (suffix.equals(CONTAINER_LAUNCHED_KEY_SUFFIX)) {
      if (rcs.status == RecoveredContainerStatus.REQUESTED) {
        rcs.status = RecoveredContainerStatus.LAUNCHED;
      }
    } else if (suffix.equals(CONTAINER_KILLED_KEY_SUFFIX)) {
      rcs.killed = true;
    } else if (suffix.equals(CONTAINER_EXIT_CODE_KEY_SUFFIX)) {
      rcs.status = RecoveredContainerStatus.COMPLETED;
      String rawExitCode = asString(entry.getValue());
      try {
        rcs.exitCode = Integer.parseInt(rawExitCode);
      } catch (NumberFormatException e) {
        // Report a corrupt record like every other bad record: as a checked
        // IOException that names the offending key.
        throw new IOException("Invalid exit code '" + rawExitCode
            + "' in container state key: " + key, e);
      }
    } else {
      throw new IOException("Unexpected container state key: " + key);
    }
  }
  return rcs;
}
存储的方法db
storeContainer
/**
 * Persists a container's start request under its "/request" key.
 *
 * @param containerId container the request belongs to
 * @param startRequest request to serialize; must be a
 *        {@code StartContainerRequestPBImpl}
 * @throws IOException if the database rejects the write
 */
@Override
public void storeContainer(ContainerId containerId,
    StartContainerRequest startRequest) throws IOException {
  final String requestKey = CONTAINERS_KEY_PREFIX + containerId.toString()
      + CONTAINER_REQUEST_KEY_SUFFIX;
  try {
    // The stored value is the protobuf form of the whole request.
    StartContainerRequestPBImpl pbRequest =
        (StartContainerRequestPBImpl) startRequest;
    db.put(bytes(requestKey), pbRequest.getProto().toByteArray());
  } catch (DBException e) {
    throw new IOException(e);
  }
}
storeContainerDiagnostics
/**
 * Persists the current diagnostics text of a container under its
 * "/diagnostics" key, replacing any previous value.
 *
 * @param containerId container the diagnostics belong to
 * @param diagnostics buffer whose current content is stored
 * @throws IOException if the database rejects the write
 */
@Override
public void storeContainerDiagnostics(ContainerId containerId,
    StringBuilder diagnostics) throws IOException {
  final String diagnosticsKey = new StringBuilder(CONTAINERS_KEY_PREFIX)
      .append(containerId.toString())
      .append(CONTAINER_DIAGS_KEY_SUFFIX)
      .toString();
  try {
    byte[] rawKey = bytes(diagnosticsKey);
    byte[] rawText = bytes(diagnostics.toString());
    db.put(rawKey, rawText);
  } catch (DBException e) {
    throw new IOException(e);
  }
}
storeContainerLaunched
/**
 * Marks a container as launched by writing its "/launched" key.
 *
 * @param containerId container that was launched
 * @throws IOException if the database rejects the write
 */
@Override
public void storeContainerLaunched(ContainerId containerId)
    throws IOException {
  final String launchedKey = CONTAINERS_KEY_PREFIX + containerId.toString()
      + CONTAINER_LAUNCHED_KEY_SUFFIX;
  try {
    // Only the key's presence matters. Per the author's note, EMPTY_VALUE
    // is declared as: private static final byte[] EMPTY_VALUE = new byte[0];
    byte[] rawKey = bytes(launchedKey);
    db.put(rawKey, EMPTY_VALUE);
  } catch (DBException e) {
    throw new IOException(e);
  }
}
storeContainerKilled
/**
 * Marks a container as killed by writing its "/killed" key with
 * {@code EMPTY_VALUE}.
 *
 * @param containerId container that was asked to stop
 * @throws IOException if the database rejects the write
 */
@Override
public void storeContainerKilled(ContainerId containerId)
    throws IOException {
  final String killedKey = new StringBuilder(CONTAINERS_KEY_PREFIX)
      .append(containerId.toString())
      .append(CONTAINER_KILLED_KEY_SUFFIX)
      .toString();
  try {
    db.put(bytes(killedKey), EMPTY_VALUE);
  } catch (DBException e) {
    throw new IOException(e);
  }
}
storeContainerCompleted
/**
 * Records a container's exit code under its "/exitcode" key. The code is
 * stored as its decimal string form.
 *
 * @param containerId container that completed
 * @param exitCode exit code to record
 * @throws IOException if the database rejects the write
 */
@Override
public void storeContainerCompleted(ContainerId containerId,
    int exitCode) throws IOException {
  final String exitCodeKey = CONTAINERS_KEY_PREFIX + containerId.toString()
      + CONTAINER_EXIT_CODE_KEY_SUFFIX;
  try {
    String exitCodeText = Integer.toString(exitCode);
    db.put(bytes(exitCodeKey), bytes(exitCodeText));
  } catch (DBException e) {
    throw new IOException(e);
  }
}
存储的方法在什么时候被调用
storeContainer调用处 --ContainerManagerImpl
/**
 * Validates a start-container request, registers the container with this
 * NodeManager, persists the request in the state store and dispatches the
 * container-init event.
 *
 * @param nmTokenIdentifier NM token presented with the request
 * @param containerTokenIdentifier container token carrying the container id,
 *        submitter, RM identifier, resource and log-aggregation context
 * @param request the start request; stored verbatim in the state store
 * @throws YarnException if the container comes from a different RM, names an
 *         unknown aux service, is already registered on this node, or the
 *         NodeManager is shutting down
 * @throws IOException declared by the method; the state-store calls
 *         (storeApplication, storeContainer) are the visible sources
 */
@SuppressWarnings("unchecked")
private void startContainerInternal(NMTokenIdentifier nmTokenIdentifier,
ContainerTokenIdentifier containerTokenIdentifier,
StartContainerRequest request) throws YarnException, IOException {
/*
* 1) It should save the NMToken into NMTokenSecretManager. This is done
* here instead of RPC layer because at the time of opening/authenticating
* the connection it doesn't know what all RPC calls user will make on it.
* Also new NMToken is issued only at startContainer (once it gets renewed).
*
* 2) It should validate containerToken. Need to check below things. a) It
* is signed by correct master key (part of retrieve password). b) It
* belongs to correct Node Manager (part of retrieve password). c) It has
* correct RMIdentifier. d) It is not expired.
*/
authorizeStartRequest(nmTokenIdentifier, containerTokenIdentifier);
// Reject containers whose token carries a different RM identifier than the
// one nodeStatusUpdater currently reports.
if (containerTokenIdentifier.getRMIdentifier() != nodeStatusUpdater
.getRMIdentifier()) {
// Is the container coming from unknown RM
StringBuilder sb = new StringBuilder("\nContainer ");
sb.append(containerTokenIdentifier.getContainerID().toString())
.append(" rejected as it is allocated by a previous RM");
throw new InvalidContainerException(sb.toString());
}
// update NMToken
updateNMTokenIdentifier(nmTokenIdentifier);
ContainerId containerId = containerTokenIdentifier.getContainerID();
String containerIdStr = containerId.toString();
String user = containerTokenIdentifier.getApplicationSubmitter();
LOG.info("Start request for " + containerIdStr + " by user " + user);
ContainerLaunchContext launchContext = request.getContainerLaunchContext();
// Every aux service named in the launch context's service data must be
// present in this NodeManager's aux-service metadata.
Map<String, ByteBuffer> serviceData = getAuxServiceMetaData();
if (launchContext.getServiceData()!=null &&
!launchContext.getServiceData().isEmpty()) {
for (Map.Entry<String, ByteBuffer> meta : launchContext.getServiceData()
.entrySet()) {
if (null == serviceData.get(meta.getKey())) {
throw new InvalidAuxServiceException("The auxService:" + meta.getKey()
+ " does not exist");
}
}
}
Credentials credentials = parseCredentials(launchContext);
Container container =
new ContainerImpl(getConfig(), this.dispatcher,
context.getNMStateStore(), launchContext,
credentials, metrics, containerTokenIdentifier);
ApplicationId applicationID =
containerId.getApplicationAttemptId().getApplicationId();
// putIfAbsent returns the previous mapping, so a non-null result means a
// container with this id is already registered on this node.
if (context.getContainers().putIfAbsent(containerId, container) != null) {
NMAuditLogger.logFailure(user, AuditConstants.START_CONTAINER,
"ContainerManagerImpl", "Container already running on this node!",
applicationID, containerId);
throw RPCUtil.getRemoteException("Container " + containerIdStr
+ " already is running on this node!!");
}
// The remaining work runs under the read lock and is refused once
// serviceStopped is set.
this.readLock.lock();
try {
if (!serviceStopped) {
// Create the application
Application application =
new ApplicationImpl(dispatcher, user, applicationID, credentials, context);
// Only the first container of an application on this node stores the
// application record and fires ApplicationInitEvent.
if (null == context.getApplications().putIfAbsent(applicationID,
application)) {
LOG.info("Creating a new application reference for app " + applicationID);
LogAggregationContext logAggregationContext =
containerTokenIdentifier.getLogAggregationContext();
Map<ApplicationAccessType, String> appAcls =
container.getLaunchContext().getApplicationACLs();
context.getNMStateStore().storeApplication(applicationID,
buildAppProto(applicationID, user, credentials, appAcls,
logAggregationContext));
dispatcher.getEventHandler().handle(
new ApplicationInitEvent(applicationID, appAcls,
logAggregationContext));
}
// The start request is written to the state store before the init event
// is dispatched.
this.context.getNMStateStore().storeContainer(containerId, request);
dispatcher.getEventHandler().handle(
new ApplicationContainerInitEvent(container));
this.context.getContainerTokenSecretManager().startContainerSuccessful(
containerTokenIdentifier);
NMAuditLogger.logSuccess(user, AuditConstants.START_CONTAINER,
"ContainerManageImpl", applicationID, containerId);
// TODO launchedContainer misplaced -> doesn't necessarily mean a container
// launch. A finished Application will not launch containers.
metrics.launchedContainer();
metrics.allocateContainer(containerTokenIdentifier.getResource());
} else {
throw new YarnException(
"Container start failed as the NodeManager is " +
"in the process of shutting down");
}
} finally {
this.readLock.unlock();
}
}
storeContainerDiagnostics – ContainerImpl
1
/**
 * Appends the given fragments to this container's diagnostics, trims the
 * buffer to its last {@code diagnosticsMaxSize} characters when it grows
 * beyond that, and writes the result to the state store.
 *
 * <p>A failed store write is logged and otherwise ignored.
 *
 * @param diags fragments to append, in order
 */
private void addDiagnostics(String... diags) {
  for (int i = 0; i < diags.length; i++) {
    this.diagnostics.append(diags[i]);
  }

  final boolean tooLarge = diagnostics.length() > diagnosticsMaxSize;
  if (tooLarge) {
    LOG.warn("Truncate large diagnostic info, containerId: "
        + containerId + " diagnostic info: " + diagnostics);
    // Drop the oldest characters so that only the tail is kept.
    int excess = diagnostics.length() - diagnosticsMaxSize;
    diagnostics.delete(0, excess);
  }

  try {
    stateStore.storeContainerDiagnostics(containerId, diagnostics);
  } catch (IOException e) {
    LOG.warn("Unable to update diagnostics in state store for "
        + containerId, e);
  }
}
2
/**
 * Transition that appends a diagnostics update to the container while
 * staying in the same state.
 */
static class ContainerDiagnosticsUpdateTransition implements
    SingleArcTransition<ContainerImpl, ContainerEvent> {
  @Override
  public void transition(ContainerImpl container, ContainerEvent event) {
    String update =
        ((ContainerDiagnosticsUpdateEvent) event).getDiagnosticsUpdate();
    // Each update is terminated with a newline.
    container.addDiagnostics(update, "\n");

    // NOTE(review): addDiagnostics already writes the diagnostics to the
    // state store, so this second write looks redundant. Confirm before
    // removing it.
    try {
      container.stateStore.storeContainerDiagnostics(container.containerId,
          container.diagnostics);
    } catch (IOException e) {
      LOG.warn("Unable to update state store diagnostics for "
          + container.containerId, e);
    }
  }
}
storeContainerLaunched /storeContainerCompleted --ContainerLaunch
/**
 * Prepares the launch script, tokens file and environment for the
 * container, launches it through the executor and blocks until it exits.
 *
 * <p>For containers of type "docker" a loopback port is reserved up front
 * and exported as {@code DOCKER_PORT}; the reserving socket is closed just
 * before the launch. It is also released on every path that never reaches
 * the launch.
 *
 * <p>The outcome is reported through dispatcher events
 * ({@code CONTAINER_KILLED_ON_REQUEST},
 * {@code CONTAINER_EXITED_WITH_FAILURE} or
 * {@code CONTAINER_EXITED_WITH_SUCCESS}). Once the launch attempt has
 * started, the exit code is also written to the state store.
 *
 * @return 0 if the container was already being killed or exited
 *         successfully, otherwise the exit code that was reported
 */
@Override
@SuppressWarnings("unchecked") // dispatcher not typed
public Integer call() {
  if(containerType != null && containerType.toLowerCase().equals("docker")) {
    try {
      // Port 0 lets the OS pick a free port; it is read back further down.
      this.reservedSocket.bind(new InetSocketAddress("127.0.0.1" , 0));
    } catch (IOException e) {
      LOG.error("Can not get available port", e);
    }
  }
  final ContainerLaunchContext launchContext = container.getLaunchContext();
  Map<Path,List<String>> localResources = null;
  ContainerId containerID = container.getContainerId();
  String containerIdStr = ConverterUtils.toString(containerID);
  final List<String> command = launchContext.getCommands();
  int ret = -1;

  // CONTAINER_KILLED_ON_REQUEST should not be missed if the container
  // is already at KILLING
  if (container.getContainerState() == ContainerState.KILLING) {
    // Nothing will be launched, so give the reserved port back right away.
    releaseReservedSocket(containerID);
    dispatcher.getEventHandler().handle(
        new ContainerExitEvent(containerID,
            ContainerEventType.CONTAINER_KILLED_ON_REQUEST,
            Shell.WINDOWS ? ExitCode.FORCE_KILLED.getExitCode() :
                ExitCode.TERMINATED.getExitCode(),
            "Container terminated before launch."));
    return 0;
  }

  try {
    localResources = container.getLocalizedResources();
    if (localResources == null) {
      throw RPCUtil.getRemoteException(
          "Unable to get local resources when Container " + containerID +
          " is at " + container.getContainerState());
    }

    final String user = container.getUser();
    // /// Variable expansion
    // Before the container script gets written out.
    List<String> newCmds = new ArrayList<String>(command.size());
    String appIdStr = app.getAppId().toString();
    String relativeContainerLogDir = ContainerLaunch
        .getRelativeContainerLogDir(appIdStr, containerIdStr);
    Path containerLogDir =
        dirsHandler.getLogPathForWrite(relativeContainerLogDir, false);
    if (taskLogLimitEnable) {
      // Background monitor for the container log directory; it is stopped
      // in the finally block below.
      logdirMonitor = new Thread(new LogdirMonitor(containerLogDir.toString()));
      logdirMonitor.setName("logdirMonitorThread");
      logdirMonitor.start();
    }
    for (String str : command) {
      // TODO: Should we instead work via symlinks without this grammar?
      newCmds.add(expandEnvironment(str, containerLogDir));
    }
    launchContext.setCommands(newCmds);

    Map<String, String> environment = launchContext.getEnvironment();
    // Make a copy of env to iterate & do variable expansion
    for (Entry<String, String> entry : environment.entrySet()) {
      String value = entry.getValue();
      value = expandEnvironment(value, containerLogDir);
      entry.setValue(value);
    }
    // /// End of variable expansion

    if(containerType != null && containerType.toLowerCase().equals("docker")) {
      int port = reservedSocket.getLocalPort();
      LOG.info("Reserved available port: " + port);
      environment.put("DOCKER_PORT", String.valueOf(port));
      String passwd = RandomStringUtils.randomAlphanumeric(6);
      environment.put("DOCKER_PASSWORD", String.valueOf(passwd));
      environment.put("VPC_HOSTNAME", "vpc-" + containerID.toString().substring(containerID.toString().length() - 3));
    }

    FileContext lfs = FileContext.getLocalFSFileContext();

    Path nmPrivateContainerScriptPath =
        dirsHandler.getLocalPathForWrite(
            getContainerPrivateDir(appIdStr, containerIdStr) + Path.SEPARATOR
                + CONTAINER_SCRIPT);
    Path nmPrivateTokensPath =
        dirsHandler.getLocalPathForWrite(
            getContainerPrivateDir(appIdStr, containerIdStr)
                + Path.SEPARATOR
                + String.format(ContainerLocalizer.TOKEN_FILE_NAME_FMT,
                    containerIdStr));
    Path nmPrivateClasspathJarDir =
        dirsHandler.getLocalPathForWrite(
            getContainerPrivateDir(appIdStr, containerIdStr));
    DataOutputStream containerScriptOutStream = null;
    DataOutputStream tokensOutStream = null;

    // Select the working directory for the container
    Path containerWorkDir =
        dirsHandler.getLocalPathForWrite(ContainerLocalizer.USERCACHE
            + Path.SEPARATOR + user + Path.SEPARATOR
            + ContainerLocalizer.APPCACHE + Path.SEPARATOR + appIdStr
            + Path.SEPARATOR + containerIdStr,
            LocalDirAllocator.SIZE_UNKNOWN, false);

    String pidFileSubpath = getPidFileSubpath(appIdStr, containerIdStr);

    // pid file should be in nm private dir so that it is not
    // accessible by users
    pidFilePath = dirsHandler.getLocalPathForWrite(pidFileSubpath);
    List<String> localDirs = dirsHandler.getLocalDirs();
    List<String> logDirs = dirsHandler.getLogDirs();

    // Every file found under <container dir>/jobConfDir/jobconf_*/ is added
    // to the local resources, linked under its own file name.
    Path localPath = new Path(ContainerLocalizer.USERCACHE + Path.SEPARATOR
        + user + Path.SEPARATOR
        + ContainerLocalizer.APPCACHE + Path.SEPARATOR
        + appIdStr + Path.SEPARATOR
        + containerID + Path.SEPARATOR
        + "jobConfDir");
    Path jobConfDir = null;
    try {
      jobConfDir = dirsHandler.getLocalPathToRead(localPath.toString());
    } catch (IOException e) {
      // A missing jobConfDir is tolerated; the launch continues without it.
      LOG.warn(e.getMessage());
    }
    if (jobConfDir != null) {
      File root = new File(jobConfDir.toString());
      File[] files = root.listFiles();
      if (files != null) {
        for (File file : files) {
          if (file.isDirectory() && file.getName().startsWith("jobconf_")) {
            File[] subFiles = file.listFiles();
            if (subFiles != null) {
              for (File subFile : subFiles) {
                List<String> xmlFileLink = new ArrayList<String>();
                Path xmlFile = new Path(subFile.getPath());
                xmlFileLink.add(subFile.getName());
                localResources.put(xmlFile, xmlFileLink);
              }
            }
          }
        }
      }
    }

    List<String> containerLogDirs = new ArrayList<String>();
    for( String logDir : logDirs) {
      containerLogDirs.add(logDir + Path.SEPARATOR + relativeContainerLogDir);
    }

    if (!dirsHandler.areDisksHealthy()) {
      ret = ContainerExitStatus.DISKS_FAILED;
      throw new IOException("Most of the disks failed. "
          + dirsHandler.getDisksHealthReport(false));
    }

    try {
      // /// Write out the container-script in the nmPrivate space.
      List<Path> appDirs = new ArrayList<Path>(localDirs.size());
      for (String localDir : localDirs) {
        Path usersdir = new Path(localDir, ContainerLocalizer.USERCACHE);
        Path userdir = new Path(usersdir, user);
        Path appsdir = new Path(userdir, ContainerLocalizer.APPCACHE);
        appDirs.add(new Path(appsdir, appIdStr));
      }
      containerScriptOutStream =
          lfs.create(nmPrivateContainerScriptPath,
              EnumSet.of(CREATE, OVERWRITE));

      // Set the token location too.
      environment.put(
          ApplicationConstants.CONTAINER_TOKEN_FILE_ENV_NAME,
          new Path(containerWorkDir,
              FINAL_CONTAINER_TOKENS_FILE).toUri().getPath());
      // Sanitize the container's environment
      sanitizeEnv(environment, containerWorkDir, appDirs, containerLogDirs,
          localResources, nmPrivateClasspathJarDir);

      // Write out the environment
      exec.writeLaunchEnv(containerScriptOutStream, environment, localResources,
          launchContext.getCommands());

      // /// End of writing out container-script

      // /// Write out the container-tokens in the nmPrivate space.
      tokensOutStream =
          lfs.create(nmPrivateTokensPath, EnumSet.of(CREATE, OVERWRITE));
      Credentials creds = container.getCredentials();
      creds.writeTokenStorageToStream(tokensOutStream);
      // /// End of writing out container-tokens
    } finally {
      IOUtils.cleanup(LOG, containerScriptOutStream, tokensOutStream);
    }

    // LaunchContainer is a blocking call. We are here almost means the
    // container is launched, so send out the event.
    dispatcher.getEventHandler().handle(new ContainerEvent(
        containerID,
        ContainerEventType.CONTAINER_LAUNCHED));
    context.getNMStateStore().storeContainerLaunched(containerID);

    // Check if the container is signalled to be killed.
    if (!shouldLaunchContainer.compareAndSet(false, true)) {
      LOG.info("Container " + containerIdStr + " not launched as "
          + "cleanup already called");
      ret = ExitCode.TERMINATED.getExitCode();
    }
    else {
      // Free the reserved port immediately before the launch.
      this.reservedSocket.close();
      exec.activateContainer(containerID, pidFilePath);
      ret = exec.launchContainer(container, nmPrivateContainerScriptPath,
          nmPrivateTokensPath, user, appIdStr, containerWorkDir,
          localDirs, logDirs, taskLogExceeded);
    }
  } catch (Throwable e) {
    LOG.warn("Failed to launch container.", e);
    dirsHandler.activeCheckDirs();
    dispatcher.getEventHandler().handle(new ContainerExitEvent(
        containerID, ContainerEventType.CONTAINER_EXITED_WITH_FAILURE, ret,
        e.getMessage()));
    return ret;
  } finally {
    completed.set(true);
    exec.deactivateContainer(containerID);
    stopped = true;
    // The launch path closes the reserved socket itself. This covers the
    // failure and "cleanup already called" paths, where it stayed open.
    releaseReservedSocket(containerID);
    try {
      if (taskLogLimitEnable && logdirMonitor != null) {
        logdirMonitor.interrupt();
        logdirMonitor.join(THREAD_JOIN_TIMEOUT_MS);
      }
    } catch (InterruptedException e) {
      // As in the original code, the interrupt status is not restored here;
      // the exit code is still stored and events are still dispatched below.
      LOG.warn("Interrupted while waiting for the log directory monitor of "
          + "container " + containerID + " to stop", e);
    }
    try {
      context.getNMStateStore().storeContainerCompleted(containerID, ret);
    } catch (IOException e) {
      LOG.error("Unable to set exit code for container " + containerID, e);
    }
  }

  if (LOG.isDebugEnabled()) {
    LOG.debug("Container " + containerIdStr + " completed with exit code "
        + ret);
  }
  if (ret == ExitCode.FORCE_KILLED.getExitCode()
      || ret == ExitCode.TERMINATED.getExitCode()) {
    // If the process was killed, Send container_cleanedup_after_kill and
    // just break out of this method.
    dispatcher.getEventHandler().handle(
        new ContainerExitEvent(containerID,
            ContainerEventType.CONTAINER_KILLED_ON_REQUEST, ret,
            "Container exited with a non-zero exit code " + ret));
    return ret;
  }

  if (ret != 0) {
    dirsHandler.activeCheckDirs();
    LOG.warn("Container exited with a non-zero exit code " + ret);
    this.dispatcher.getEventHandler().handle(new ContainerExitEvent(
        containerID,
        ContainerEventType.CONTAINER_EXITED_WITH_FAILURE, ret,
        "Container exited with a non-zero exit code " + ret));
    return ret;
  }

  LOG.info("Container " + containerIdStr + " succeeded ");
  dispatcher.getEventHandler().handle(
      new ContainerEvent(containerID,
          ContainerEventType.CONTAINER_EXITED_WITH_SUCCESS));
  return 0;
}

/**
 * Closes the port-reservation socket if it exists and is still open.
 * A failure to close is logged and otherwise ignored.
 */
private void releaseReservedSocket(ContainerId containerID) {
  if (reservedSocket == null || reservedSocket.isClosed()) {
    return;
  }
  try {
    reservedSocket.close();
  } catch (IOException e) {
    LOG.warn("Unable to release reserved port for container "
        + containerID, e);
  }
}
storeContainerCompleted第二处 --RecoveredContainerLaunch
/**
 * Re-attaches to a container that was running before the NodeManager
 * restarted and waits for its exit code.
 *
 * <p>The exit code defaults to {@code ExitCode.LOST} and is replaced only
 * when the pid file is found and the executor reacquires the container.
 * Unless the wait was interrupted, the container is marked completed and
 * its exit code is written to the state store. An exit event (success or
 * failure) is dispatched in every case.
 *
 * @return 0 on success, otherwise the non-zero exit code
 */
@SuppressWarnings("unchecked")
@Override
public Integer call() {
  int retCode = ExitCode.LOST.getExitCode();
  ContainerId containerId = container.getContainerId();
  String appIdStr = ConverterUtils.toString(
      containerId.getApplicationAttemptId().getApplicationId());
  String containerIdStr = ConverterUtils.toString(containerId);

  dispatcher.getEventHandler().handle(new ContainerEvent(containerId,
      ContainerEventType.CONTAINER_LAUNCHED));

  boolean notInterrupted = true;
  try {
    File pidFile = locatePidFile(appIdStr, containerIdStr);
    if (pidFile != null) {
      String pidPathStr = pidFile.getPath();
      pidFilePath = new Path(pidPathStr);
      exec.activateContainer(containerId, pidFilePath);
      retCode = exec.reacquireContainer(container.getUser(), containerId);
    } else {
      LOG.warn("Unable to locate pid file for container " + containerIdStr);
    }
  } catch (IOException e) {
    LOG.error("Unable to recover container " + containerIdStr, e);
  } catch (InterruptedException e) {
    LOG.warn("Interrupted while waiting for exit code from " + containerId);
    notInterrupted = false;
  } finally {
    // An interrupted wait yields no trustworthy exit code, so nothing is
    // recorded in that case.
    if (notInterrupted) {
      this.completed.set(true);
      exec.deactivateContainer(containerId);
      try {
        getContext().getNMStateStore().storeContainerCompleted(containerId,
            retCode);
      } catch (IOException e) {
        // Keep the cause in the log; otherwise a store failure cannot be
        // diagnosed.
        LOG.error("Unable to set exit code for container " + containerId, e);
      }
    }
  }

  if (retCode != 0) {
    LOG.warn("Recovered container exited with a non-zero exit code "
        + retCode);
    this.dispatcher.getEventHandler().handle(new ContainerExitEvent(
        containerId,
        ContainerEventType.CONTAINER_EXITED_WITH_FAILURE, retCode,
        "Container exited with a non-zero exit code " + retCode));
    return retCode;
  }

  LOG.info("Recovered container " + containerId + " succeeded");
  dispatcher.getEventHandler().handle(
      new ContainerEvent(containerId,
          ContainerEventType.CONTAINER_EXITED_WITH_SUCCESS));
  return 0;
}
storeContainerKilled调用处
1 --ContainerLaunch
/**
 * Stops the container's process, if one was started, and removes its pid
 * files.
 *
 * <p>The container is first marked as killed in the state store (a failed
 * write is only logged). If the launch had not started yet, setting
 * {@code shouldLaunchContainer} here prevents it and nothing else is done.
 * Otherwise the process is signalled: TERM followed by a delayed KILL when
 * {@code sleepDelayBeforeSigKill} is positive, KILL directly otherwise.
 * For containers of type "docker", {@code <docker executor> stop
 * <container id>} is run as well.
 *
 * <p>Errors during signalling or the docker stop are logged and reported
 * as a diagnostics update event; they are not rethrown.
 *
 * @throws IOException if deleting the pid file or exit-code file fails
 */
@SuppressWarnings("unchecked") // dispatcher not typed
public void cleanupContainer() throws IOException {
  ContainerId containerId = container.getContainerId();
  String containerIdStr = ConverterUtils.toString(containerId);
  LOG.info("Cleaning up container " + containerIdStr);

  try {
    context.getNMStateStore().storeContainerKilled(containerId);
  } catch (IOException e) {
    LOG.error("Unable to mark container " + containerId
        + " killed in store", e);
  }

  // launch flag will be set to true if process already launched
  boolean alreadyLaunched = !shouldLaunchContainer.compareAndSet(false, true);
  if (!alreadyLaunched) {
    LOG.info("Container " + containerIdStr + " not launched."
        + " No cleanup needed to be done");
    return;
  }

  LOG.debug("Marking container " + containerIdStr + " as inactive");
  // this should ensure that if the container process has not launched
  // by this time, it will never be launched
  exec.deactivateContainer(containerId);

  if (LOG.isDebugEnabled()) {
    LOG.debug("Getting pid for container " + containerIdStr + " to kill"
        + " from pid file "
        + (pidFilePath != null ? pidFilePath.toString() : "null"));
  }

  // however the container process may have already started
  try {
    // get process id from pid file if available
    // else if shell is still active, get it from the shell
    String processId = null;
    if (pidFilePath != null) {
      processId = getContainerPid(pidFilePath);
    }

    // kill process
    if (processId != null) {
      String user = container.getUser();
      LOG.debug("Sending signal to pid " + processId
          + " as user " + user
          + " for container " + containerIdStr);

      final Signal signal = sleepDelayBeforeSigKill > 0
          ? Signal.TERM
          : Signal.KILL;

      boolean result = exec.signalContainer(user, processId, signal);

      LOG.debug("Sent signal " + signal + " to pid " + processId
          + " as user " + user
          + " for container " + containerIdStr
          + ", result=" + (result? "success" : "failed"));

      if (sleepDelayBeforeSigKill > 0) {
        new DelayedProcessKiller(container, user,
            processId, sleepDelayBeforeSigKill, Signal.KILL, exec).start();
      }
    }

    if(containerType !=null && containerType.toLowerCase().equals("docker")) {
      LOG.info("docker begin to stop container.");
      // NOTE(review): the executor path is taken from the container's own
      // launch environment first, i.e. from data supplied with the start
      // request. Confirm that this override is meant to be honoured.
      String dockerExecutor = container.getLaunchContext().getEnvironment().get(YarnConfiguration.NM_DOCKER_CONTAINER_EXECUTOR_EXEC_NAME);
      if(dockerExecutor == null || dockerExecutor.equals("")) {
        dockerExecutor = conf.get(YarnConfiguration.NM_DOCKER_CONTAINER_EXECUTOR_EXEC_NAME,
            YarnConfiguration.NM_DEFAULT_DOCKER_CONTAINER_EXECUTOR_EXEC_NAME);
      }
      if (!new File(dockerExecutor).exists()) {
        throw new IllegalStateException("Invalid docker exec path: " + dockerExecutor);
      }
      String containerIdArg = containerId.toString();
      String command = dockerExecutor + " stop " + containerIdArg;
      LOG.info("command is:" + command);
      // Pass the arguments as a list: Runtime.exec(String) splits the
      // command on whitespace, which breaks an executor path that contains
      // a space.
      final Process stopProcess =
          new ProcessBuilder(dockerExecutor, "stop", containerIdArg).start();
      int code = stopProcess.waitFor();
      LOG.info("docker stoped container, exit code is " + code);
    }
  } catch (Exception e) {
    String message =
        "Exception when trying to cleanup container " + containerIdStr
            + ": " + StringUtils.stringifyException(e);
    LOG.warn(message);
    dispatcher.getEventHandler().handle(
        new ContainerDiagnosticsUpdateEvent(containerId, message));
  } finally {
    // cleanup pid file if present
    if (pidFilePath != null) {
      FileContext lfs = FileContext.getLocalFSFileContext();
      lfs.delete(pidFilePath, false);
      lfs.delete(pidFilePath.suffix(EXIT_CODE_FILE_SUFFIX), false);
    }
  }
}
2 -- ContainerManagerImpl
/**
 * Handles a stop request for a single container.
 *
 * <p>If the container is unknown to this NodeManager, the request is
 * accepted only when the container was recently stopped; otherwise a
 * remote exception is raised. For a known container the method marks it
 * killed in the state store, polls until the container reports an exit
 * code other than {@code ContainerExitStatus.INVALID} or the configured
 * timeout elapses, then dispatches a {@code ContainerKillEvent} and
 * triggers an out-of-band heartbeat.
 *
 * @param nmTokenIdentifier NM token presented with the request
 * @param containerID id of the container to stop
 * @throws YarnException if the container is not handled by this NodeManager
 *         (authorization is delegated to
 *         {@code authorizeGetAndStopContainerRequest})
 * @throws IOException if the state store cannot record the kill
 */
@SuppressWarnings("unchecked")
private void stopContainerInternal(NMTokenIdentifier nmTokenIdentifier,
    ContainerId containerID) throws YarnException, IOException {
  String containerIDStr = containerID.toString();
  Container container = this.context.getContainers().get(containerID);
  LOG.info("Stopping container with container Id: " + containerIDStr);
  authorizeGetAndStopContainerRequest(containerID, container, true,
      nmTokenIdentifier);

  if (container == null) {
    if (!nodeStatusUpdater.isContainerRecentlyStopped(containerID)) {
      throw RPCUtil.getRemoteException("Container " + containerIDStr
          + " is not handled by this NodeManager");
    }
  } else {
    context.getNMStateStore().storeContainerKilled(containerID);
    // NOTE(review): a fresh YarnConfiguration is built on every stop
    // request. Left unchanged here; confirm whether getConfig() could be
    // used instead.
    YarnConfiguration conf = new YarnConfiguration();
    long startTime = System.currentTimeMillis();
    long sleepTime = 0;
    long containerKilledByAmTimeoutInterval = conf
        .getLong(YarnConfiguration.CONTAINER_KILLED_BY_AM_TIMEOUT_MONITOR_INTERVAL_MS,
            YarnConfiguration.DEFAULT_CONTAINER_KILLED_BY_AM_TIMEOUT_MONITOR_INTERVAL_MS);
    long containerKilledByAmTimeout = conf
        .getLong(YarnConfiguration.CONTAINER_KILLED_BY_AM_TIMEOUT_MS,
            YarnConfiguration.DEFAULT_CONTAINER_KILLED_BY_AM_TIMEOUT_MS);
    // Poll the container reference already held. Looking it up in the map
    // again on each iteration would throw a NullPointerException if the
    // entry were removed while this thread is waiting.
    while (container.getExitCode() == ContainerExitStatus.INVALID) {
      sleepTime = System.currentTimeMillis() - startTime;
      if ( sleepTime > containerKilledByAmTimeout) {
        break;
      }
      try {
        Thread.sleep(containerKilledByAmTimeoutInterval);
      } catch (InterruptedException e) {
        // As in the original code, keep waiting until the exit code shows
        // up or the timeout is reached.
        LOG.warn("Interrupted while waiting for exit code of container "
            + containerID, e);
      }
    }
    LOG.info("container " + containerID
        + " time for Waiting exitCode is " + sleepTime
        + "ms, with containerKilledByAmTimeoutInterval " + containerKilledByAmTimeoutInterval
        + "ms containerKilledByAmTimeout " + containerKilledByAmTimeout + "ms");
    dispatcher.getEventHandler().handle(
        new ContainerKillEvent(containerID,
            ContainerExitStatus.KILLED_BY_APPMASTER,
            "Container killed by the ApplicationMaster."));

    NMAuditLogger.logSuccess(container.getUser(),
        AuditConstants.STOP_CONTAINER, "ContainerManageImpl", containerID
            .getApplicationAttemptId().getApplicationId(), containerID);

    // TODO: Move this code to appropriate place once kill_container is
    // implemented.
    nodeStatusUpdater.sendOutofBandHeartBeat();
  }
}
关于launchContainer方法
-
ContainerManagerImpl 的 recoverContainer 方法(在加载容器状态流程的最后被调用),下面是该方法末尾的片段:
if (context.getApplications().containsKey(appId)) { Credentials credentials = parseCredentials(launchContext); Container container = new ContainerImpl(getConfig(), dispatcher, context.getNMStateStore(), req.getContainerLaunchContext(), credentials, metrics, token, rcs.getStatus(), rcs.getExitCode(), rcs.getDiagnostics(), rcs.getKilled()); context.getContainers().put(containerId, container); dispatcher.getEventHandler().handle( //new ContainerImpl之后发送appInitEvent事件,并且将container传入 new ApplicationContainerInitEvent(container)); } else { if (rcs.getStatus() != RecoveredContainerStatus.COMPLETED) { LOG.warn(containerId + " has no corresponding application!"); } LOG.info("Adding " + containerId + " to recently stopped containers"); nodeStatusUpdater.addCompletedContainer(containerId); } //关于这个handle 事件 ,
-
AppInitContainer事件
/** * Event sent from {@link ContainerManagerImpl} to {@link ApplicationImpl} to * request the initialization of a container. This is funneled through * the Application so that the application life-cycle can be checked, and container * launches can be delayed until the application is fully initialized. * * Once the application is initialized, * {@link ApplicationImpl.InitContainerTransition} simply passes this event on as a * {@link ContainerInitEvent}. * */ //可以看出ApplicationImpl 会处理这样一个来自于ContainerManagerImpl的事件---从注释 //我点进来只看到这个事件的定义 ,但是并没有看到处理的过程 ,处理并不是在所谓的super里 //如同上面的dispatcher.getEventHandler.handle(事件)--我现在看见的是事件,真正触发事件的是这个 //handle ,我点进来事件可以看上上面 注释的谁来handle这个事件,这才是分析的思路 ,ApplicationImpl //就是处理者 public class ApplicationContainerInitEvent extends ApplicationEvent { final Container container; public ApplicationContainerInitEvent(Container container) { //此处触发了INIT_CONTAINER事件 super(container.getContainerId().getApplicationAttemptId() .getApplicationId(), ApplicationEventType.INIT_CONTAINER); this.container = container; } Container getContainer() { return container; } }
-
ApplicationImpl 重点handle 方法
@Override public void handle(ApplicationEvent event) { this.writeLock.lock(); try { ApplicationId applicationID = event.getApplicationID(); LOG.debug("Processing " + applicationID + " of type " + event.getType()); ApplicationState oldState = stateMachine.getCurrentState(); ApplicationState newState = null; try { // queue event requesting init of the same app newState = stateMachine.doTransition(event.getType(), event); //init走完 } catch (InvalidStateTransitonException e) { LOG.warn("Can't handle this event at current state", e); } if (oldState != newState) { //打印了这句话 /Application application_1551242356703_0003 transitioned from INITING toRUNNING 如果是running状态 ,就会走下面transition中init_done 到running状态 LOG.info("Application " + applicationID + " transitioned from " + oldState + " to " + newState); } } finally { this.writeLock.unlock(); } }
-
关注一下可以看出ApplicationImpl中 INIT_CONTAINER 事件处理
private static StateMachineFactory<ApplicationImpl, ApplicationState, ApplicationEventType, ApplicationEvent> stateMachineFactory = new StateMachineFactory<ApplicationImpl, ApplicationState, ApplicationEventType, ApplicationEvent>(ApplicationState.NEW) // Transitions from NEW state .addTransition(ApplicationState.NEW, ApplicationState.INITING, ApplicationEventType.INIT_APPLICATION, new AppInitTransition()) .addTransition(ApplicationState.NEW, ApplicationState.NEW, ApplicationEventType.INIT_CONTAINER, new InitContainerTransition()) // 其实一般就是看到INIT_CONTAINER这个事件下面调用了什么方法,此处也就是这个new InitContainerTransition // Transitions from RUNNING state .addTransition(ApplicationState.RUNNING, ApplicationState.RUNNING, ApplicationEventType.INIT_CONTAINER, new InitContainerTransition())
4 关注appImpl中的 new InitContainerTransition
@SuppressWarnings("unchecked") static class InitContainerTransition implements SingleArcTransition<ApplicationImpl, ApplicationEvent> { @Override public void transition(ApplicationImpl app, ApplicationEvent event) { ApplicationContainerInitEvent initEvent = (ApplicationContainerInitEvent) event; //event强转成appInitEvent,从子到父再到子 Container container = initEvent.getContainer(); //传过来的container app.containers.put(container.getContainerId(), container); LOG.info("Adding " + container.getContainerId() + " to application " + app.toString()); switch (app.getApplicationState()) { case RUNNING: //根据日志打印 // DEBUG:case RUNNING: app.dispatcher.getEventHandler().handle(new ContainerInitEvent( // container.getContainerId())); +containerId= container_1551274324796_0004_01_000053 app.dispatcher.getEventHandler().handle(new ContainerInitEvent( container.getContainerId())); break; case INITING: case NEW: // these get queued up and sent out in AppInitDoneTransition break; default: assert false : "Invalid state for InitContainerTransition: " + app.getApplicationState(); } } }
5 ContainerImpl的状态机
// Excerpt of ContainerImpl's state machine definition: only the transition
// out of NEW on INIT_CONTAINER is shown; the rest of the chain and the end of
// the statement are omitted from these notes.
private static StateMachineFactory
    <ContainerImpl, ContainerState, ContainerEventType, ContainerEvent>
    stateMachineFactory =
    new StateMachineFactory<ContainerImpl, ContainerState,
        ContainerEventType, ContainerEvent>(ContainerState.NEW)

        // From NEW State
        .addTransition(ContainerState.NEW,
            EnumSet.of(ContainerState.LOCALIZING,
                ContainerState.LOCALIZED,
                ContainerState.LOCALIZATION_FAILED,
                ContainerState.DONE),
            ContainerEventType.INIT_CONTAINER,
            // resource-request / initialization step for the container
            new RequestResourcesTransition())
6去containerImpl中关注INIT_CONTAINER
/** * State transition when a NEW container receives the INIT_CONTAINER * message. * * If there are resources to localize, sends a * ContainerLocalizationRequest (INIT_CONTAINER_RESOURCES) * to the ResourceLocalizationManager and enters LOCALIZING state. * * If there are no resources to localize, sends LAUNCH_CONTAINER event * and enters LOCALIZED state directly. * * If there are any invalid resources specified, enters LOCALIZATION_FAILED * directly. */ @SuppressWarnings("unchecked") // dispatcher not typed static class RequestResourcesTransition implements MultipleArcTransition<ContainerImpl,ContainerEvent,ContainerState> { @Override public ContainerState transition(ContainerImpl container, ContainerEvent event) { if (container.recoveredStatus == RecoveredContainerStatus.COMPLETED) { //此处开始筛选 completed LOG.info("DEBUG: static class RequestResourcesTransitioncontainer.recoveredStatus == RecoveredContainerStatus.COMPLETED ,,,container.sendFinishedEvents();"); container.sendFinishedEvents(); return ContainerState.DONE; //走到此处方法结束 } else if (container.recoveredAsKilled && //被kill + requested container.recoveredStatus == RecoveredContainerStatus.REQUESTED) { LOG.info("DEBUG:static class RequestResourcesTransitioncontainer.recoveredAsKilled &&\n" + " container.recoveredStatus == RecoveredContainerStatus.REQUESTED,,,container.metrics.killedContainer();"); // container was killed but never launched container.metrics.killedContainer(); NMAuditLogger.logSuccess(container.user, AuditConstants.FINISH_KILLED_CONTAINER, "ContainerImpl", container.containerId.getApplicationAttemptId().getApplicationId(), container.containerId); container.metrics.releaseContainer(container.resource); container.sendFinishedEvents(); return ContainerState.DONE; //走到此处方法结束 } final ContainerLaunchContext ctxt = container.launchContext; container.metrics.initingContainer(); container.dispatcher.getEventHandler().handle(new AuxServicesEvent (AuxServicesEventType.CONTAINER_INIT, container)); // Inform 
the AuxServices about the opaque serviceData Map<String,ByteBuffer> csd = ctxt.getServiceData(); if (csd != null) { // This can happen more than once per Application as each container may // have distinct service data for (Map.Entry<String,ByteBuffer> service : csd.entrySet()) { container.dispatcher.getEventHandler().handle( new AuxServicesEvent(AuxServicesEventType.APPLICATION_INIT, container.user, container.containerId .getApplicationAttemptId().getApplicationId(), service.getKey().toString(), service.getValue())); } } // Send requests for public, private resources Map<String,LocalResource> cntrRsrc = ctxt.getLocalResources(); //被恢复的container 是有这个信息的 if (!cntrRsrc.isEmpty()) { //这个是else 对应if ,如果cntRsrc不是空的 try { //会走进来 for (Map.Entry<String,LocalResource> rsrc : cntrRsrc.entrySet()) { try { LocalResourceRequest req; if (rsrc.getValue().getResource() != null) { req = new LocalResourceRequest(rsrc.getValue()); } else { continue; } List<String> links = container.pendingResources.get(req); if (links == null) { links = new ArrayList<String>(); container.pendingResources.put(req, links); } links.add(rsrc.getKey()); storeSharedCacheUploadPolicy(container, req, rsrc.getValue() .getShouldBeUploadedToSharedCache()); switch (rsrc.getValue().getVisibility()) { case PUBLIC: container.publicRsrcs.add(req); break; case PRIVATE: container.privateRsrcs.add(req); break; case APPLICATION: container.appRsrcs.add(req); break; } } catch (URISyntaxException e) { LOG.info("Got exception parsing " + rsrc.getKey() + " and value " + rsrc.getValue()); throw e; } } } catch (URISyntaxException e) { // malformed resource; abort container launch LOG.warn("Failed to parse resource-request", e); container.cleanup(); container.metrics.endInitingContainer(); return ContainerState.LOCALIZATION_FAILED; } Map<LocalResourceVisibility, Collection<LocalResourceRequest>> req = new LinkedHashMap<LocalResourceVisibility, Collection<LocalResourceRequest>>(); if (!container.publicRsrcs.isEmpty()) { 
req.put(LocalResourceVisibility.PUBLIC, container.publicRsrcs); } if (!container.privateRsrcs.isEmpty()) { req.put(LocalResourceVisibility.PRIVATE, container.privateRsrcs); } if (!container.appRsrcs.isEmpty()) { req.put(LocalResourceVisibility.APPLICATION, container.appRsrcs); } container.dispatcher.getEventHandler().handle( new ContainerLocalizationRequestEvent(container, req)); return ContainerState.LOCALIZING; } else { container.sendLaunchEvent(); //并没有走到此处 container.metrics.endInitingContainer(); return ContainerState.LOCALIZED; } } }
重启时调用launchContainer方法的地方
/**
 * Transition when one of the requested resources for this container
 * has been successfully localized.
 *
 * Stays in LOCALIZING until the last pending resource has arrived; then
 * announces CONTAINER_RESOURCES_LOCALIZED, sends the launch event and moves
 * to LOCALIZED.
 */
static class LocalizedTransition implements
    MultipleArcTransition<ContainerImpl,ContainerEvent,ContainerState> {

  @SuppressWarnings("unchecked")
  @Override
  public ContainerState transition(ContainerImpl container,
      ContainerEvent event) {
    LOG.info("DEBUG: We come to LocalizedTransition ");

    final ContainerResourceLocalizedEvent localized =
        (ContainerResourceLocalizedEvent) event;
    final LocalResourceRequest request = localized.getResource();
    final Path localPath = localized.getLocation();

    final List<String> linkNames = container.pendingResources.remove(request);
    if (linkNames == null) {
      LOG.warn("Localized unknown resource " + request +
          " for container " + container.containerId);
      assert false;
      // fail container?
      return ContainerState.LOCALIZING;
    }
    container.localizedResources.put(localPath, linkNames);

    // check to see if this resource should be uploaded to the shared cache
    // as well
    if (shouldBeUploadedToSharedCache(container, request)) {
      container.resourcesToBeUploaded.put(request, localPath);
    }

    final boolean stillWaiting = !container.pendingResources.isEmpty();
    if (stillWaiting) {
      return ContainerState.LOCALIZING;
    }

    LOG.info("DEBUG: LocalizedTransition: container.dispatcher.getEventHandler().handle(\n" +
        " new ContainerLocalizationEvent(LocalizationEventType.\n" +
        " CONTAINER_RESOURCES_LOCALIZED, container));");
    container.dispatcher.getEventHandler().handle(
        new ContainerLocalizationEvent(LocalizationEventType.
            CONTAINER_RESOURCES_LOCALIZED, container));

    // Author's observation: every container that was in the launched state
    // before the restart passes through here after recovery.
    LOG.info("DEBUG : LocalizedTransition : container.sendLaunchEvent(); ContainerId = "+container.getContainerId() +
        "containerNMStatus = " +container.getNMContainerStatus() + " containerState = " + container.getContainerState());
    container.sendLaunchEvent();
    container.metrics.endInitingContainer();

    // If this is a recovered container that has already launched, skip
    // uploading resources to the shared cache. We do this to avoid uploading
    // the same resources multiple times. The tradeoff is that in the case of
    // a recovered container, there is a chance that resources don't get
    // uploaded into the shared cache. This is OK because resources are not
    // acknowledged by the SCM until they have been uploaded by the node
    // manager.
    final boolean alreadyRanBefore =
        container.recoveredStatus == RecoveredContainerStatus.LAUNCHED
        || container.recoveredStatus == RecoveredContainerStatus.COMPLETED;
    if (!alreadyRanBefore) {
      // kick off uploads to the shared cache
      container.dispatcher.getEventHandler().handle(
          new SharedCacheUploadEvent(container.resourcesToBeUploaded, container
              .getLaunchContext(), container.getUser(),
              SharedCacheUploadEventType.UPLOAD));
    }

    return ContainerState.LOCALIZED;
  }
}
/**
 * Hands this container to the containers launcher: RECOVER_CONTAINER when it
 * was recovered in the LAUNCHED state, LAUNCH_CONTAINER otherwise. Also
 * stamps the launch start time.
 */
@SuppressWarnings("unchecked") // dispatcher not typed
private void sendLaunchEvent() {
  final boolean wasLaunchedBefore =
      recoveredStatus == RecoveredContainerStatus.LAUNCHED;
  if (wasLaunchedBefore) {
    LOG.info("DEBUG: sendLaunchEvent(),,,recoveredStatus == RecoveredContainerStatus.LAUNCHED containerId = "+ containerId);
  }

  containerLaunchStartTime = clock.getTime();

  // try to recover a container that was previously launched
  final ContainersLauncherEventType eventType = wasLaunchedBefore
      ? ContainersLauncherEventType.RECOVER_CONTAINER
      : ContainersLauncherEventType.LAUNCH_CONTAINER;
  dispatcher.getEventHandler().handle(
      new ContainersLauncherEvent(this, eventType));
}
关于 ContainerImpl 中的 sendFinishedEvents()
首先看 sendFinishedEvents 本身
/**
 * Publishes the three "container finished" notifications, in this order:
 * to the application, to the resource monitor, and to the log handler.
 */
private void sendFinishedEvents() {
  @SuppressWarnings("rawtypes")
  EventHandler sink = dispatcher.getEventHandler();

  // 1) Inform the application
  sink.handle(new ApplicationContainerFinishedEvent(containerId));
  LOG.info("DEBUG: eventHandler.handle(new ApplicationContainerFinishedEvent(containerId));: sendFinishEvents(): containerId = "+containerId);

  // 2) Remove the container from the resource-monitor
  sink.handle(new ContainerStopMonitoringEvent(containerId));

  // 3) Tell the logService too
  sink.handle(new LogHandlerContainerFinishedEvent(
      containerId, exitCode));
}
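然后看何处调用了 sendFinishedEvents():它其实就是状态机中的一小步。在 recover 过程中,如果 RequestResourcesTransition 发现 recoveredStatus 是 COMPLETED(或者容器在 REQUESTED 状态下就已经被 kill),就会调用 sendFinishedEvents() 发送完成事件,并直接进入 DONE 状态。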
/**
 * Transition taken when a NEW container receives INIT_CONTAINER.
 *
 * A container recovered as COMPLETED, or recovered as killed while still
 * REQUESTED, only sends its finished events and goes to DONE. Otherwise the
 * aux services are notified and the launch context's local resources are
 * examined: none means launch immediately (LOCALIZED), a malformed one means
 * LOCALIZATION_FAILED, and anything else is sent off for localization
 * (LOCALIZING).
 */
@SuppressWarnings("unchecked") // dispatcher not typed
static class RequestResourcesTransition implements
    MultipleArcTransition<ContainerImpl,ContainerEvent,ContainerState> {

  @Override
  public ContainerState transition(ContainerImpl container,
      ContainerEvent event) {
    if (container.recoveredStatus == RecoveredContainerStatus.COMPLETED) {
      LOG.info("DEBUG: static class RequestResourcesTransitioncontainer.recoveredStatus == RecoveredContainerStatus.COMPLETED ,,,container.sendFinishedEvents();");
      container.sendFinishedEvents();
      return ContainerState.DONE;
    }

    if (container.recoveredAsKilled
        && container.recoveredStatus == RecoveredContainerStatus.REQUESTED) {
      LOG.info("DEBUG:static class RequestResourcesTransitioncontainer.recoveredAsKilled &&\n" +
          " container.recoveredStatus == RecoveredContainerStatus.REQUESTED,,,container.metrics.killedContainer();");
      // container was killed but never launched
      container.metrics.killedContainer();
      NMAuditLogger.logSuccess(container.user,
          AuditConstants.FINISH_KILLED_CONTAINER, "ContainerImpl",
          container.containerId.getApplicationAttemptId().getApplicationId(),
          container.containerId);
      container.metrics.releaseContainer(container.resource);
      container.sendFinishedEvents();
      return ContainerState.DONE;
    }

    final ContainerLaunchContext launchCtx = container.launchContext;
    container.metrics.initingContainer();
    notifyAuxServices(container, launchCtx);

    // Send requests for public, private resources
    final Map<String,LocalResource> wanted = launchCtx.getLocalResources();
    if (wanted.isEmpty()) {
      container.sendLaunchEvent();
      container.metrics.endInitingContainer();
      return ContainerState.LOCALIZED;
    }

    try {
      for (Map.Entry<String,LocalResource> entry : wanted.entrySet()) {
        registerPendingResource(container, entry.getKey(), entry.getValue());
      }
    } catch (URISyntaxException e) {
      // malformed resource; abort container launch
      LOG.warn("Failed to parse resource-request", e);
      container.cleanup();
      container.metrics.endInitingContainer();
      return ContainerState.LOCALIZATION_FAILED;
    }

    // Group what was collected by visibility, leaving out empty groups.
    final Map<LocalResourceVisibility, Collection<LocalResourceRequest>>
        byVisibility = new LinkedHashMap<LocalResourceVisibility,
            Collection<LocalResourceRequest>>();
    if (!container.publicRsrcs.isEmpty()) {
      byVisibility.put(LocalResourceVisibility.PUBLIC, container.publicRsrcs);
    }
    if (!container.privateRsrcs.isEmpty()) {
      byVisibility.put(LocalResourceVisibility.PRIVATE,
          container.privateRsrcs);
    }
    if (!container.appRsrcs.isEmpty()) {
      byVisibility.put(LocalResourceVisibility.APPLICATION,
          container.appRsrcs);
    }
    container.dispatcher.getEventHandler().handle(
        new ContainerLocalizationRequestEvent(container, byVisibility));
    return ContainerState.LOCALIZING;
  }

  /**
   * Sends CONTAINER_INIT to the aux services, followed by one
   * APPLICATION_INIT per entry of the launch context's service data.
   */
  private static void notifyAuxServices(ContainerImpl container,
      ContainerLaunchContext launchCtx) {
    container.dispatcher.getEventHandler().handle(new AuxServicesEvent
        (AuxServicesEventType.CONTAINER_INIT, container));

    // Inform the AuxServices about the opaque serviceData
    final Map<String,ByteBuffer> serviceData = launchCtx.getServiceData();
    if (serviceData == null) {
      return;
    }
    // This can happen more than once per Application as each container may
    // have distinct service data
    for (Map.Entry<String,ByteBuffer> svc : serviceData.entrySet()) {
      container.dispatcher.getEventHandler().handle(
          new AuxServicesEvent(AuxServicesEventType.APPLICATION_INIT,
              container.user, container.containerId
                  .getApplicationAttemptId().getApplicationId(),
              svc.getKey().toString(), svc.getValue()));
    }
  }

  /**
   * Records one requested local resource as pending under the given link
   * name and files it under its visibility. Entries whose resource is null
   * are skipped.
   *
   * @throws URISyntaxException if the resource cannot be turned into a
   *         LocalResourceRequest; logged here and rethrown
   */
  private static void registerPendingResource(ContainerImpl container,
      String linkName, LocalResource resource) throws URISyntaxException {
    if (resource.getResource() == null) {
      return;
    }
    try {
      final LocalResourceRequest request = new LocalResourceRequest(resource);
      List<String> linkNames = container.pendingResources.get(request);
      if (linkNames == null) {
        linkNames = new ArrayList<String>();
        container.pendingResources.put(request, linkNames);
      }
      linkNames.add(linkName);
      storeSharedCacheUploadPolicy(container, request,
          resource.getShouldBeUploadedToSharedCache());
      switch (resource.getVisibility()) {
        case PUBLIC:
          container.publicRsrcs.add(request);
          break;
        case PRIVATE:
          container.privateRsrcs.add(request);
          break;
        case APPLICATION:
          container.appRsrcs.add(request);
          break;
      }
    } catch (URISyntaxException e) {
      LOG.info("Got exception parsing " + linkName
          + " and value " + resource);
      throw e;
    }
  }
}
DEBUG过程
storeContainer中
StartRequest对象
container_launch_context { localResources { key: "job.jar" value { resource { scheme: "hdfs" host: "jlnamenode1v.sys.lyct.qihoo.net" port: 9000 file: "/home/yarn/staging_wxc/yarn/.staging/job_1551148219423_0001/job.jar" } size: 3411 timestamp: 1551148802895 type: PATTERN visibility: APPLICATION pattern: "(?:classes/|lib/).*" } } localResources { key: "job.xml" value { resource { scheme: "hdfs" host: "jlnamenode1v.sys.lyct.qihoo.net" port: 9000 file: "/home/yarn/staging_wxc/yarn/.staging/job_1551148219423_0001/job.xml" } size: 108329 timestamp: 1551148804894 type: FILE visibility: APPLICATION } } tokens: "HDTS\000\001\bJobToken\027\026job_1551148219423_0001\024\\f5\n\273;\253\327f\232\231\2678\020X\241i\203@Z\rmapreduce.job\026job_1551148219423_0001\001\025MapReduceShuffleToken\b<\312\036\244\243\235\200\b" service_data { key: "mapreduce_shuffle" value: "\027\026job_1551148219423_0001\b<\312\036\244\243\235\200\b\rmapreduce.job\026job_1551148219423_0001" } environment { key: "HADOOP_CLIENT_OPTS" value: "-server -Xms5120m -Xmx5120m -XX:PermSize=1g -XX:MaxPermSize=1g " } environment { key: "SHELL" value: "/bin/bash" } environment { key: "HADOOP_CLASSPATH" value: "$PWD:job.jar/job.jar:job.jar/classes/:job.jar/lib/*:$PWD/*" } environment { key: "CLASSPATH" value: "$PWD:$HADOOP_SPINNER_CORE_DIR:$HADOOP_CONF_DIR:$HADOOP_COMMON_HOME/share/hadoop/common/*:$HADOOP_COMMON_HOME/share/hadoop/common/lib/*:$HADOOP_HDFS_HOME/share/hadoop/hdfs/*:$HADOOP_HDFS_HOME/share/hadoop/hdfs/lib/*:$HADOOP_YARN_HOME/share/hadoop/yarn/*:$HADOOP_YARN_HOME/share/hadoop/yarn/lib/*:$HADOOP_MAPRED_HOME/share/hadoop/mapreduce/*:$HADOOP_MAPRED_HOME/share/hadoop/mapreduce/lib/*:job.jar/job.jar:job.jar/classes/:job.jar/lib/*:$PWD/*" } environment { key: "HADOOP_ROOT_LOGGER" value: "INFO,console" } environment { key: "LD_LIBRARY_PATH" value: "$PWD:/home/yarn/software/hadoop/lib/native" } environment { key: "STDOUT_LOGFILE_ENV" value: "<LOG_DIR>/stdout" } environment { key: "STDERR_LOGFILE_ENV" value: 
"<LOG_DIR>/stderr" } command: "$JAVA_HOME/bin/java -Djava.net.preferIPv4Stack=true -Dhadoop.metrics.log.level=WARN -Xmx1024m -Djava.io.tmpdir=$PWD/tmp -Dlog4j.configuration=container-log4j.properties -Dyarn.app.container.log.dir=<LOG_DIR> -Dyarn.app.container.log.filesize=0 -Dhadoop.root.logger=INFO,CLA -Dhadoop.root.logfile=syslog org.apache.hadoop.mapred.YarnChild 10.198.66.225 36018 attempt_1551148219423_0001_m_000013_1000 15 1><LOG_DIR>/stdout 2><LOG_DIR>/stderr " application_ACLs { accessType: APPACCESS_MODIFY_APP acl: " " } application_ACLs { accessType: APPACCESS_VIEW_APP acl: " " } } container_token { identifier: "\n\021\022\r\n\t\b\001\020\237\370\215\275\222-\020\002\030\017\022\02210.198.66.225:8842\032\004yarn\"\a\b\200\f\020\001\030\000(\304\316\346\275\222-0\303\365\357\"8\237\370\215\275\222-B\002\b\024H\221\367\301\275\222-" password: "\ts\261\277\310i\356\301\v\245\234\311\345G\223\017\f\250\345U" kind: "ContainerToken" service: "10.198.66.225:8842" }
AMLauncher -> startContainers(StartContainersRequest) -> startContainerInternal -> storeContainer
打印日志storeContainer
2019-02-26 16:05:11,902 INFO org.apache.hadoop.yarn.server.nodemanager.containermanager.ContainerManagerImpl: Start request for container_1551155033308_0002_01_000002 by user yarn
2019-02-26 16:05:11,903 INFO org.apache.hadoop.yarn.server.nodemanager.recovery.NMLeveldbStateStoreService: DEBUG: storecontainer : KEY= ContainerManager/containers/container_1551155033308_0002_01_000002/request
2019-02-26 16:05:11,903 INFO org.apache.hadoop.yarn.server.nodemanager.containermanager.application.ApplicationImpl: Adding container_1551155033308_0002_01_000002 to application application_1551155033308_0002
2019-02-26 16:05:11,903 INFO org.apache.hadoop.yarn.server.nodemanager.containermanager.container.ContainerImpl: Container container_1551155033308_0002_01_000002 transitioned from NEW to LOCALIZING
2019-02-26 16:05:11,904 INFO org.apache.hadoop.yarn.server.nodemanager.NMAuditLogger: USER=yarn IP=10.198.66.225 OPERATION=Start Container Request TARGET=ContainerManageImpl RESULT=SUCCESS APPID=application_1551155033308_0002 CONTAINERID=container_1551155033308_0002_01_000002
2019-02-26 16:05:12,749 INFO org.apache.spark.network.yarn.YarnShuffleService: Initializing container container_1551155033308_0002_01_000002
2019-02-26 16:05:13,422 INFO org.apache.hadoop.yarn.server.nodemanager.containermanager.localizer.ResourceLocalizationService: Unpack or rename file from /data01/yarn/nm-local-dir/usercache/yarn/appcache/application_1551155033308_0002/container_1551155033308_0002_01_000002/jobConfDir/tmp_job.xml to file:/data01/yarn/nm-local-dir/usercache/yarn/appcache/application_1551155033308_0002/container_1551155033308_0002_01_000002/jobConfDir/jobconf_job.xml
2019-02-26 16:05:13,422 INFO org.apache.hadoop.yarn.server.nodemanager.containermanager.localizer.ResourceLocalizationService: Rename /data01/yarn/nm-local-dir/usercache/yarn/appcache/application_1551155033308_0002/container_1551155033308_0002_01_000002/jobConfDir/tmp_job.xml to /data01/yarn/nm-local-dir/usercache/yarn/appcache/application_1551155033308_0002/container_1551155033308_0002_01_000002/jobConfDir/jobconf_job.xml
2019-02-26 16:05:13,469 INFO org.apache.hadoop.yarn.server.nodemanager.containermanager.container.ContainerImpl: Container container_1551155033308_0002_01_000002 transitioned from LOCALIZING to LOCALIZED
2019-02-26 16:05:13,514 INFO org.apache.hadoop.yarn.server.nodemanager.recovery.NMLeveldbStateStoreService: DEBUG: storeContainerLaunched : KEY = ContainerManager/containers/container_1551155033308_0002_01_000002/launched
2019-02-26 16:05:13,519 INFO org.apache.hadoop.yarn.server.nodemanager.DefaultContainerExecutor: launchContainer: [bash, /data01/yarn/nm-local-dir/usercache/yarn/appcache/application_1551155033308_0002/container_1551155033308_0002_01_000002/default_container_executor.sh]
2019-02-26 16:05:17,482 INFO org.apache.hadoop.yarn.server.nodemanager.containermanager.ContainerManagerImpl: Stopping container with container Id: container_1551155033308_0002_01_000002
2019-02-26 16:05:17,483 INFO org.apache.hadoop.yarn.server.nodemanager.recovery.NMLeveldbStateStoreService: DEBUG: storeContainerKilled : KEY = ContainerManager/containers/container_1551155033308_0002_01_000002/killed
2019-02-26 16:05:17,590 INFO org.apache.hadoop.yarn.server.nodemanager.recovery.NMLeveldbStateStoreService: DEBUG: storeContainerCompleted : KEY = ContainerManager/containers/container_1551155033308_0002_01_000002/exitcode
2019-02-26 16:05:17,591 INFO org.apache.hadoop.yarn.server.nodemanager.containermanager.launcher.ContainerLaunch: Container container_1551155033308_0002_01_000002 succeeded
2019-02-26 16:05:20,504 INFO org.apache.hadoop.yarn.server.nodemanager.containermanager.ContainerManagerImpl: container container_1551155033308_0002_01_000002 time for Waiting exitCode is 3021ms, with containerKilledByAmTimeoutInterval 500ms containerKilledByAmTimeout 3000ms
2019-02-26 16:05:20,504 INFO org.apache.hadoop.yarn.server.nodemanager.NMAuditLogger: USER=yarn IP=10.198.66.225 OPERATION=Stop Container Request TARGET=ContainerManageImpl RESULT=SUCCESS APPID=application_1551155033308_0002 CONTAINERID=container_1551155033308_0002_01_000002
关于AM可以远程调用的StartContainers
startContainers(ContainerManagerImpl):此方法在 for 循环中对每个 StartContainerRequest 调用一次 startContainerInternal
/**
 * Start a list of containers on this NodeManager.
 *
 * Each request in the batch is handled on its own: a YarnException for one
 * container is recorded in the response's failure map and the loop moves on,
 * whereas an InvalidToken or any other IOException aborts the whole call.
 *
 * @param requests the batch of start-container requests
 * @return the aux-service metadata plus the ids that were started and the
 *         per-container failures
 * @throws YarnException if the NodeManager is not yet accepting containers
 *         (NMNotYetReadyException), or as declared by the called helpers
 * @throws IOException on an invalid token or other I/O failure
 */
@Override
public StartContainersResponse
    startContainers(StartContainersRequest requests) throws YarnException,
        IOException {
  if (blockNewContainerRequests.get()) {
    throw new NMNotYetReadyException(
        "Rejecting new containers as NodeManager has not"
            + " yet connected with ResourceManager");
  }

  final UserGroupInformation callerUgi = getRemoteUgi();
  final NMTokenIdentifier nmTokenIdentifier =
      selectNMTokenIdentifier(callerUgi);
  authorizeUser(callerUgi,nmTokenIdentifier);

  final List<ContainerId> started = new ArrayList<ContainerId>();
  final Map<ContainerId, SerializedException> failures =
      new HashMap<ContainerId, SerializedException>();

  for (StartContainerRequest request : requests.getStartContainerRequests()) {
    // Stays null until the token has been verified, so a failure before that
    // point is recorded under a null key.
    ContainerId containerId = null;
    try {
      final ContainerTokenIdentifier tokenId =
          BuilderUtils.newContainerTokenIdentifier(request.getContainerToken());
      verifyAndGetContainerTokenIdentifier(request.getContainerToken(),
          tokenId);
      containerId = tokenId.getContainerID();
      LOG.info("DEBUG: startContainers : for for (StartContainerRequest request : requests.getStartContainerRequests()) : containerId: "+containerId);
      startContainerInternal(nmTokenIdentifier, tokenId, request);
      started.add(containerId);
    } catch (YarnException e) {
      failures.put(containerId, SerializedException.newInstance(e));
    } catch (InvalidToken ie) {
      failures.put(containerId, SerializedException.newInstance(ie));
      throw ie;
    } catch (IOException e) {
      throw RPCUtil.getRemoteException(e);
    }
  }

  return StartContainersResponse.newInstance(getAuxServiceMetaData(),
      started, failures);
}
startContainerInternal --ContainerManagerImpl
/**
 * Validates and registers a single start-container request, persists it in
 * the NM state store and dispatches the init event for the container.
 *
 * Order of work: authorize the request, reject tokens issued under a
 * different RM identifier, update the NM token, check that every aux service
 * named in the launch context exists, create the ContainerImpl and register
 * it (duplicates are rejected), then - under the read lock and only while
 * the service is not stopped - create/store the application on first sight,
 * store the container request and dispatch ApplicationContainerInitEvent.
 *
 * @param nmTokenIdentifier NM token of the caller
 * @param containerTokenIdentifier identifier decoded from the container token
 * @param request the start request carrying the launch context
 * @throws YarnException on an unknown aux service, a container from a
 *         different RM identifier, a duplicate container, or when the
 *         NodeManager is shutting down
 * @throws IOException declared by the method; which calls raise it is not
 *         visible in this excerpt
 */
@SuppressWarnings("unchecked")
private void startContainerInternal(NMTokenIdentifier nmTokenIdentifier,
ContainerTokenIdentifier containerTokenIdentifier,
StartContainerRequest request) throws YarnException, IOException {
/*
* 1) It should save the NMToken into NMTokenSecretManager. This is done
* here instead of RPC layer because at the time of opening/authenticating
* the connection it doesn't know what all RPC calls user will make on it.
* Also new NMToken is issued only at startContainer (once it gets renewed).
*
* 2) It should validate containerToken. Need to check below things. a) It
* is signed by correct master key (part of retrieve password). b) It
* belongs to correct Node Manager (part of retrieve password). c) It has
* correct RMIdentifier. d) It is not expired.
*/
authorizeStartRequest(nmTokenIdentifier, containerTokenIdentifier);
// Reject containers whose token carries a different RM identifier than the
// one the node status updater currently reports.
if (containerTokenIdentifier.getRMIdentifier() != nodeStatusUpdater
.getRMIdentifier()) {
// Is the container coming from unknown RM
StringBuilder sb = new StringBuilder("\nContainer ");
sb.append(containerTokenIdentifier.getContainerID().toString())
.append(" rejected as it is allocated by a previous RM");
throw new InvalidContainerException(sb.toString());
}
// update NMToken
updateNMTokenIdentifier(nmTokenIdentifier);
ContainerId containerId = containerTokenIdentifier.getContainerID();
String containerIdStr = containerId.toString();
String user = containerTokenIdentifier.getApplicationSubmitter();
LOG.info("Start request for " + containerIdStr + " by user " + user);
ContainerLaunchContext launchContext = request.getContainerLaunchContext();
// Every aux service named in the request's service data must be present in
// this NodeManager's aux-service metadata, otherwise the request is refused.
Map<String, ByteBuffer> serviceData = getAuxServiceMetaData();
if (launchContext.getServiceData()!=null &&
!launchContext.getServiceData().isEmpty()) {
for (Map.Entry<String, ByteBuffer> meta : launchContext.getServiceData()
.entrySet()) {
if (null == serviceData.get(meta.getKey())) {
throw new InvalidAuxServiceException("The auxService:" + meta.getKey()
+ " does not exist");
}
}
}
Credentials credentials = parseCredentials(launchContext);
Container container =
new ContainerImpl(getConfig(), this.dispatcher,
context.getNMStateStore(), launchContext,
credentials, metrics, containerTokenIdentifier);
ApplicationId applicationID =
containerId.getApplicationAttemptId().getApplicationId();
// Register the container; a non-null result means this id is already known.
if (context.getContainers().putIfAbsent(containerId, container) != null) {
NMAuditLogger.logFailure(user, AuditConstants.START_CONTAINER,
"ContainerManagerImpl", "Container already running on this node!",
applicationID, containerId);
throw RPCUtil.getRemoteException("Container " + containerIdStr
+ " already is running on this node!!");
}
// The remaining work happens under the read lock and only while the service
// has not been stopped.
this.readLock.lock();
try {
if (!serviceStopped) {
// Create the application
Application application =
new ApplicationImpl(dispatcher, user, applicationID, credentials, context);
LOG.info("DEBUG: startContainerInternal : Application application =\n" +
" new ApplicationImpl(dispatcher, user, applicationID, credentials, context); appliccation = "
+application.getAppId() + application.getApplicationState() );
// Only the first container of an application gets past this putIfAbsent;
// it persists the application and dispatches ApplicationInitEvent.
if (null == context.getApplications().putIfAbsent(applicationID,
application)) {
LOG.info("Creating a new application reference for app " + applicationID);
LogAggregationContext logAggregationContext =
containerTokenIdentifier.getLogAggregationContext();
Map<ApplicationAccessType, String> appAcls =
container.getLaunchContext().getApplicationACLs();
context.getNMStateStore().storeApplication(applicationID,
buildAppProto(applicationID, user, credentials, appAcls,
logAggregationContext));
dispatcher.getEventHandler().handle(
new ApplicationInitEvent(applicationID, appAcls,
logAggregationContext));
}
LOG.info(" DEBUG : startContainerInternal: this.context.getNMStateStore().storeContainer(containerId, request);containerId" + containerId);
// Persist the start request before dispatching the init event; this stored
// request is what recovery later reads back.
this.context.getNMStateStore().storeContainer(containerId, request);
dispatcher.getEventHandler().handle(
new ApplicationContainerInitEvent(container));
this.context.getContainerTokenSecretManager().startContainerSuccessful(
containerTokenIdentifier);
// NOTE(review): the audit target here is spelled "ContainerManageImpl" while
// the failure path above uses "ContainerManagerImpl" - confirm which one log
// consumers expect before changing either.
NMAuditLogger.logSuccess(user, AuditConstants.START_CONTAINER,
"ContainerManageImpl", applicationID, containerId);
// TODO launchedContainer misplaced -> doesn't necessarily mean a container
// launch. A finished Application will not launch containers.
metrics.launchedContainer();
metrics.allocateContainer(containerTokenIdentifier.getResource());
} else {
// NOTE(review): the container was already added to context.getContainers()
// above and is not removed on this path - confirm whether that is intended.
throw new YarnException(
"Container start failed as the NodeManager is " +
"in the process of shutting down");
}
} finally {
this.readLock.unlock();
}
}
ContainerManagerImpl 的recover方法
关于恢复的流程:先由 NMLeveldbStateStoreService 的 loadContainersState 把容器状态加载到 rcs 中,再由 recoverContainer 方法去恢复。加载的过程其实就是把之前通过 db.put(key, byte[]) 写入的内容读出来:一个 key 对应一个属性,它的 value 就是该属性的值。真正容纳这些属性和属性值的是 RecoveredContainerState 对象;这些属性被读出来并把该对象填充完成之后,下一步交给 ContainerManagerImpl 来 recover。rcs 对象中最主要的大对象是 StartContainerRequest,从它可以取出 ContainerLaunchContext 和 container token,用来 new ContainerImpl。
/**
 * Rebuilds one container from its recovered state.
 *
 * If the container's application is still present in the context, a
 * ContainerImpl is created from the stored start request together with the
 * recovered status, exit code, diagnostics and killed flag, registered, and
 * an ApplicationContainerInitEvent is dispatched for it. Otherwise the
 * container id is only reported to the node status updater as completed.
 *
 * @param rcs the state loaded from the NM state store for this container
 * @throws IOException declared by the method; which calls raise it is not
 *         visible in this excerpt
 */
@SuppressWarnings("unchecked")
private void recoverContainer(RecoveredContainerState rcs)
    throws IOException {
  // Experimental filter, currently disabled: skip recovery entirely for
  // containers whose recovered state is COMPLETED or REQUESTED.
  // Configuration conf = new YarnConfiguration();
  // boolean filter = conf.getBoolean(YarnConfiguration.NM_RECOVERY_CONTAINER_FILTER,
  //     YarnConfiguration.DEFAULT_NM_RECOVERY_CONTAINER_FILTER);
  // if (filter) {
  //   if (rcs.getStatus().equals(NMStateStoreService.RecoveredContainerStatus.COMPLETED) ||
  //       rcs.getStatus().equals(RecoveredContainerStatus.REQUESTED)) {
  //     LOG.info(" RecoveredContainerState is " + rcs.getStatus() + " ,do not recover container. return. ");
  //     return;
  //   }
  // }
  final StartContainerRequest startRequest = rcs.getStartRequest();
  final ContainerLaunchContext launchContext =
      startRequest.getContainerLaunchContext();
  final ContainerTokenIdentifier tokenId =
      BuilderUtils.newContainerTokenIdentifier(
          startRequest.getContainerToken());
  final ContainerId containerId = tokenId.getContainerID();
  final ApplicationId appId =
      containerId.getApplicationAttemptId().getApplicationId();

  LOG.info("Recovering " + containerId + " in state " + rcs.getStatus()
      + " with exit code " + rcs.getExitCode());

  if (!context.getApplications().containsKey(appId)) {
    if (rcs.getStatus() != RecoveredContainerStatus.COMPLETED) {
      LOG.warn(containerId + " has no corresponding application!");
    }
    // Open question from the author: after an RM restart this message shows
    // up a lot in the NM log; it is unclear whether the application is
    // missing on the NM or on the RM. (The check above consults `context`,
    // presumably the NodeManager's own context - TODO confirm.)
    LOG.info("Adding " + containerId + " to recently stopped containers");
    nodeStatusUpdater.addCompletedContainer(containerId);
    // Open question from the author: is this what the NM later reports to
    // the RM? Not verifiable from this excerpt.
    return;
  }

  // Author's observation (not verifiable from this excerpt): as long as the
  // context still knows the application, a ContainerImpl is created even if
  // the RM no longer has the app; the recovery constructor first delegates
  // to the regular constructor via this(...), so both constructors run.
  final Credentials credentials = parseCredentials(launchContext);
  final Container recovered = new ContainerImpl(getConfig(), dispatcher,
      context.getNMStateStore(), startRequest.getContainerLaunchContext(),
      credentials, metrics, tokenId, rcs.getStatus(), rcs.getExitCode(),
      rcs.getDiagnostics(), rcs.getKilled());
  context.getContainers().put(containerId, recovered);

  LOG.info("DEBUG: Container container = new ContainerImpl : containerId "+ recovered.getContainerId());
  LOG.info("DEBUG: dispatcher.getEventHandler().handle(\n" +
      " new ApplicationContainerInitEvent(container)); is just under this code ,then we go to the event handler");
  dispatcher.getEventHandler().handle(
      new ApplicationContainerInitEvent(recovered));
}