hadoop yarn nodemanager recover 源码分析

存储类的属性

// Per-container records are keyed as CONTAINERS_KEY_PREFIX + <container id>
// + one of the CONTAINER_*_KEY_SUFFIX values below.
private static final String CONTAINERS_KEY_PREFIX = "ContainerManager/containers/";
private static final String CONTAINER_REQUEST_KEY_SUFFIX = "/request";
private static final String CONTAINER_DIAGS_KEY_SUFFIX = "/diagnostics";
private static final String CONTAINER_LAUNCHED_KEY_SUFFIX = "/launched";
private static final String CONTAINER_KILLED_KEY_SUFFIX = "/killed";
private static final String CONTAINER_EXIT_CODE_KEY_SUFFIX = "/exitcode";

// Presumably the key prefix for NM token records; not used in the code shown here.
private static final String NM_TOKENS_KEY_PREFIX = "NMTokens/";

加载的方法NMLeveldbStateStoreService

loadContainersState – 目的在于返回一个 RecoveredContainerState(下文简称 rcs)对象的列表,其中每个 rcs 对象都是用我们之前存入 db 的信息填充好的

/**
 * Rebuilds the state of every container recorded under
 * {@code CONTAINERS_KEY_PREFIX} in the database.
 *
 * <p>Containers whose records contain no start request are not returned;
 * they are removed from the store instead (removal failures are logged and
 * otherwise ignored).
 *
 * @return the recovered container states that have a start request
 * @throws IOException if a key is malformed or the database reports an error
 */
@Override
public List<RecoveredContainerState> loadContainersState()
    throws IOException {
  // One list for recoverable containers, one for containers whose records
  // lack a start request and therefore have to be purged.
  ArrayList<RecoveredContainerState> recovered =
      new ArrayList<RecoveredContainerState>();
  ArrayList<ContainerId> incomplete = new ArrayList<ContainerId>();
  final int idStart = CONTAINERS_KEY_PREFIX.length();

  LeveldbIterator dbIter = null;
  try {
    dbIter = new LeveldbIterator(db);
    dbIter.seek(bytes(CONTAINERS_KEY_PREFIX));

    while (dbIter.hasNext()) {
      // Only peek here: loadContainerState() consumes the entries that
      // belong to this container.
      Entry<byte[], byte[]> head = dbIter.peekNext();
      String headKey = asString(head.getKey());
      if (!headKey.startsWith(CONTAINERS_KEY_PREFIX)) {
        break;
      }

      // The container id sits between the prefix and the next '/'.
      int idEnd = headKey.indexOf('/', idStart);
      if (idEnd < 0) {
        throw new IOException("Unable to determine container in key: " + headKey);
      }

      String idText = headKey.substring(idStart, idEnd);
      ContainerId containerId = ConverterUtils.toContainerId(idText);
      // Prefix shared by all keys of this container, trailing '/' included.
      String containerKeyPrefix = headKey.substring(0, idEnd + 1);
      RecoveredContainerState state =
          loadContainerState(containerId, dbIter, containerKeyPrefix);

      // Don't load container without StartContainerRequest
      if (state.startRequest == null) {
        incomplete.add(containerId);
      } else {
        recovered.add(state);
      }
    }
  } catch (DBException e) {
    throw new IOException(e);
  } finally {
    if (dbIter != null) {
      dbIter.close();
    }
  }

  // remove container without StartContainerRequest
  for (ContainerId staleId : incomplete) {
    LOG.warn("Remove container " + staleId +
        " with incomplete records");
    try {
      removeContainer(staleId);
      // TODO: kill and cleanup the leaked container
    } catch (IOException e) {
      LOG.error("Unable to remove container " + staleId +
          " in store", e);
    }
  }

  return recovered;
}

loadContainerState

/**
 * Reads consecutive entries whose key starts with {@code keyPrefix} and
 * folds them into a single {@link RecoveredContainerState}.
 *
 * <p>The iterator is advanced past every entry that is consumed; the first
 * entry with a different prefix is left in place for the caller.
 *
 * @param containerId id of the container being loaded (not read by this
 *                    method's body)
 * @param iter        iterator positioned at the container's first entry
 * @param keyPrefix   key prefix of the container, ending with '/'
 * @return the populated state; its status starts as REQUESTED
 * @throws IOException if an entry with an unknown suffix is encountered
 */
private RecoveredContainerState loadContainerState(ContainerId containerId,
    LeveldbIterator iter, String keyPrefix) throws IOException {
  RecoveredContainerState recovered = new RecoveredContainerState();
  recovered.status = RecoveredContainerStatus.REQUESTED;

  // The suffix constants start with '/', so the suffix begins at the last
  // character of the prefix.
  final int suffixStart = keyPrefix.length() - 1;

  while (iter.hasNext()) {
    Entry<byte[], byte[]> record = iter.peekNext();
    String recordKey = asString(record.getKey());
    if (!recordKey.startsWith(keyPrefix)) {
      break;
    }
    iter.next();

    // The same container id yields several keys that differ only in their
    // suffix; the suffix tells which attribute this entry carries.
    String suffix = recordKey.substring(suffixStart);
    if (CONTAINER_REQUEST_KEY_SUFFIX.equals(suffix)) {
      StartContainerRequestProto proto =
          StartContainerRequestProto.parseFrom(record.getValue());
      recovered.startRequest = new StartContainerRequestPBImpl(proto);
    } else if (CONTAINER_DIAGS_KEY_SUFFIX.equals(suffix)) {
      recovered.diagnostics = asString(record.getValue());
    } else if (CONTAINER_LAUNCHED_KEY_SUFFIX.equals(suffix)) {
      // Only promote from REQUESTED; a later status is left untouched.
      if (recovered.status == RecoveredContainerStatus.REQUESTED) {
        recovered.status = RecoveredContainerStatus.LAUNCHED;
      }
    } else if (CONTAINER_KILLED_KEY_SUFFIX.equals(suffix)) {
      recovered.killed = true;
    } else if (CONTAINER_EXIT_CODE_KEY_SUFFIX.equals(suffix)) {
      // An exit-code record means the container has completed.
      recovered.status = RecoveredContainerStatus.COMPLETED;
      recovered.exitCode = Integer.parseInt(asString(record.getValue()));
    } else {
      throw new IOException("Unexpected container state key: " + recordKey);
    }
  }
  return recovered;
}

存储的方法db

storeContainer

/**
 * Persists the start request of a container under
 * {@code CONTAINERS_KEY_PREFIX + containerId + CONTAINER_REQUEST_KEY_SUFFIX}.
 *
 * @param containerId  id of the container
 * @param startRequest request to serialize; cast to
 *                     {@code StartContainerRequestPBImpl}
 * @throws IOException if the database write fails
 */
@Override
public void storeContainer(ContainerId containerId,
    StartContainerRequest startRequest) throws IOException {
  final String requestKey = CONTAINERS_KEY_PREFIX + containerId.toString()
      + CONTAINER_REQUEST_KEY_SUFFIX;
  try {
    // The value is the protobuf encoding of the whole start request.
    final byte[] serialized =
        ((StartContainerRequestPBImpl) startRequest).getProto().toByteArray();
    db.put(bytes(requestKey), serialized);
  } catch (DBException e) {
    throw new IOException(e);
  }
}

storeContainerDiagnostics

/**
 * Persists the current diagnostics text of a container under
 * {@code CONTAINERS_KEY_PREFIX + containerId + CONTAINER_DIAGS_KEY_SUFFIX}.
 *
 * @param containerId id of the container
 * @param diagnostics diagnostics buffer; its current content is written
 * @throws IOException if the database write fails
 */
@Override
public void storeContainerDiagnostics(ContainerId containerId,
    StringBuilder diagnostics) throws IOException {
  final String diagsKey = CONTAINERS_KEY_PREFIX + containerId.toString()
      + CONTAINER_DIAGS_KEY_SUFFIX;
  try {
    final byte[] diagsValue = bytes(diagnostics.toString());
    db.put(bytes(diagsKey), diagsValue);
  } catch (DBException e) {
    throw new IOException(e);
  }
}

storeContainerLaunched

/**
 * Records that a container was launched by writing a marker entry under
 * {@code CONTAINERS_KEY_PREFIX + containerId + CONTAINER_LAUNCHED_KEY_SUFFIX}.
 *
 * @param containerId id of the container
 * @throws IOException if the database write fails
 */
@Override
public void storeContainerLaunched(ContainerId containerId)
    throws IOException {
  final String launchedKey = CONTAINERS_KEY_PREFIX + containerId.toString()
      + CONTAINER_LAUNCHED_KEY_SUFFIX;
  try {
    // Only the presence of the key matters. EMPTY_VALUE is declared
    // elsewhere in the class (per the original note: new byte[0]).
    db.put(bytes(launchedKey), EMPTY_VALUE);
  } catch (DBException e) {
    throw new IOException(e);
  }
}

storeContainerKilled

/**
 * Records that a container was killed by writing a marker entry under
 * {@code CONTAINERS_KEY_PREFIX + containerId + CONTAINER_KILLED_KEY_SUFFIX}.
 *
 * @param containerId id of the container
 * @throws IOException if the database write fails
 */
@Override
public void storeContainerKilled(ContainerId containerId)
    throws IOException {
  final String killedKey = CONTAINERS_KEY_PREFIX + containerId.toString()
      + CONTAINER_KILLED_KEY_SUFFIX;
  try {
    // Marker entry: the value carries no information.
    db.put(bytes(killedKey), EMPTY_VALUE);
  } catch (DBException e) {
    throw new IOException(e);
  }
}

storeContainerCompleted

/**
 * Persists the exit code of a container under
 * {@code CONTAINERS_KEY_PREFIX + containerId + CONTAINER_EXIT_CODE_KEY_SUFFIX}.
 * The exit code is stored as its decimal string form.
 *
 * @param containerId id of the container
 * @param exitCode    exit code to record
 * @throws IOException if the database write fails
 */
@Override
public void storeContainerCompleted(ContainerId containerId,
    int exitCode) throws IOException {
  final String exitCodeKey = CONTAINERS_KEY_PREFIX + containerId.toString()
      + CONTAINER_EXIT_CODE_KEY_SUFFIX;
  try {
    final String exitCodeText = String.valueOf(exitCode);
    db.put(bytes(exitCodeKey), bytes(exitCodeText));
  } catch (DBException e) {
    throw new IOException(e);
  }
}

存储的方法在什么时候被调用

storeContainer调用处 --ContainerManagerImpl

/**
 * Handles a single start-container request: authorizes it, registers the
 * container (and, when first seen, its application) in the NM context,
 * persists the start request in the NM state store and dispatches the
 * corresponding init events.
 *
 * @param nmTokenIdentifier        NM token presented with the request
 * @param containerTokenIdentifier container token describing the container
 * @param request                  the start request; stored as-is in the
 *                                 state store
 * @throws YarnException if the NodeManager is shutting down; other
 *                       rejections are thrown as the exception types
 *                       visible in the body
 * @throws IOException   as declared by the state-store and helper calls
 */
@SuppressWarnings("unchecked")
private void startContainerInternal(NMTokenIdentifier nmTokenIdentifier,
    ContainerTokenIdentifier containerTokenIdentifier,
    StartContainerRequest request) throws YarnException, IOException {

  /*
   * 1) It should save the NMToken into NMTokenSecretManager. This is done
   * here instead of RPC layer because at the time of opening/authenticating
   * the connection it doesn't know what all RPC calls user will make on it.
   * Also new NMToken is issued only at startContainer (once it gets renewed).
   *
   * 2) It should validate containerToken. Need to check below things. a) It
   * is signed by correct master key (part of retrieve password). b) It
   * belongs to correct Node Manager (part of retrieve password). c) It has
   * correct RMIdentifier. d) It is not expired.
   */
  authorizeStartRequest(nmTokenIdentifier, containerTokenIdentifier);

  if (containerTokenIdentifier.getRMIdentifier() != nodeStatusUpdater
      .getRMIdentifier()) {
      // Is the container coming from unknown RM
      StringBuilder sb = new StringBuilder("\nContainer ");
      sb.append(containerTokenIdentifier.getContainerID().toString())
        .append(" rejected as it is allocated by a previous RM");
      throw new InvalidContainerException(sb.toString());
  }
  // update NMToken
  updateNMTokenIdentifier(nmTokenIdentifier);

  ContainerId containerId = containerTokenIdentifier.getContainerID();
  String containerIdStr = containerId.toString();
  String user = containerTokenIdentifier.getApplicationSubmitter();

  LOG.info("Start request for " + containerIdStr + " by user " + user);

  ContainerLaunchContext launchContext = request.getContainerLaunchContext();

  // Reject the request if it carries service data for an auxiliary service
  // that is not present in the map returned by getAuxServiceMetaData().
  Map<String, ByteBuffer> serviceData = getAuxServiceMetaData();
  if (launchContext.getServiceData()!=null && 
      !launchContext.getServiceData().isEmpty()) {
    for (Map.Entry<String, ByteBuffer> meta : launchContext.getServiceData()
        .entrySet()) {
      if (null == serviceData.get(meta.getKey())) {
        throw new InvalidAuxServiceException("The auxService:" + meta.getKey()
            + " does not exist");
      }
    }
  }

  Credentials credentials = parseCredentials(launchContext);

  Container container =
      new ContainerImpl(getConfig(), this.dispatcher,
          context.getNMStateStore(), launchContext,
        credentials, metrics, containerTokenIdentifier);
  ApplicationId applicationID =
      containerId.getApplicationAttemptId().getApplicationId();
  // putIfAbsent returns the existing mapping, so non-null means a container
  // with this id is already registered in the context.
  if (context.getContainers().putIfAbsent(containerId, container) != null) {
    NMAuditLogger.logFailure(user, AuditConstants.START_CONTAINER,
      "ContainerManagerImpl", "Container already running on this node!",
      applicationID, containerId);
    throw RPCUtil.getRemoteException("Container " + containerIdStr
        + " already is running on this node!!");
  }

  this.readLock.lock();
  try {
    if (!serviceStopped) {
      // Create the application
      Application application =
          new ApplicationImpl(dispatcher, user, applicationID, credentials, context);
      // Only the request that first registers the application stores it and
      // dispatches the application init event.
      if (null == context.getApplications().putIfAbsent(applicationID,
        application)) {
        LOG.info("Creating a new application reference for app " + applicationID);
        LogAggregationContext logAggregationContext =
            containerTokenIdentifier.getLogAggregationContext();
        Map<ApplicationAccessType, String> appAcls =
            container.getLaunchContext().getApplicationACLs();
        context.getNMStateStore().storeApplication(applicationID,
            buildAppProto(applicationID, user, credentials, appAcls,
              logAggregationContext));
        dispatcher.getEventHandler().handle(
          new ApplicationInitEvent(applicationID, appAcls,
            logAggregationContext));
      }

      // The start request is written to the state store before the container
      // init event is dispatched.
      this.context.getNMStateStore().storeContainer(containerId, request);
      dispatcher.getEventHandler().handle(
        new ApplicationContainerInitEvent(container));

      this.context.getContainerTokenSecretManager().startContainerSuccessful(
        containerTokenIdentifier);
      // NOTE(review): the component name here is "ContainerManageImpl" while
      // the failure audit above uses "ContainerManagerImpl" - looks like a
      // typo; confirm before changing since audit-log consumers may match it.
      NMAuditLogger.logSuccess(user, AuditConstants.START_CONTAINER,
        "ContainerManageImpl", applicationID, containerId);
      // TODO launchedContainer misplaced -> doesn't necessarily mean a container
      // launch. A finished Application will not launch containers.
      metrics.launchedContainer();
      metrics.allocateContainer(containerTokenIdentifier.getResource());
    } else {
      // NOTE(review): the container registered via putIfAbsent above is not
      // removed from the context on this path - confirm whether that is
      // cleaned up elsewhere.
      throw new YarnException(
          "Container start failed as the NodeManager is " +
          "in the process of shutting down");
    }
  } finally {
    this.readLock.unlock();
  }
}

storeContainerDiagnostics – ContainerImpl

1

/**
 * Appends the given fragments to this container's diagnostics, keeps only
 * the trailing {@code diagnosticsMaxSize} characters if the buffer grew
 * beyond that size, and writes the result to the state store.
 *
 * <p>A failure to write to the state store is logged and not propagated.
 *
 * @param diags fragments to append, in order
 */
private void addDiagnostics(String... diags) {
  for (int i = 0; i < diags.length; i++) {
    this.diagnostics.append(diags[i]);
  }

  if (diagnostics.length() > diagnosticsMaxSize) {
    // Log the full text before the oldest characters are dropped.
    LOG.warn("Truncate large diagnostic info, containerId: "
        + containerId + " diagnostic info: " + diagnostics);
    final int excess = diagnostics.length() - diagnosticsMaxSize;
    diagnostics.delete(0, excess);
  }

  try {
    stateStore.storeContainerDiagnostics(containerId, diagnostics);
  } catch (IOException e) {
    LOG.warn("Unable to update diagnostics in state store for "
        + containerId, e);
  }
}

2

/**
 * Update diagnostics, staying in the same state.
 *
 * <p>The update text followed by a newline is appended through
 * {@code ContainerImpl.addDiagnostics}, which also writes the diagnostics to
 * the state store (and logs a warning if that write fails). The transition
 * therefore does not issue a second, identical state-store write.
 */
static class ContainerDiagnosticsUpdateTransition implements
    SingleArcTransition<ContainerImpl, ContainerEvent> {
  @Override
  public void transition(ContainerImpl container, ContainerEvent event) {
    ContainerDiagnosticsUpdateEvent updateEvent =
        (ContainerDiagnosticsUpdateEvent) event;
    // addDiagnostics() appends, truncates if needed, and persists.
    container.addDiagnostics(updateEvent.getDiagnosticsUpdate(), "\n");
  }
}

storeContainerLaunched /storeContainerCompleted --ContainerLaunch

/**
 * Prepares the container's launch script, token file and environment, then
 * launches the container through the executor and blocks until it exits.
 *
 * <p>State-store interaction: the "launched" marker is written right after
 * the CONTAINER_LAUNCHED event is dispatched, and the exit code is written
 * in the {@code finally} block whenever the launch section was entered.
 *
 * <p>For containers of type "docker" a local port is reserved by binding
 * {@code reservedSocket}; the reservation is released before the executor
 * is started and, as a safety net, on every other exit path.
 *
 * @return 0 if the container was already being killed before launch or
 *         exited successfully; otherwise the exit code (or -1 /
 *         {@code DISKS_FAILED} when the launch itself failed)
 */
@Override
@SuppressWarnings("unchecked") // dispatcher not typed
public Integer call() {
  if(containerType != null && containerType.toLowerCase().equals("docker")) {
    try {
      this.reservedSocket.bind(new InetSocketAddress("127.0.0.1" , 0));
    } catch (IOException e) {
      // Keep going as before, but do not lose the cause.
      LOG.error("Can not get available port", e);
    }
  }
  final ContainerLaunchContext launchContext = container.getLaunchContext();
  Map<Path,List<String>> localResources = null;
  ContainerId containerID = container.getContainerId();
  String containerIdStr = ConverterUtils.toString(containerID);
  final List<String> command = launchContext.getCommands();
  int ret = -1;

  // CONTAINER_KILLED_ON_REQUEST should not be missed if the container
  // is already at KILLING
  if (container.getContainerState() == ContainerState.KILLING) {
    // Nothing will be launched, so the reserved port is no longer needed.
    releaseReservedSocket();
    dispatcher.getEventHandler().handle(
        new ContainerExitEvent(containerID,
            ContainerEventType.CONTAINER_KILLED_ON_REQUEST,
            Shell.WINDOWS ? ExitCode.FORCE_KILLED.getExitCode() :
                ExitCode.TERMINATED.getExitCode(),
            "Container terminated before launch."));
    return 0;
  }

  try {
    localResources = container.getLocalizedResources();
    if (localResources == null) {
      throw RPCUtil.getRemoteException(
          "Unable to get local resources when Container " + containerID +
          " is at " + container.getContainerState());
    }

    final String user = container.getUser();
    // /// Variable expansion
    // Before the container script gets written out.
    List<String> newCmds = new ArrayList<String>(command.size());
    String appIdStr = app.getAppId().toString();
    String relativeContainerLogDir = ContainerLaunch
        .getRelativeContainerLogDir(appIdStr, containerIdStr);
    Path containerLogDir =
        dirsHandler.getLogPathForWrite(relativeContainerLogDir, false);
    if (taskLogLimitEnable) {
      logdirMonitor = new Thread(new LogdirMonitor(containerLogDir.toString()));
      logdirMonitor.setName("logdirMonitorThread");
      logdirMonitor.start();
    }
    for (String str : command) {
      // TODO: Should we instead work via symlinks without this grammar?
      newCmds.add(expandEnvironment(str, containerLogDir));
    }
    launchContext.setCommands(newCmds);

    Map<String, String> environment = launchContext.getEnvironment();
    // Make a copy of env to iterate & do variable expansion
    for (Entry<String, String> entry : environment.entrySet()) {
      String value = entry.getValue();
      value = expandEnvironment(value, containerLogDir);
      entry.setValue(value);
    }
    // /// End of variable expansion
    if(containerType != null && containerType.toLowerCase().equals("docker")) {
      int port = reservedSocket.getLocalPort();
      LOG.info("Reserved available port: " + port);
      environment.put("DOCKER_PORT", String.valueOf(port));
      // NOTE(review): a 6-character password from RandomStringUtils is weak
      // if it protects anything sensitive - confirm the threat model.
      String passwd = RandomStringUtils.randomAlphanumeric(6);
      environment.put("DOCKER_PASSWORD", String.valueOf(passwd));
      environment.put("VPC_HOSTNAME", "vpc-" + containerID.toString().substring(containerID.toString().length() - 3));
    }

    FileContext lfs = FileContext.getLocalFSFileContext();

    Path nmPrivateContainerScriptPath =
        dirsHandler.getLocalPathForWrite(
            getContainerPrivateDir(appIdStr, containerIdStr) + Path.SEPARATOR
                + CONTAINER_SCRIPT);
    Path nmPrivateTokensPath =
        dirsHandler.getLocalPathForWrite(
            getContainerPrivateDir(appIdStr, containerIdStr)
                + Path.SEPARATOR
                + String.format(ContainerLocalizer.TOKEN_FILE_NAME_FMT,
                    containerIdStr));
    Path nmPrivateClasspathJarDir =
        dirsHandler.getLocalPathForWrite(
            getContainerPrivateDir(appIdStr, containerIdStr));
    DataOutputStream containerScriptOutStream = null;
    DataOutputStream tokensOutStream = null;

    // Select the working directory for the container
    Path containerWorkDir =
        dirsHandler.getLocalPathForWrite(ContainerLocalizer.USERCACHE
            + Path.SEPARATOR + user + Path.SEPARATOR
            + ContainerLocalizer.APPCACHE + Path.SEPARATOR + appIdStr
            + Path.SEPARATOR + containerIdStr,
            LocalDirAllocator.SIZE_UNKNOWN, false);

    String pidFileSubpath = getPidFileSubpath(appIdStr, containerIdStr);

    // pid file should be in nm private dir so that it is not
    // accessible by users
    pidFilePath = dirsHandler.getLocalPathForWrite(pidFileSubpath);
    List<String> localDirs = dirsHandler.getLocalDirs();
    List<String> logDirs = dirsHandler.getLogDirs();

    // Files found under <container dir>/jobConfDir/jobconf_*/ are added to
    // the localized resources, each linked under its own file name.
    Path localPath = new Path(ContainerLocalizer.USERCACHE + Path.SEPARATOR
        + user + Path.SEPARATOR
        + ContainerLocalizer.APPCACHE + Path.SEPARATOR
        + appIdStr + Path.SEPARATOR
        + containerID + Path.SEPARATOR
        + "jobConfDir");
    Path jobConfDir = null;
    try {
      jobConfDir = dirsHandler.getLocalPathToRead(localPath.toString());
    } catch (IOException e) {
      LOG.warn(e.getMessage());
    }
    if (jobConfDir != null) {
      File root = new File(jobConfDir.toString());
      File[] files = root.listFiles();
      if (files != null) {
        for (File file : files) {
          if (file.isDirectory() && file.getName().startsWith("jobconf_")) {
            File[] subFiles = file.listFiles();
            if (subFiles != null) {
              for (File subFile : subFiles) {
                List<String> xmlFileLink = new ArrayList<String>();
                Path xmlFile = new Path(subFile.getPath());
                xmlFileLink.add(subFile.getName());
                localResources.put(xmlFile, xmlFileLink);
              }
            }
          }
        }
      }
    }
    List<String> containerLogDirs = new ArrayList<String>();
    for( String logDir : logDirs) {
      containerLogDirs.add(logDir + Path.SEPARATOR + relativeContainerLogDir);
    }

    if (!dirsHandler.areDisksHealthy()) {
      ret = ContainerExitStatus.DISKS_FAILED;
      throw new IOException("Most of the disks failed. "
          + dirsHandler.getDisksHealthReport(false));
    }

    try {
      // /// Write out the container-script in the nmPrivate space.
      List<Path> appDirs = new ArrayList<Path>(localDirs.size());
      for (String localDir : localDirs) {
        Path usersdir = new Path(localDir, ContainerLocalizer.USERCACHE);
        Path userdir = new Path(usersdir, user);
        Path appsdir = new Path(userdir, ContainerLocalizer.APPCACHE);
        appDirs.add(new Path(appsdir, appIdStr));
      }
      containerScriptOutStream =
        lfs.create(nmPrivateContainerScriptPath,
            EnumSet.of(CREATE, OVERWRITE));

      // Set the token location too.
      environment.put(
          ApplicationConstants.CONTAINER_TOKEN_FILE_ENV_NAME,
          new Path(containerWorkDir,
              FINAL_CONTAINER_TOKENS_FILE).toUri().getPath());
      // Sanitize the container's environment
      sanitizeEnv(environment, containerWorkDir, appDirs, containerLogDirs,
        localResources, nmPrivateClasspathJarDir);

      // Write out the environment
      exec.writeLaunchEnv(containerScriptOutStream, environment, localResources,
          launchContext.getCommands());

      // /// End of writing out container-script

      // /// Write out the container-tokens in the nmPrivate space.
      tokensOutStream =
          lfs.create(nmPrivateTokensPath, EnumSet.of(CREATE, OVERWRITE));
      Credentials creds = container.getCredentials();
      creds.writeTokenStorageToStream(tokensOutStream);
      // /// End of writing out container-tokens
    } finally {
      IOUtils.cleanup(LOG, containerScriptOutStream, tokensOutStream);
    }

    // LaunchContainer is a blocking call. We are here almost means the
    // container is launched, so send out the event.
    dispatcher.getEventHandler().handle(new ContainerEvent(
          containerID,
          ContainerEventType.CONTAINER_LAUNCHED));
    context.getNMStateStore().storeContainerLaunched(containerID);

    // Check if the container is signalled to be killed.
    if (!shouldLaunchContainer.compareAndSet(false, true)) {
      LOG.info("Container " + containerIdStr + " not launched as "
          + "cleanup already called");
      ret = ExitCode.TERMINATED.getExitCode();
    }
    else {
      // Free the reserved port right before the container is started.
      this.reservedSocket.close();
      exec.activateContainer(containerID, pidFilePath);
      ret = exec.launchContainer(container, nmPrivateContainerScriptPath,
              nmPrivateTokensPath, user, appIdStr, containerWorkDir,
              localDirs, logDirs, taskLogExceeded);
    }
  } catch (Throwable e) {
    LOG.warn("Failed to launch container.", e);
    dirsHandler.activeCheckDirs();
    dispatcher.getEventHandler().handle(new ContainerExitEvent(
        containerID, ContainerEventType.CONTAINER_EXITED_WITH_FAILURE, ret,
        e.getMessage()));
    return ret;
  } finally {
    completed.set(true);
    exec.deactivateContainer(containerID);
     stopped = true;
    // Release the port reservation on the paths that never reached the
    // close() above (launch failure, cleanup already called).
    releaseReservedSocket();
    try {
      if (taskLogLimitEnable && logdirMonitor != null) {
        logdirMonitor.interrupt();
        logdirMonitor.join(THREAD_JOIN_TIMEOUT_MS);
      }
    } catch (InterruptedException e) {
      // The interrupt flag is deliberately left cleared here because events
      // are still dispatched after this block.
      LOG.warn("Interrupted while waiting for logdirMonitor of container "
          + containerID + " to stop", e);
    }
    try {
      context.getNMStateStore().storeContainerCompleted(containerID, ret);
    } catch (IOException e) {
      LOG.error("Unable to set exit code for container " + containerID, e);
    }
  }

  if (LOG.isDebugEnabled()) {
    LOG.debug("Container " + containerIdStr + " completed with exit code "
              + ret);
  }
  if (ret == ExitCode.FORCE_KILLED.getExitCode()
      || ret == ExitCode.TERMINATED.getExitCode()) {
    // If the process was killed, Send container_cleanedup_after_kill and
    // just break out of this method.
    dispatcher.getEventHandler().handle(
          new ContainerExitEvent(containerID,
              ContainerEventType.CONTAINER_KILLED_ON_REQUEST, ret,
              "Container exited with a non-zero exit code " + ret));
    return ret;
  }

  if (ret != 0) {
    dirsHandler.activeCheckDirs();
    LOG.warn("Container exited with a non-zero exit code " + ret);
    this.dispatcher.getEventHandler().handle(new ContainerExitEvent(
        containerID,
        ContainerEventType.CONTAINER_EXITED_WITH_FAILURE, ret,
        "Container exited with a non-zero exit code " + ret));
    return ret;
  }

  LOG.info("Container " + containerIdStr + " succeeded ");
  dispatcher.getEventHandler().handle(
      new ContainerEvent(containerID,
          ContainerEventType.CONTAINER_EXITED_WITH_SUCCESS));
  return 0;
}

/**
 * Best-effort release of the port reserved through {@code reservedSocket}.
 * Assumes {@code reservedSocket} is a java.net socket, for which closing an
 * already closed socket is a no-op; a failure to close is only logged.
 */
private void releaseReservedSocket() {
  try {
    reservedSocket.close();
  } catch (IOException e) {
    LOG.warn("Unable to close reserved socket for container "
        + container.getContainerId(), e);
  }
}

storeContainerCompleted第二处 --RecoveredContainerLaunch

/**
 * Re-attaches to a container that was recovered from the state store and
 * waits for its exit code.
 *
 * <p>The exit code defaults to {@code ExitCode.LOST} and is only replaced
 * when the pid file is located and the executor reacquires the container.
 * Unless the wait was interrupted, the container is marked completed and
 * its exit code is written to the state store.
 *
 * @return 0 when the recovered container exited successfully, otherwise the
 *         non-zero exit code
 */
@SuppressWarnings("unchecked")
@Override
public Integer call() {
  int retCode = ExitCode.LOST.getExitCode();
  ContainerId containerId = container.getContainerId();
  String appIdStr = ConverterUtils.toString(
      containerId.getApplicationAttemptId().getApplicationId());
  String containerIdStr = ConverterUtils.toString(containerId);
  dispatcher.getEventHandler().handle(new ContainerEvent(containerId,
      ContainerEventType.CONTAINER_LAUNCHED));
  boolean notInterrupted = true;
  try {
    File pidFile = locatePidFile(appIdStr, containerIdStr);
    if (pidFile != null) {
      String pidPathStr = pidFile.getPath();
      pidFilePath = new Path(pidPathStr);
      exec.activateContainer(containerId, pidFilePath);
      retCode = exec.reacquireContainer(container.getUser(), containerId);
    } else {
      LOG.warn("Unable to locate pid file for container " + containerIdStr);
    }
  } catch (IOException e) {
      LOG.error("Unable to recover container " + containerIdStr, e);
  } catch (InterruptedException e) {
    LOG.warn("Interrupted while waiting for exit code from " + containerId);
    notInterrupted = false;
  } finally {
    // When interrupted, no completion is recorded in the state store.
    if (notInterrupted) {
      this.completed.set(true);
      exec.deactivateContainer(containerId);
      try {
        getContext().getNMStateStore().storeContainerCompleted(containerId,
            retCode);
      } catch (IOException e) {
        // Include the cause so a failing state store can be diagnosed.
        LOG.error("Unable to set exit code for container " + containerId, e);
      }
    }
  }

  if (retCode != 0) {
    LOG.warn("Recovered container exited with a non-zero exit code "
        + retCode);
    this.dispatcher.getEventHandler().handle(new ContainerExitEvent(
        containerId,
        ContainerEventType.CONTAINER_EXITED_WITH_FAILURE, retCode,
        "Container exited with a non-zero exit code " + retCode));
    return retCode;
  }

  LOG.info("Recovered container " + containerId + " succeeded");
  dispatcher.getEventHandler().handle(
      new ContainerEvent(containerId,
          ContainerEventType.CONTAINER_EXITED_WITH_SUCCESS));
  return 0;
}

storeContainerKilled调用处

1 --ContainerLaunch

 /**
  * Cleans up after a container that is being killed.
  *
  * <p>Steps, in order: mark the container killed in the state store
  * (failure is logged only); return early if the container process was
  * never launched; deactivate the container in the executor; signal the
  * process found via the pid file (TERM followed by a delayed KILL when
  * {@code sleepDelayBeforeSigKill > 0}, otherwise KILL); for containers of
  * type "docker", run {@code <docker executor> stop <containerId>}; finally
  * delete the pid file and its exit-code file.
  *
  * @throws IOException if deleting the pid file or its exit-code file fails
  */
 @SuppressWarnings("unchecked") // dispatcher not typed
  public void cleanupContainer() throws IOException {
    ContainerId containerId = container.getContainerId();
    String containerIdStr = ConverterUtils.toString(containerId);
    LOG.info("Cleaning up container " + containerIdStr);

    try {
      context.getNMStateStore().storeContainerKilled(containerId);
    } catch (IOException e) {
      LOG.error("Unable to mark container " + containerId
          + " killed in store", e);
    }

    // launch flag will be set to true if process already launched
    boolean alreadyLaunched = !shouldLaunchContainer.compareAndSet(false, true);
    if (!alreadyLaunched) {
      LOG.info("Container " + containerIdStr + " not launched."
          + " No cleanup needed to be done");
      return;
    }

    LOG.debug("Marking container " + containerIdStr + " as inactive");
    // this should ensure that if the container process has not launched
    // by this time, it will never be launched
    exec.deactivateContainer(containerId);

    if (LOG.isDebugEnabled()) {
      LOG.debug("Getting pid for container " + containerIdStr + " to kill"
          + " from pid file "
          + (pidFilePath != null ? pidFilePath.toString() : "null"));
    }

    // however the container process may have already started
    try {

      // get process id from pid file if available
      // else if shell is still active, get it from the shell
      String processId = null;
      if (pidFilePath != null) {
        processId = getContainerPid(pidFilePath);
      }

      // kill process
      if (processId != null) {
        String user = container.getUser();
        LOG.debug("Sending signal to pid " + processId
            + " as user " + user
            + " for container " + containerIdStr);

        final Signal signal = sleepDelayBeforeSigKill > 0
          ? Signal.TERM
          : Signal.KILL;

        boolean result = exec.signalContainer(user, processId, signal);

        LOG.debug("Sent signal " + signal + " to pid " + processId
          + " as user " + user
          + " for container " + containerIdStr
          + ", result=" + (result? "success" : "failed"));

        if (sleepDelayBeforeSigKill > 0) {
          new DelayedProcessKiller(container, user,
              processId, sleepDelayBeforeSigKill, Signal.KILL, exec).start();
        }
      }

      if(containerType !=null && containerType.toLowerCase().equals("docker")) {
        LOG.info("docker begin to stop container.");
        // NOTE(review): the executor path is taken from the container's
        // launch-context environment first. If that environment is supplied
        // by the submitter, it decides which binary the NodeManager runs -
        // confirm this is intended.
        String dockerExecutor = container.getLaunchContext().getEnvironment().get(YarnConfiguration.NM_DOCKER_CONTAINER_EXECUTOR_EXEC_NAME);
        if(dockerExecutor == null || dockerExecutor.equals("")) {
          dockerExecutor = conf.get(YarnConfiguration.NM_DOCKER_CONTAINER_EXECUTOR_EXEC_NAME,
                  YarnConfiguration.NM_DEFAULT_DOCKER_CONTAINER_EXECUTOR_EXEC_NAME);
        }
        if (!new File(dockerExecutor).exists()) {
          throw new IllegalStateException("Invalid docker exec path: " + dockerExecutor);
        }
        String command = dockerExecutor + " stop " + containerId.toString();
        LOG.info("command is:" + command);
        // Pass the arguments as a list instead of one command string:
        // Runtime.exec(String) splits on whitespace, which breaks an
        // executor path that contains spaces.
        final Process stopProcess = new ProcessBuilder(
            dockerExecutor, "stop", containerId.toString()).start();
        int code = stopProcess.waitFor();
        LOG.info("docker stoped container, exit code is " + code);
      }

    } catch (Exception e) {
      // Any failure while killing is reported as a diagnostics update on the
      // container instead of being propagated.
      String message =
          "Exception when trying to cleanup container " + containerIdStr
              + ": " + StringUtils.stringifyException(e);
      LOG.warn(message);
      dispatcher.getEventHandler().handle(
        new ContainerDiagnosticsUpdateEvent(containerId, message));
    } finally {
      // cleanup pid file if present
      if (pidFilePath != null) {
        FileContext lfs = FileContext.getLocalFSFileContext();
        lfs.delete(pidFilePath, false);
        lfs.delete(pidFilePath.suffix(EXIT_CODE_FILE_SUFFIX), false);
      }
    }
  }

2 --ContainerManagerImpl

/**
 * Handles a stop request for a single container.
 *
 * <p>The request is authorized first. If this NodeManager is not tracking
 * the container, the call only succeeds when the node status updater
 * reports the container as recently stopped. For a tracked container the
 * "killed" marker is persisted to the NM state store, then the method polls
 * the container's exit code (every
 * CONTAINER_KILLED_BY_AM_TIMEOUT_MONITOR_INTERVAL_MS, for at most
 * CONTAINER_KILLED_BY_AM_TIMEOUT_MS) before dispatching a
 * {@link ContainerKillEvent}, writing an audit record and requesting an
 * out-of-band heartbeat.
 *
 * @param nmTokenIdentifier NM token identifier of the caller, passed to the
 *          authorization check
 * @param containerID id of the container to stop
 * @throws YarnException if the container is neither tracked by this
 *           NodeManager nor recently stopped (authorization may presumably
 *           also raise it; see authorizeGetAndStopContainerRequest)
 * @throws IOException declared by the signature; presumably propagated from
 *           the state store
 */
@SuppressWarnings("unchecked")
private void stopContainerInternal(NMTokenIdentifier nmTokenIdentifier,
    ContainerId containerID) throws YarnException, IOException {
  String containerIDStr = containerID.toString();
  Container container = this.context.getContainers().get(containerID);
  LOG.info("Stopping container with container Id: " + containerIDStr);
  authorizeGetAndStopContainerRequest(containerID, container, true,
    nmTokenIdentifier);

  if (container == null) {
    if (!nodeStatusUpdater.isContainerRecentlyStopped(containerID)) {
      throw RPCUtil.getRemoteException("Container " + containerIDStr
        + " is not handled by this NodeManager");
    }
    return;
  }

  // Persist the kill request before doing anything else.
  context.getNMStateStore().storeContainerKilled(containerID);

  YarnConfiguration conf = new YarnConfiguration();
  long containerKilledByAmTimeoutInterval = conf
      .getLong(YarnConfiguration.CONTAINER_KILLED_BY_AM_TIMEOUT_MONITOR_INTERVAL_MS,
          YarnConfiguration.DEFAULT_CONTAINER_KILLED_BY_AM_TIMEOUT_MONITOR_INTERVAL_MS);
  long containerKilledByAmTimeout = conf
      .getLong(YarnConfiguration.CONTAINER_KILLED_BY_AM_TIMEOUT_MS,
          YarnConfiguration.DEFAULT_CONTAINER_KILLED_BY_AM_TIMEOUT_MS);

  long startTime = System.currentTimeMillis();
  // Poll the container reference obtained above instead of looking it up in
  // the context map on every iteration: the entry can be removed while we
  // wait, and dereferencing the missing entry would throw a
  // NullPointerException.
  while (container.getExitCode() == ContainerExitStatus.INVALID) {
    if (System.currentTimeMillis() - startTime > containerKilledByAmTimeout) {
      break;
    }
    try {
      Thread.sleep(containerKilledByAmTimeoutInterval);
    } catch (InterruptedException e) {
      // Restore the interrupt flag and stop waiting; the kill event below
      // is still dispatched so the stop request is not lost.
      LOG.warn("Interrupted while waiting for exit code of container "
          + containerIDStr, e);
      Thread.currentThread().interrupt();
      break;
    }
  }
  // Measure after the loop so the reported wait includes the last sleep.
  long sleepTime = System.currentTimeMillis() - startTime;
  LOG.info("container " + containerID
      + " time for Waiting exitCode is " + sleepTime
      + "ms, with containerKilledByAmTimeoutInterval " +  containerKilledByAmTimeoutInterval
      + "ms containerKilledByAmTimeout " + containerKilledByAmTimeout + "ms");
  dispatcher.getEventHandler().handle(
    new ContainerKillEvent(containerID,
        ContainerExitStatus.KILLED_BY_APPMASTER,
        "Container killed by the ApplicationMaster."));

  NMAuditLogger.logSuccess(container.getUser(),
    AuditConstants.STOP_CONTAINER, "ContainerManageImpl", containerID
      .getApplicationAttemptId().getApplicationId(), containerID);

  // TODO: Move this code to appropriate place once kill_container is
  // implemented.
  nodeStatusUpdater.sendOutofBandHeartBeat();
}

关于launchContainer方法

  1. ContainerManagerImpl 的recoverContainer 在 loadContainer最后

    // Excerpt: the container is only rebuilt when its application is still
    // known to this NodeManager.
    if (context.getApplications().containsKey(appId)) {
      Credentials credentials = parseCredentials(launchContext);
      // Rebuild the container from the recovered state (status, exit code,
      // diagnostics, killed flag) and register it in the NM context.
      Container container = new ContainerImpl(getConfig(), dispatcher,
          context.getNMStateStore(), req.getContainerLaunchContext(),
          credentials, metrics, token, rcs.getStatus(), rcs.getExitCode(),
          rcs.getDiagnostics(), rcs.getKilled());
      context.getContainers().put(containerId, container);
      // Once the ContainerImpl exists, dispatch an
      // ApplicationContainerInitEvent that carries the new container.
      dispatcher.getEventHandler().handle(
          new ApplicationContainerInitEvent(container));
    } else {
      // No matching application: warn unless the container had already
      // completed, then record it with the node status updater.
      if (rcs.getStatus() != RecoveredContainerStatus.COMPLETED) {
        LOG.warn(containerId + " has no corresponding application!");
      }
      LOG.info("Adding " + containerId + " to recently stopped containers");
      nodeStatusUpdater.addCompletedContainer(containerId);
    }
    
    //关于这个handle 事件 , 
    
    
  2. AppInitContainer事件

    /**  
     * Event sent from {@link ContainerManagerImpl} to {@link ApplicationImpl} to  
     * request the initialization of a container. This is funneled through
     * the Application so that the application life-cycle can be checked, and container
     * launches can be delayed until the application is fully initialized.
     * 
     * Once the application is initialized,
     * {@link ApplicationImpl.InitContainerTransition} simply passes this event on as a
     * {@link ContainerInitEvent}.
     *  
     */
    
    //可以看出ApplicationImpl 会处理这样一个来自于ContainerManagerImpl的事件---从注释 
    //我点进来只看到这个事件的定义 ,但是并没有看到处理的过程 ,处理并不是在所谓的super里 
    //如同上面的dispatcher.getEventHandler.handle(事件)--我现在看见的是事件,真正触发事件的是这个
    //handle ,我点进来事件可以看上上面 注释的谁来handle这个事件,这才是分析的思路 ,ApplicationImpl
    //就是处理者
    // Application-level event that carries the container to be initialized.
    public class ApplicationContainerInitEvent extends ApplicationEvent {
      // The container this event asks the application to initialize.
      final Container container;
      
      public ApplicationContainerInitEvent(Container container) {
          // Tag the event with type INIT_CONTAINER and with the application
          // id derived from the container id.
        super(container.getContainerId().getApplicationAttemptId()
            .getApplicationId(), ApplicationEventType.INIT_CONTAINER);
        this.container = container;
      }
    
      // Returns the container passed to the constructor.
      Container getContainer() {
        return container;
      }
    }
    
  3. ApplicationImpl 重点handle 方法

    /**
     * Feeds an application event into the state machine while holding the
     * write lock, and logs the state change when one actually happened.
     *
     * <p>An event that is not valid in the current state is logged as a
     * warning and otherwise ignored; no "transitioned" line is written for
     * it.
     *
     * @param event the application event to process
     */
    @Override
    public void handle(ApplicationEvent event) {
      this.writeLock.lock();

      try {
        ApplicationId applicationID = event.getApplicationID();
        LOG.debug("Processing " + applicationID + " of type " + event.getType());

        ApplicationState oldState = stateMachine.getCurrentState();
        ApplicationState newState = null;
        try {
          // queue event requesting init of the same app
          // Runs the transition registered for this event type.
          newState = stateMachine.doTransition(event.getType(), event);
        } catch (InvalidStateTransitonException e) {
          LOG.warn("Can't handle this event at current state", e);
        }
        // newState stays null when the transition was rejected above; skip
        // the log in that case instead of reporting "... to null".
        // Example of the line produced here: "Application application_... 
        // transitioned from INITING to RUNNING".
        if (newState != null && oldState != newState) {
          LOG.info("Application " + applicationID + " transitioned from "
              + oldState + " to " + newState);
        }
      } finally {
        this.writeLock.unlock();
      }
    }
    
  4. 关注一下可以看出ApplicationImpl中 INIT_CONTAINER 事件处理

    // Excerpt of the ApplicationImpl state machine definition, showing only
    // the transitions relevant to INIT_CONTAINER; the chain is not complete.
    private static StateMachineFactory<ApplicationImpl, ApplicationState,
            ApplicationEventType, ApplicationEvent> stateMachineFactory =
        new StateMachineFactory<ApplicationImpl, ApplicationState,
            ApplicationEventType, ApplicationEvent>(ApplicationState.NEW)
    
             // Transitions from NEW state
             .addTransition(ApplicationState.NEW, ApplicationState.INITING,
                 ApplicationEventType.INIT_APPLICATION, new AppInitTransition())
             .addTransition(ApplicationState.NEW, ApplicationState.NEW,
                 ApplicationEventType.INIT_CONTAINER,
                 new InitContainerTransition())  // To trace an event, look at the transition registered for it: for INIT_CONTAINER that is InitContainerTransition.
                  // Transitions from RUNNING state
               .addTransition(ApplicationState.RUNNING,
                   ApplicationState.RUNNING,
                   ApplicationEventType.INIT_CONTAINER,
                   new InitContainerTransition())
                
    

    4 关注appImpl中的 new InitContainerTransition

    /**
     * Transition executed when an application receives INIT_CONTAINER.
     *
     * The container carried by the event is recorded in the application's
     * container map. If the application is RUNNING, a ContainerInitEvent for
     * that container is dispatched right away; in INITING or NEW nothing
     * more is done here.
     */
    @SuppressWarnings("unchecked")
    static class InitContainerTransition implements
        SingleArcTransition<ApplicationImpl, ApplicationEvent> {
      @Override
      public void transition(ApplicationImpl app, ApplicationEvent event) {
        // The generic ApplicationEvent is really the init event that carries
        // the container handed over by the sender.
        final Container incoming =
            ((ApplicationContainerInitEvent) event).getContainer();
        app.containers.put(incoming.getContainerId(), incoming);
        LOG.info("Adding " + incoming.getContainerId()
            + " to application " + app.toString());

        final ApplicationState current = app.getApplicationState();
        if (current == ApplicationState.RUNNING) {
          // Application already running: forward the init request to the
          // container immediately.
          app.dispatcher.getEventHandler().handle(new ContainerInitEvent(
              incoming.getContainerId()));
        } else if (current == ApplicationState.INITING
            || current == ApplicationState.NEW) {
          // these get queued up and sent out in AppInitDoneTransition
        } else {
          assert false : "Invalid state for InitContainerTransition: " +
              app.getApplicationState();
        }
      }
    }
    

    5 ContainerImpl的状态机

    // Excerpt of the ContainerImpl state machine definition: only the
    // INIT_CONTAINER arc out of NEW is shown; the chain is not complete.
    private static StateMachineFactory
             <ContainerImpl, ContainerState, ContainerEventType, ContainerEvent>
          stateMachineFactory =
        new StateMachineFactory<ContainerImpl, ContainerState, ContainerEventType, ContainerEvent>(ContainerState.NEW)
      // From NEW State
      .addTransition(ContainerState.NEW,
          EnumSet.of(ContainerState.LOCALIZING,
              ContainerState.LOCALIZED,
              ContainerState.LOCALIZATION_FAILED,
              ContainerState.DONE),
          ContainerEventType.INIT_CONTAINER, new RequestResourcesTransition())   // INIT_CONTAINER is handled by RequestResourcesTransition
    

    6去containerImpl中关注INIT_CONTAINER

    /**
     * Handles INIT_CONTAINER for a container in the NEW state.
     *
     * Outcomes, in the order they are checked:
     *  - recovered as COMPLETED: finished events are sent, result is DONE;
     *  - recovered as killed while still REQUESTED: metrics and audit log are
     *    updated, finished events are sent, result is DONE;
     *  - no local resources: a launch event is sent, result is LOCALIZED;
     *  - a resource cannot be parsed: the container is cleaned up, result is
     *    LOCALIZATION_FAILED;
     *  - otherwise a ContainerLocalizationRequestEvent is dispatched and the
     *    result is LOCALIZING.
     */
    @SuppressWarnings("unchecked") // dispatcher not typed
    static class RequestResourcesTransition implements
        MultipleArcTransition<ContainerImpl,ContainerEvent,ContainerState> {
      @Override
      public ContainerState transition(ContainerImpl container,
          ContainerEvent event) {
        // Recovery filter, step 1: the container had already completed.
        if (container.recoveredStatus == RecoveredContainerStatus.COMPLETED) {
          LOG.info("DEBUG: static class RequestResourcesTransitioncontainer.recoveredStatus == RecoveredContainerStatus.COMPLETED ,,,container.sendFinishedEvents();");
          container.sendFinishedEvents();
          return ContainerState.DONE;
        }

        // Recovery filter, step 2: container was killed but never launched.
        if (container.recoveredAsKilled
            && container.recoveredStatus == RecoveredContainerStatus.REQUESTED) {
          LOG.info("DEBUG:static class RequestResourcesTransitioncontainer.recoveredAsKilled &&\n" +
                  "          container.recoveredStatus == RecoveredContainerStatus.REQUESTED,,,container.metrics.killedContainer();");
          container.metrics.killedContainer();
          NMAuditLogger.logSuccess(container.user,
              AuditConstants.FINISH_KILLED_CONTAINER, "ContainerImpl",
              container.containerId.getApplicationAttemptId().getApplicationId(),
              container.containerId);
          container.metrics.releaseContainer(container.resource);
          container.sendFinishedEvents();
          return ContainerState.DONE;
        }

        final ContainerLaunchContext launchCtx = container.launchContext;
        container.metrics.initingContainer();

        container.dispatcher.getEventHandler().handle(
            new AuxServicesEvent(AuxServicesEventType.CONTAINER_INIT, container));

        // Pass the opaque per-service data on to the auxiliary services.
        // This can happen more than once per application because every
        // container may carry its own service data.
        final Map<String,ByteBuffer> serviceData = launchCtx.getServiceData();
        if (serviceData != null) {
          for (Map.Entry<String,ByteBuffer> svc : serviceData.entrySet()) {
            container.dispatcher.getEventHandler().handle(
                new AuxServicesEvent(AuxServicesEventType.APPLICATION_INIT,
                    container.user, container.containerId
                        .getApplicationAttemptId().getApplicationId(),
                    svc.getKey().toString(), svc.getValue()));
          }
        }

        // A recovered container still carries its local resource list.
        final Map<String,LocalResource> localResources =
            launchCtx.getLocalResources();
        if (localResources.isEmpty()) {
          // Nothing to localize: go straight to launch.
          // NOTE(review): per the author's trace, the recovery run did not
          // reach this branch.
          container.sendLaunchEvent();
          container.metrics.endInitingContainer();
          return ContainerState.LOCALIZED;
        }

        // NOTE(review): per the author's trace, recovered containers take
        // this path.
        try {
          for (Map.Entry<String,LocalResource> entry
              : localResources.entrySet()) {
            final String linkName = entry.getKey();
            final LocalResource resource = entry.getValue();
            try {
              if (resource.getResource() == null) {
                // Entries without a resource location are skipped.
                continue;
              }
              final LocalResourceRequest request =
                  new LocalResourceRequest(resource);
              List<String> symlinks = container.pendingResources.get(request);
              if (symlinks == null) {
                symlinks = new ArrayList<String>();
                container.pendingResources.put(request, symlinks);
              }
              symlinks.add(linkName);
              storeSharedCacheUploadPolicy(container, request,
                  resource.getShouldBeUploadedToSharedCache());
              switch (resource.getVisibility()) {
              case PUBLIC:
                container.publicRsrcs.add(request);
                break;
              case PRIVATE:
                container.privateRsrcs.add(request);
                break;
              case APPLICATION:
                container.appRsrcs.add(request);
                break;
              }
            } catch (URISyntaxException e) {
              LOG.info("Got exception parsing " + entry.getKey()
                  + " and value " + entry.getValue());
              throw e;
            }
          }
        } catch (URISyntaxException e) {
          // malformed resource; abort container launch
          LOG.warn("Failed to parse resource-request", e);
          container.cleanup();
          container.metrics.endInitingContainer();
          return ContainerState.LOCALIZATION_FAILED;
        }

        // Group the pending requests by visibility, leaving out empty groups.
        final Map<LocalResourceVisibility, Collection<LocalResourceRequest>>
            byVisibility = new LinkedHashMap<LocalResourceVisibility,
                Collection<LocalResourceRequest>>();
        if (!container.publicRsrcs.isEmpty()) {
          byVisibility.put(LocalResourceVisibility.PUBLIC,
              container.publicRsrcs);
        }
        if (!container.privateRsrcs.isEmpty()) {
          byVisibility.put(LocalResourceVisibility.PRIVATE,
              container.privateRsrcs);
        }
        if (!container.appRsrcs.isEmpty()) {
          byVisibility.put(LocalResourceVisibility.APPLICATION,
              container.appRsrcs);
        }

        container.dispatcher.getEventHandler().handle(
            new ContainerLocalizationRequestEvent(container, byVisibility));
        return ContainerState.LOCALIZING;
      }
    }
    

重启时调用launchContainer方法的地方

/**
 * Runs each time one of the container's requested resources has been
 * localized. The container stays in LOCALIZING until no pending resource is
 * left; at that point the localized notification and the launch event are
 * sent and the container moves to LOCALIZED.
 */
static class LocalizedTransition implements
    MultipleArcTransition<ContainerImpl,ContainerEvent,ContainerState> {
  @SuppressWarnings("unchecked")
  @Override
  public ContainerState transition(ContainerImpl container,
      ContainerEvent event) {
    LOG.info("DEBUG: We come to LocalizedTransition ");
    final ContainerResourceLocalizedEvent localizedEvent =
        (ContainerResourceLocalizedEvent) event;
    final LocalResourceRequest localizedRequest = localizedEvent.getResource();
    final Path localPath = localizedEvent.getLocation();

    final List<String> linkNames =
        container.pendingResources.remove(localizedRequest);
    if (linkNames == null) {
      // The resource was never requested by this container.
      LOG.warn("Localized unknown resource " + localizedRequest +
               " for container " + container.containerId);
      assert false;
      // fail container?
      return ContainerState.LOCALIZING;
    }
    container.localizedResources.put(localPath, linkNames);

    // Remember the resource if it should also go to the shared cache.
    if (shouldBeUploadedToSharedCache(container, localizedRequest)) {
      container.resourcesToBeUploaded.put(localizedRequest, localPath);
    }

    final boolean stillWaiting = !container.pendingResources.isEmpty();
    if (stillWaiting) {
      return ContainerState.LOCALIZING;
    }

    LOG.info("DEBUG: LocalizedTransition:  container.dispatcher.getEventHandler().handle(\n" +
            "          new ContainerLocalizationEvent(LocalizationEventType.\n" +
            "              CONTAINER_RESOURCES_LOCALIZED, container));");
    container.dispatcher.getEventHandler().handle(
        new ContainerLocalizationEvent(
            LocalizationEventType.CONTAINER_RESOURCES_LOCALIZED, container));

    // NOTE(review): per the author's trace, containers recovered in the
    // LAUNCHED state reach this point after an NM restart.
    LOG.info("DEBUG : LocalizedTransition :  container.sendLaunchEvent(); ContainerId = "
        + container.getContainerId()
        + "containerNMStatus = "
        + container.getNMContainerStatus()
        + " containerState = "
        + container.getContainerState());

    container.sendLaunchEvent();
    container.metrics.endInitingContainer();

    // A recovered container that was already launched (or completed) does
    // not upload to the shared cache again, so the same resources are not
    // uploaded multiple times. The price is that a recovered container's
    // resources may never reach the shared cache; that is acceptable because
    // the SCM only acknowledges resources once the node manager has uploaded
    // them.
    final boolean recoveredPastLaunch =
        container.recoveredStatus == RecoveredContainerStatus.LAUNCHED
            || container.recoveredStatus == RecoveredContainerStatus.COMPLETED;
    if (!recoveredPastLaunch) {
      // kick off uploads to the shared cache
      container.dispatcher.getEventHandler().handle(
          new SharedCacheUploadEvent(container.resourcesToBeUploaded,
              container.getLaunchContext(), container.getUser(),
              SharedCacheUploadEventType.UPLOAD));
    }

    return ContainerState.LOCALIZED;
  }
}
/**
 * Hands this container to the containers launcher. A container recovered in
 * the LAUNCHED state is sent as RECOVER_CONTAINER; every other container is
 * sent as LAUNCH_CONTAINER. The launch start time is stamped just before the
 * event is dispatched.
 */
@SuppressWarnings("unchecked") // dispatcher not typed
private void sendLaunchEvent() {
  final boolean previouslyLaunched =
      recoveredStatus == RecoveredContainerStatus.LAUNCHED;
  if (previouslyLaunched) {
    LOG.info("DEBUG: sendLaunchEvent(),,,recoveredStatus == RecoveredContainerStatus.LAUNCHED containerId = "+ containerId);
  }
  // try to recover a container that was previously launched,
  // otherwise start it normally
  final ContainersLauncherEventType eventType = previouslyLaunched
      ? ContainersLauncherEventType.RECOVER_CONTAINER
      : ContainersLauncherEventType.LAUNCH_CONTAINER;

  containerLaunchStartTime = clock.getTime();
  dispatcher.getEventHandler().handle(
      new ContainersLauncherEvent(this, eventType));
}

关于ContainerImpl中的sendFinishedEvents()

首先看sendFinishedEvents方法本身

/**
 * Announces that this container has finished by dispatching three events
 * through the shared dispatcher, in this order: to the application, to the
 * resource monitor, and to the log handler (with the exit code).
 */
private void sendFinishedEvents() {
  @SuppressWarnings("rawtypes")
  EventHandler sink = dispatcher.getEventHandler();

  // Inform the application
  sink.handle(new ApplicationContainerFinishedEvent(containerId));
  LOG.info("DEBUG: eventHandler.handle(new ApplicationContainerFinishedEvent(containerId));: sendFinishEvents(): containerId = "+containerId);

  // Remove the container from the resource-monitor
  sink.handle(new ContainerStopMonitoringEvent(containerId));

  // Tell the logService too
  sink.handle(
      new LogHandlerContainerFinishedEvent(containerId, exitCode));
}

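然后看是何处调用了sendFinishedEvents。其实它只是状态机中的一小步:在recover过程中,如果发现恢复出来的status是COMPLETED(或者container已被kill但还停留在REQUESTED、尚未launch),RequestResourcesTransition就会调用sendFinishedEvents发送这些事件,并直接进入DONE状态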

/**
 * Handles INIT_CONTAINER for a container in the NEW state.
 *
 * Outcomes, in the order they are checked:
 *  - recovered as COMPLETED: finished events are sent, result is DONE;
 *  - recovered as killed while still REQUESTED: metrics and audit log are
 *    updated, finished events are sent, result is DONE;
 *  - no local resources: a launch event is sent, result is LOCALIZED;
 *  - a resource cannot be parsed: the container is cleaned up, result is
 *    LOCALIZATION_FAILED;
 *  - otherwise a ContainerLocalizationRequestEvent is dispatched and the
 *    result is LOCALIZING.
 */
@SuppressWarnings("unchecked") // dispatcher not typed
static class RequestResourcesTransition implements
    MultipleArcTransition<ContainerImpl,ContainerEvent,ContainerState> {
  @Override
  public ContainerState transition(ContainerImpl container,
      ContainerEvent event) {
    // Recovered container that had already completed.
    if (container.recoveredStatus == RecoveredContainerStatus.COMPLETED) {
      LOG.info("DEBUG: static class RequestResourcesTransitioncontainer.recoveredStatus == RecoveredContainerStatus.COMPLETED ,,,container.sendFinishedEvents();");
      container.sendFinishedEvents();
      return ContainerState.DONE;
    }

    // container was killed but never launched
    if (container.recoveredAsKilled
        && container.recoveredStatus == RecoveredContainerStatus.REQUESTED) {
      LOG.info("DEBUG:static class RequestResourcesTransitioncontainer.recoveredAsKilled &&\n" +
              "          container.recoveredStatus == RecoveredContainerStatus.REQUESTED,,,container.metrics.killedContainer();");
      container.metrics.killedContainer();
      NMAuditLogger.logSuccess(container.user,
          AuditConstants.FINISH_KILLED_CONTAINER, "ContainerImpl",
          container.containerId.getApplicationAttemptId().getApplicationId(),
          container.containerId);
      container.metrics.releaseContainer(container.resource);
      container.sendFinishedEvents();
      return ContainerState.DONE;
    }

    final ContainerLaunchContext launchCtx = container.launchContext;
    container.metrics.initingContainer();

    container.dispatcher.getEventHandler().handle(
        new AuxServicesEvent(AuxServicesEventType.CONTAINER_INIT, container));

    // Pass the opaque per-service data on to the auxiliary services.
    // This can happen more than once per application because every
    // container may carry its own service data.
    final Map<String,ByteBuffer> serviceData = launchCtx.getServiceData();
    if (serviceData != null) {
      for (Map.Entry<String,ByteBuffer> svc : serviceData.entrySet()) {
        container.dispatcher.getEventHandler().handle(
            new AuxServicesEvent(AuxServicesEventType.APPLICATION_INIT,
                container.user, container.containerId
                    .getApplicationAttemptId().getApplicationId(),
                svc.getKey().toString(), svc.getValue()));
      }
    }

    final Map<String,LocalResource> localResources =
        launchCtx.getLocalResources();
    if (localResources.isEmpty()) {
      // Nothing to localize: go straight to launch.
      container.sendLaunchEvent();
      container.metrics.endInitingContainer();
      return ContainerState.LOCALIZED;
    }

    // Send requests for public, private resources
    try {
      for (Map.Entry<String,LocalResource> entry
          : localResources.entrySet()) {
        final String linkName = entry.getKey();
        final LocalResource resource = entry.getValue();
        try {
          if (resource.getResource() == null) {
            // Entries without a resource location are skipped.
            continue;
          }
          final LocalResourceRequest request =
              new LocalResourceRequest(resource);
          List<String> symlinks = container.pendingResources.get(request);
          if (symlinks == null) {
            symlinks = new ArrayList<String>();
            container.pendingResources.put(request, symlinks);
          }
          symlinks.add(linkName);
          storeSharedCacheUploadPolicy(container, request,
              resource.getShouldBeUploadedToSharedCache());
          switch (resource.getVisibility()) {
          case PUBLIC:
            container.publicRsrcs.add(request);
            break;
          case PRIVATE:
            container.privateRsrcs.add(request);
            break;
          case APPLICATION:
            container.appRsrcs.add(request);
            break;
          }
        } catch (URISyntaxException e) {
          LOG.info("Got exception parsing " + entry.getKey()
              + " and value " + entry.getValue());
          throw e;
        }
      }
    } catch (URISyntaxException e) {
      // malformed resource; abort container launch
      LOG.warn("Failed to parse resource-request", e);
      container.cleanup();
      container.metrics.endInitingContainer();
      return ContainerState.LOCALIZATION_FAILED;
    }

    // Group the pending requests by visibility, leaving out empty groups.
    final Map<LocalResourceVisibility, Collection<LocalResourceRequest>>
        byVisibility = new LinkedHashMap<LocalResourceVisibility,
            Collection<LocalResourceRequest>>();
    if (!container.publicRsrcs.isEmpty()) {
      byVisibility.put(LocalResourceVisibility.PUBLIC,
          container.publicRsrcs);
    }
    if (!container.privateRsrcs.isEmpty()) {
      byVisibility.put(LocalResourceVisibility.PRIVATE,
          container.privateRsrcs);
    }
    if (!container.appRsrcs.isEmpty()) {
      byVisibility.put(LocalResourceVisibility.APPLICATION,
          container.appRsrcs);
    }

    container.dispatcher.getEventHandler().handle(
        new ContainerLocalizationRequestEvent(container, byVisibility));
    return ContainerState.LOCALIZING;
  }
}

DEBUG过程

storeContainer中

StartRequest对象

container_launch_context { localResources { key: "job.jar" value { resource { scheme: "hdfs" host: "jlnamenode1v.sys.lyct.qihoo.net" port: 9000 file: "/home/yarn/staging_wxc/yarn/.staging/job_1551148219423_0001/job.jar" } size: 3411 timestamp: 1551148802895 type: PATTERN visibility: APPLICATION pattern: "(?:classes/|lib/).*" } } localResources { key: "job.xml" value { resource { scheme: "hdfs" host: "jlnamenode1v.sys.lyct.qihoo.net" port: 9000 file: "/home/yarn/staging_wxc/yarn/.staging/job_1551148219423_0001/job.xml" } size: 108329 timestamp: 1551148804894 type: FILE visibility: APPLICATION } } tokens: "HDTS\000\001\bJobToken\027\026job_1551148219423_0001\024\\f5\n\273;\253\327f\232\231\2678\020X\241i\203@Z\rmapreduce.job\026job_1551148219423_0001\001\025MapReduceShuffleToken\b<\312\036\244\243\235\200\b" service_data { key: "mapreduce_shuffle" value: "\027\026job_1551148219423_0001\b<\312\036\244\243\235\200\b\rmapreduce.job\026job_1551148219423_0001" } environment { key: "HADOOP_CLIENT_OPTS" value: "-server -Xms5120m -Xmx5120m -XX:PermSize=1g -XX:MaxPermSize=1g " } environment { key: "SHELL" value: "/bin/bash" } environment { key: "HADOOP_CLASSPATH" value: "$PWD:job.jar/job.jar:job.jar/classes/:job.jar/lib/*:$PWD/*" } environment { key: "CLASSPATH" value: "$PWD:$HADOOP_SPINNER_CORE_DIR:$HADOOP_CONF_DIR:$HADOOP_COMMON_HOME/share/hadoop/common/*:$HADOOP_COMMON_HOME/share/hadoop/common/lib/*:$HADOOP_HDFS_HOME/share/hadoop/hdfs/*:$HADOOP_HDFS_HOME/share/hadoop/hdfs/lib/*:$HADOOP_YARN_HOME/share/hadoop/yarn/*:$HADOOP_YARN_HOME/share/hadoop/yarn/lib/*:$HADOOP_MAPRED_HOME/share/hadoop/mapreduce/*:$HADOOP_MAPRED_HOME/share/hadoop/mapreduce/lib/*:job.jar/job.jar:job.jar/classes/:job.jar/lib/*:$PWD/*" } environment { key: "HADOOP_ROOT_LOGGER" value: "INFO,console" } environment { key: "LD_LIBRARY_PATH" value: "$PWD:/home/yarn/software/hadoop/lib/native" } environment { key: "STDOUT_LOGFILE_ENV" value: "<LOG_DIR>/stdout" } environment { key: "STDERR_LOGFILE_ENV" value: 
"<LOG_DIR>/stderr" } command: "$JAVA_HOME/bin/java -Djava.net.preferIPv4Stack=true -Dhadoop.metrics.log.level=WARN  -Xmx1024m -Djava.io.tmpdir=$PWD/tmp -Dlog4j.configuration=container-log4j.properties -Dyarn.app.container.log.dir=<LOG_DIR> -Dyarn.app.container.log.filesize=0 -Dhadoop.root.logger=INFO,CLA -Dhadoop.root.logfile=syslog org.apache.hadoop.mapred.YarnChild 10.198.66.225 36018 attempt_1551148219423_0001_m_000013_1000 15 1><LOG_DIR>/stdout 2><LOG_DIR>/stderr " application_ACLs { accessType: APPACCESS_MODIFY_APP acl: " " } application_ACLs { accessType: APPACCESS_VIEW_APP acl: " " } } container_token { identifier: "\n\021\022\r\n\t\b\001\020\237\370\215\275\222-\020\002\030\017\022\02210.198.66.225:8842\032\004yarn\"\a\b\200\f\020\001\030\000(\304\316\346\275\222-0\303\365\357\"8\237\370\215\275\222-B\002\b\024H\221\367\301\275\222-" password: "\ts\261\277\310i\356\301\v\245\234\311\345G\223\017\f\250\345U" kind: "ContainerToken" service: "10.198.66.225:8842" }
AMLauncher -> startContainers(StartContainersRequest) -> startContainerInternal -> storeContainer

打印日志storeContainer

2019-02-26 16:05:11,902 INFO org.apache.hadoop.yarn.server.nodemanager.containermanager.ContainerManagerImpl: Start request for container_1551155033308_0002_01_000002 by user yarn
2019-02-26 16:05:11,903 INFO org.apache.hadoop.yarn.server.nodemanager.recovery.NMLeveldbStateStoreService: DEBUG: storecontainer : KEY= ContainerManager/containers/container_1551155033308_0002_01_000002/request
2019-02-26 16:05:11,903 INFO org.apache.hadoop.yarn.server.nodemanager.containermanager.application.ApplicationImpl: Adding container_1551155033308_0002_01_000002 to application application_1551155033308_0002
2019-02-26 16:05:11,903 INFO org.apache.hadoop.yarn.server.nodemanager.containermanager.container.ContainerImpl: Container container_1551155033308_0002_01_000002 transitioned from NEW to LOCALIZING
2019-02-26 16:05:11,904 INFO org.apache.hadoop.yarn.server.nodemanager.NMAuditLogger: USER=yarn IP=10.198.66.225        OPERATION=Start Container Request       TARGET=ContainerManageImpl   RESULT=SUCCESS  APPID=application_1551155033308_0002    CONTAINERID=container_1551155033308_0002_01_000002
2019-02-26 16:05:12,749 INFO org.apache.spark.network.yarn.YarnShuffleService: Initializing container container_1551155033308_0002_01_000002
2019-02-26 16:05:13,422 INFO org.apache.hadoop.yarn.server.nodemanager.containermanager.localizer.ResourceLocalizationService: Unpack or rename file from /data01/yarn/nm-local-dir/usercache/yarn/appcache/application_1551155033308_0002/container_1551155033308_0002_01_000002/jobConfDir/tmp_job.xml to file:/data01/yarn/nm-local-dir/usercache/yarn/appcache/application_1551155033308_0002/container_1551155033308_0002_01_000002/jobConfDir/jobconf_job.xml
2019-02-26 16:05:13,422 INFO org.apache.hadoop.yarn.server.nodemanager.containermanager.localizer.ResourceLocalizationService: Rename /data01/yarn/nm-local-dir/usercache/yarn/appcache/application_1551155033308_0002/container_1551155033308_0002_01_000002/jobConfDir/tmp_job.xml to /data01/yarn/nm-local-dir/usercache/yarn/appcache/application_1551155033308_0002/container_1551155033308_0002_01_000002/jobConfDir/jobconf_job.xml
2019-02-26 16:05:13,469 INFO org.apache.hadoop.yarn.server.nodemanager.containermanager.container.ContainerImpl: Container container_1551155033308_0002_01_000002 transitioned from LOCALIZING to LOCALIZED
2019-02-26 16:05:13,514 INFO org.apache.hadoop.yarn.server.nodemanager.recovery.NMLeveldbStateStoreService:  DEBUG: storeContainerLaunched : KEY = ContainerManager/containers/container_1551155033308_0002_01_000002/launched
2019-02-26 16:05:13,519 INFO org.apache.hadoop.yarn.server.nodemanager.DefaultContainerExecutor: launchContainer: [bash, /data01/yarn/nm-local-dir/usercache/yarn/appcache/application_1551155033308_0002/container_1551155033308_0002_01_000002/default_container_executor.sh]
2019-02-26 16:05:17,482 INFO org.apache.hadoop.yarn.server.nodemanager.containermanager.ContainerManagerImpl: Stopping container with container Id: container_1551155033308_0002_01_000002
2019-02-26 16:05:17,483 INFO org.apache.hadoop.yarn.server.nodemanager.recovery.NMLeveldbStateStoreService:  DEBUG: storeContainerKilled : KEY = ContainerManager/containers/container_1551155033308_0002_01_000002/killed
2019-02-26 16:05:17,590 INFO org.apache.hadoop.yarn.server.nodemanager.recovery.NMLeveldbStateStoreService:  DEBUG: storeContainerCompleted : KEY = ContainerManager/containers/container_1551155033308_0002_01_000002/exitcode
2019-02-26 16:05:17,591 INFO org.apache.hadoop.yarn.server.nodemanager.containermanager.launcher.ContainerLaunch: Container container_1551155033308_0002_01_000002 succeeded
2019-02-26 16:05:20,504 INFO org.apache.hadoop.yarn.server.nodemanager.containermanager.ContainerManagerImpl: container container_1551155033308_0002_01_000002 time for Waiting exitCode is 3021ms, with containerKilledByAmTimeoutInterval 500ms containerKilledByAmTimeout 3000ms
2019-02-26 16:05:20,504 INFO org.apache.hadoop.yarn.server.nodemanager.NMAuditLogger: USER=yarn IP=10.198.66.225        OPERATION=Stop Container Request        TARGET=ContainerManageImpl   RESULT=SUCCESS  APPID=application_1551155033308_0002    CONTAINERID=container_1551155033308_0002_01_000002



关于AM可以远程调用的StartContainers

startContainers – ContainerManagerImpl–此方法中for循环调用

/**
 * Starts every container named in the request on this NodeManager.
 *
 * Each request is processed independently. A container that fails with a
 * YarnException is reported in the response's failure map and processing
 * continues; an invalid token is recorded and then rethrown; any other
 * IOException is rethrown as a remote exception.
 *
 * @param requests the batch of start-container requests
 * @return the auxiliary service metadata together with the ids of the
 *         containers that started and the failures for those that did not
 * @throws YarnException when new containers are being rejected because the
 *           NodeManager has not yet connected with the ResourceManager, or
 *           for a wrapped IOException
 * @throws IOException when a container token is invalid
 */
@Override
public StartContainersResponse
    startContainers(StartContainersRequest requests) throws YarnException,
        IOException {
  if (blockNewContainerRequests.get()) {
    throw new NMNotYetReadyException(
      "Rejecting new containers as NodeManager has not"
          + " yet connected with ResourceManager");
  }

  // Identify and authorize the caller once for the whole batch.
  final UserGroupInformation callerUgi = getRemoteUgi();
  final NMTokenIdentifier nmToken = selectNMTokenIdentifier(callerUgi);
  authorizeUser(callerUgi, nmToken);

  final List<ContainerId> started = new ArrayList<ContainerId>();
  final Map<ContainerId, SerializedException> failures =
      new HashMap<ContainerId, SerializedException>();

  for (StartContainerRequest startRequest
      : requests.getStartContainerRequests()) {
    // Stays null until the token has been decoded and verified.
    ContainerId containerId = null;
    try {
      final ContainerTokenIdentifier tokenId =
          BuilderUtils.newContainerTokenIdentifier(
              startRequest.getContainerToken());
      verifyAndGetContainerTokenIdentifier(
          startRequest.getContainerToken(), tokenId);
      containerId = tokenId.getContainerID();
      LOG.info("DEBUG: startContainers : for for (StartContainerRequest request : requests.getStartContainerRequests())  : containerId: "+containerId);
      startContainerInternal(nmToken, tokenId, startRequest);
      started.add(containerId);
    } catch (YarnException e) {
      // Per-container failure: record it and move on to the next request.
      failures.put(containerId, SerializedException.newInstance(e));
    } catch (InvalidToken ie) {
      // Record the failure, then abort the whole call.
      failures.put(containerId, SerializedException.newInstance(ie));
      throw ie;
    } catch (IOException e) {
      throw RPCUtil.getRemoteException(e);
    }
  }

  return StartContainersResponse.newInstance(
      getAuxServiceMetaData(), started, failures);
}

startContainerInternal --ContainerManagerImpl

/**
 * Handles one start request: authorizes it, registers the container in the
 * NodeManager context, persists it in the state store and dispatches the
 * container-init event.
 *
 * <p>Steps, in order:
 * <ol>
 *   <li>{@code authorizeStartRequest} checks the NMToken / container token pair.
 *       Per the original notes, the NMToken is saved here rather than in the RPC
 *       layer because at connection-authentication time it is not known which
 *       RPC calls the user will make, and a new NMToken is only issued at
 *       startContainer (once renewed). The container token is expected to be
 *       signed by the correct master key, belong to this NodeManager, carry the
 *       correct RMIdentifier and not be expired.</li>
 *   <li>The request is rejected if the token's RM identifier differs from the
 *       one held by {@code nodeStatusUpdater}.</li>
 *   <li>Every aux-service name referenced by the launch context's service data
 *       must be present in {@code getAuxServiceMetaData()}.</li>
 *   <li>The new {@code ContainerImpl} is inserted with {@code putIfAbsent};
 *       a duplicate id is audited and rejected.</li>
 *   <li>Under the read lock, and only if the service is not stopped, the
 *       application is registered (and stored / initialised when it is new to
 *       this node), the container is stored, and the init event is fired.</li>
 * </ol>
 *
 * @param nmTokenIdentifier        NMToken identifier of the caller
 * @param containerTokenIdentifier identifier decoded from the container token
 * @param request                  the start request being processed
 * @throws YarnException if the request is rejected or the NodeManager is
 *         shutting down
 * @throws IOException   declared by the signature; raised by the callees
 */
@SuppressWarnings("unchecked")
private void startContainerInternal(NMTokenIdentifier nmTokenIdentifier,
    ContainerTokenIdentifier containerTokenIdentifier,
    StartContainerRequest request) throws YarnException, IOException {

  authorizeStartRequest(nmTokenIdentifier, containerTokenIdentifier);

  // A token minted under a different RM identifier is refused.
  if (containerTokenIdentifier.getRMIdentifier()
      != nodeStatusUpdater.getRMIdentifier()) {
    throw new InvalidContainerException("\nContainer "
        + containerTokenIdentifier.getContainerID().toString()
        + " rejected as it is allocated by a previous RM");
  }

  // update NMToken
  updateNMTokenIdentifier(nmTokenIdentifier);

  final ContainerId containerId = containerTokenIdentifier.getContainerID();
  final String containerName = containerId.toString();
  final String submitter = containerTokenIdentifier.getApplicationSubmitter();

  LOG.info("Start request for " + containerName + " by user " + submitter);

  final ContainerLaunchContext launchContext = request.getContainerLaunchContext();

  // Each aux service the container asks for must be known to this node.
  final Map<String, ByteBuffer> knownAuxServices = getAuxServiceMetaData();
  final Map<String, ByteBuffer> requestedServiceData = launchContext.getServiceData();
  if (requestedServiceData != null && !requestedServiceData.isEmpty()) {
    for (String serviceName : requestedServiceData.keySet()) {
      if (knownAuxServices.get(serviceName) == null) {
        throw new InvalidAuxServiceException("The auxService:" + serviceName
            + " does not exist");
      }
    }
  }

  final Credentials credentials = parseCredentials(launchContext);

  final Container container = new ContainerImpl(getConfig(), this.dispatcher,
      context.getNMStateStore(), launchContext, credentials, metrics,
      containerTokenIdentifier);
  final ApplicationId appId =
      containerId.getApplicationAttemptId().getApplicationId();

  // putIfAbsent returns the existing entry when the id is already registered.
  final Container alreadyRegistered =
      context.getContainers().putIfAbsent(containerId, container);
  if (alreadyRegistered != null) {
    NMAuditLogger.logFailure(submitter, AuditConstants.START_CONTAINER,
        "ContainerManagerImpl", "Container already running on this node!",
        appId, containerId);
    throw RPCUtil.getRemoteException("Container " + containerName
        + " already is running on this node!!");
  }

  this.readLock.lock();
  try {
    if (serviceStopped) {
      throw new YarnException(
          "Container start failed as the NodeManager is in the process of shutting down");
    }

    // Create the application
    final Application application =
        new ApplicationImpl(dispatcher, submitter, appId, credentials, context);
    LOG.info("DEBUG: startContainerInternal : Application application =\n" +
            "            new ApplicationImpl(dispatcher, user, applicationID, credentials, context); appliccation = "
            + application.getAppId() + application.getApplicationState());

    final boolean newToThisNode =
        context.getApplications().putIfAbsent(appId, application) == null;
    if (newToThisNode) {
      LOG.info("Creating a new application reference for app " + appId);
      final LogAggregationContext logAggregationContext =
          containerTokenIdentifier.getLogAggregationContext();
      final Map<ApplicationAccessType, String> acls =
          container.getLaunchContext().getApplicationACLs();
      // Persist the application before announcing it to the dispatcher.
      context.getNMStateStore().storeApplication(appId,
          buildAppProto(appId, submitter, credentials, acls,
              logAggregationContext));
      dispatcher.getEventHandler().handle(
          new ApplicationInitEvent(appId, acls, logAggregationContext));
    }

    // Persist the start request, then hand the container to the application.
    LOG.info(" DEBUG : startContainerInternal:  this.context.getNMStateStore().storeContainer(containerId, request);containerId" + containerId);
    this.context.getNMStateStore().storeContainer(containerId, request);
    dispatcher.getEventHandler().handle(
        new ApplicationContainerInitEvent(container));

    this.context.getContainerTokenSecretManager().startContainerSuccessful(
        containerTokenIdentifier);
    NMAuditLogger.logSuccess(submitter, AuditConstants.START_CONTAINER,
        "ContainerManageImpl", appId, containerId);
    // TODO launchedContainer misplaced -> doesn't necessarily mean a container
    // launch. A finished Application will not launch containers.
    metrics.launchedContainer();
    metrics.allocateContainer(containerTokenIdentifier.getResource());
  } finally {
    this.readLock.unlock();
  }
}

ContainerManagerImpl 的recover方法

关于恢复的流程:先由 NMLeveldbStateStoreService 的 loadContainersState 把容器的状态加载到 rcs(RecoveredContainerState)中,再由 recoverContainer 方法去恢复。加载的过程其实就是把之前通过 db.put(key, byte[]) 写入的内容读出来:一个 key 对应一个属性,该 key 的 value 就是这个属性的值。真正容纳这些属性和属性值的是 RecoveredContainerState 这个对象,这些属性被读出来之后将这个对象填充完成,下一步交给 ContainerManagerImpl 来 recover。rcs 对象中最主要的大对象就是一个 StartContainerRequest,从这个对象中可以取出用来 new ContainerImpl 的东西。

/**
 * Rebuilds one container from the state that was read back from the state store.
 *
 * <p>The stored {@code StartContainerRequest} supplies the launch context and the
 * container token. If the NodeManager context still has an entry for the owning
 * application, a {@code ContainerImpl} is created with the recovered status,
 * exit code, diagnostics and killed flag, placed in the context's container
 * map, and announced with an {@code ApplicationContainerInitEvent}. Otherwise
 * the container id is passed to {@code nodeStatusUpdater.addCompletedContainer}.
 *
 * @param rcs the recovered state of a single container
 * @throws IOException declared by the signature; raised by the callees
 */
@SuppressWarnings("unchecked")
private void recoverContainer(RecoveredContainerState rcs)
    throws IOException {
  // Disabled experiment: an optional filter that would skip recovery for
  // containers whose recovered status is COMPLETED or REQUESTED.
  //   Configuration conf = new YarnConfiguration();
  //   boolean filter = conf.getBoolean(YarnConfiguration.NM_RECOVERY_CONTAINER_FILTER,
  //        YarnConfiguration.DEFAULT_NM_RECOVERY_CONTAINER_FILTER);
  //   if (filter) {
  //       if (rcs.getStatus().equals(NMStateStoreService.RecoveredContainerStatus.COMPLETED) ||
  //               rcs.getStatus().equals(RecoveredContainerStatus.REQUESTED)) {
  //           LOG.info(" RecoveredContainerState is " + rcs.getStatus() + " ,do not recover container. return. ");
  //           return;
  //       }
  //   }
  final StartContainerRequest startRequest = rcs.getStartRequest();
  final ContainerLaunchContext launchContext =
      startRequest.getContainerLaunchContext();
  final ContainerTokenIdentifier tokenId =
      BuilderUtils.newContainerTokenIdentifier(startRequest.getContainerToken());
  final ContainerId containerId = tokenId.getContainerID();
  final ApplicationId appId =
      containerId.getApplicationAttemptId().getApplicationId();

  LOG.info("Recovering " + containerId + " in state " + rcs.getStatus()
      + " with exit code " + rcs.getExitCode());

  if (!context.getApplications().containsKey(appId)) {
    // This branch is taken when the NodeManager's own context has no entry
    // for the application. Only non-completed containers get the warning.
    if (rcs.getStatus() != RecoveredContainerStatus.COMPLETED) {
      LOG.warn(containerId + " has no corresponding application!");
    }
    // Author's observation: this line shows up many times in the NM log after
    // an RM restart once the application is gone.
    LOG.info("Adding " + containerId + " to recently stopped containers");
    // NOTE(review): presumably this is how the NM later reports the container
    // to the RM as completed; confirm against NodeStatusUpdater.
    nodeStatusUpdater.addCompletedContainer(containerId);
    return;
  }

  // Author's observation (not verifiable from this method): as long as the
  // application is still present in the context, a ContainerImpl is created
  // even if the RM no longer knows the app; the recovery constructor used
  // here is said to delegate to the regular constructor first, so both
  // constructor bodies run.
  final Credentials credentials = parseCredentials(launchContext);
  final Container recovered = new ContainerImpl(getConfig(), dispatcher,
      context.getNMStateStore(), startRequest.getContainerLaunchContext(),
      credentials, metrics, tokenId, rcs.getStatus(), rcs.getExitCode(),
      rcs.getDiagnostics(), rcs.getKilled());
  context.getContainers().put(containerId, recovered);
  LOG.info("DEBUG:  Container container = new ContainerImpl : containerId "+ recovered.getContainerId());
  LOG.info("DEBUG: dispatcher.getEventHandler().handle(\n" +
          "          new ApplicationContainerInitEvent(container)); is just under this code ,then we go to the event handler");
  dispatcher.getEventHandler().handle(
      new ApplicationContainerInitEvent(recovered));
}

ContainerManagerImpl

service init–>recover方法

service start -->waitForRecoveredContainers

  • 1
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值