Yarn中map、reduce任务运行容器YarnChild分析

最新推荐文章于 2024-07-01 09:44:52 发布

午后的红茶meton

最新推荐文章于 2024-07-01 09:44:52 发布

阅读量2.1k

点赞数 1

分类专栏： Hadoop分析与理解文章标签： hadoop yarn yarnChild 任务运行容器

本文链接：https://blog.csdn.net/u012151684/article/details/108286594

版权

Hadoop分析与理解专栏收录该内容

40 篇文章 18 订阅

订阅专栏

从前文对Yarn上MRAppMaster组件以及任务资源申请、启动流程的源码分析中可以知道，真正用于执行MapTask任务、ReduceTask任务的进程容器为YarnChild进程，接下来对YarnChild进程运行Task任务的过程进行分析：

TaskAttempt状态机

我们知道具体Map、Reduce任务在YarnChild进程中的调度触发是在TaskAttempt的状态机流转过程中触发实现的，在TaskAttempt的状态机流转过程中：

NEW --> UNASSIGNED状态：会触发ContainerAllocator进行container资源的申请
UNASSIGNED --> ASSIGNED状态：ContainerAllocator将申请到的container资源分配给对应的TaskAttempt
ASSIGNED --> RUNNING状态：触发ContainerLauncher通过RPC要求NM启动对应的container，也即是启动对应的YarnChild进程执行具体的task任务

首先来看一下TaskAttempt中比较重要的几个对象以及状态转移函数：

// State-machine arc: UNASSIGNED -> ASSIGNED, fired by the TA_ASSIGNED event and
// handled by ContainerAssignedTransition.
.addTransition(TaskAttemptStateInternal.UNASSIGNED,
    TaskAttemptStateInternal.ASSIGNED, TaskAttemptEventType.TA_ASSIGNED,
    new ContainerAssignedTransition())

/**
 * Transition hook run when a task attempt is handed a container.
 *
 * <p>It records the container on the attempt, creates the task object that
 * will be executed remotely, registers that task with the attempt listener
 * under a freshly built JVM id, and then emits two events: one requesting the
 * remote launch of the container and one telling the speculator that the
 * attempt's container need has been satisfied.
 */
private static class ContainerAssignedTransition implements
    SingleArcTransition<TaskAttemptImpl, TaskAttemptEvent> {
  @SuppressWarnings({ "unchecked" })
  @Override
  public void transition(final TaskAttemptImpl taskAttempt,
      TaskAttemptEvent event) {
    final TaskAttemptContainerAssignedEvent assignedEvent =
        (TaskAttemptContainerAssignedEvent) event;
    final Container container = assignedEvent.getContainer();
    taskAttempt.container = container;

    // This is a _real_ Task (classic Hadoop mapred flavor): the object that
    // will actually be run.
    taskAttempt.remoteTask = taskAttempt.createRemoteTask();

    // The JVM id wraps the job id, the map/reduce flag and the container's id.
    taskAttempt.jvmID = new WrappedJvmID(
        taskAttempt.remoteTask.getTaskID().getJobID(),
        taskAttempt.remoteTask.isMapTask(),
        taskAttempt.container.getId().getContainerId());

    // Hand the (task, jvmID) pair to the listener so it can be looked up later.
    taskAttempt.taskAttemptListener.registerPendingTask(
        taskAttempt.remoteTask, taskAttempt.jvmID);

    taskAttempt.computeRackAndLocality();

    // Build the launch context for this attempt's container ...
    final ContainerLaunchContext context = createContainerLaunchContext(
        assignedEvent.getApplicationACLs(), taskAttempt.conf,
        taskAttempt.jobToken, taskAttempt.remoteTask, taskAttempt.oldJobId,
        taskAttempt.jvmID, taskAttempt.taskAttemptListener,
        taskAttempt.credentials);

    // ... and publish the remote-launch request for it.
    final ContainerRemoteLaunchEvent launchRequest =
        new ContainerRemoteLaunchEvent(taskAttempt.attemptId, context,
            container, taskAttempt.remoteTask);
    taskAttempt.eventHandler.handle(launchRequest);

    // Tell the speculator that our container needs are satisfied.
    final SpeculatorEvent needsSatisfied =
        new SpeculatorEvent(taskAttempt.getID().getTaskId(), -1);
    taskAttempt.eventHandler.handle(needsSatisfied);
  }
}

在ContainerAssignedTransition钩子函数中，可以知道其会创建真正用于任务执行的Task对象，并且会将该对象对应映射一个jvmId(jobId、containerId)保存在taskAttemptListener对象中的jvmIDToActiveAttemptMap中，之后便会构造ContainerLaunchContext启动上下文，包括其启动该container的cmd指令；并通知ContainerLauncher去调度ContainerRemoteLaunchEvent事件，通知对应的NodeManager来启动对应的container任务。

在taskAttemptListener对象中，其主要的作用为：

保存对应的jvmID与task之间的映射关系，并记录哪些jvmId已经被ContainerLauncher通过rpc在NM上调度执行；
接受来自YarnChild进程的任务运行状态信息的汇报，包括更新任务运行心跳、任务运行进度progress、运行阶段phase(Map、shuffle、sort、reduce等)以及counters计数器、map、shuffle、sort等完成时间；
监控对应TaskAttempt所触发实际任务的YarnChild的心跳，其会将心跳超时的任务所对应的TaskAttempt状态机触发TaskAttemptEventType.TA_TIMED_OUT超时事件来触发对应container的关闭清理；

/**
 * Server-side endpoint of {@code TaskUmbilicalProtocol}, abridged to the
 * members discussed in the surrounding article.
 *
 * <p>As shown here it (1) keeps the mapping from JVM id to pending task and the
 * set of JVM ids whose launch has been registered, (2) receives status updates
 * from running tasks and forwards them as events, and (3) registers launched
 * attempts with the heartbeat handler.
 *
 * <p>NOTE: this is an excerpt. Members referenced below but not declared here
 * ({@code LOG}, {@code address}, the {@code context} field used by
 * {@code startRpcServer}/{@code statusUpdate}, {@code TASK_FOR_INVALID_JVM})
 * live in the part of the class that is not shown.
 */
public class TaskAttemptListenerImpl extends CompositeService
    implements TaskUmbilicalProtocol, TaskAttemptListener {

  // RPC server that serves the TaskUmbilicalProtocol.
  private Server server;

  // Heartbeat-timeout monitoring for the tasks running in YarnChild.
  protected TaskHeartbeatHandler taskHeartbeatHandler;

  // Every registered jvmId -> Task mapping that has not been handed out yet.
  private ConcurrentMap<WrappedJvmID, org.apache.hadoop.mapred.Task>
    jvmIDToActiveAttemptMap
      = new ConcurrentHashMap<WrappedJvmID, org.apache.hadoop.mapred.Task>();

  // JVM ids whose container launch has already been registered.
  private Set<WrappedJvmID> launchedJVMs = Collections
      .newSetFromMap(new ConcurrentHashMap<WrappedJvmID, Boolean>());

  /**
   * Builds and starts the RPC server for {@code TaskUmbilicalProtocol} and
   * records the address it listens on.
   *
   * @throws YarnRuntimeException wrapping any {@code IOException} raised
   *         while building or starting the server
   */
  protected void startRpcServer() {
    Configuration conf = getConfig();
    try {
      // setBindAddress takes the address to bind to; listen on all interfaces
      // and let the OS pick a free port (port 0).
      server = new RPC.Builder(conf).setProtocol(TaskUmbilicalProtocol.class)
            .setInstance(this).setBindAddress("0.0.0.0").setPort(0).build();
      server.start();
      // Remember the server address (NM hostname + the port actually bound).
      this.address = NetUtils.createSocketAddrForHost(
          context.getNMHostname(),
          server.getListenerAddress().getPort());
    } catch (IOException e) {
      throw new YarnRuntimeException(e);
    }
  }

  /**
   * Receives a status report from a running task: refreshes its heartbeat and
   * forwards progress, state string, phase and counters as a
   * {@code TaskAttemptStatusUpdateEvent}.
   *
   * @param taskAttemptID the reporting attempt (classic mapred id)
   * @param taskStatus    the status carried by the report
   * @return always {@code true} in the code shown
   */
  @Override
  public boolean statusUpdate(TaskAttemptID taskAttemptID,
      TaskStatus taskStatus) throws IOException, InterruptedException {
    org.apache.hadoop.mapreduce.v2.api.records.TaskAttemptId yarnAttemptID =
        TypeConverter.toYarn(taskAttemptID);
    // Refresh this attempt's heartbeat in the heartbeat handler.
    taskHeartbeatHandler.progressing(yarnAttemptID);
    TaskAttemptStatus taskAttemptStatus =
        new TaskAttemptStatus();
    taskAttemptStatus.id = yarnAttemptID;
    // Task sends the updated progress to the TT.
    taskAttemptStatus.progress = taskStatus.getProgress();
    LOG.info("Progress of TaskAttempt " + taskAttemptID + " is : "
        + taskStatus.getProgress());
    // Task sends the updated state-string to the TT.
    taskAttemptStatus.stateString = taskStatus.getStateString();
    // Task sends the updated phase to the TT.
    taskAttemptStatus.phase = TypeConverter.toYarn(taskStatus.getPhase());
    // Counters are updated by the task. Convert counters into new format as
    // that is the primary storage format inside the AM to avoid multiple
    // conversions and unnecessary heap usage.
    taskAttemptStatus.counters = new org.apache.hadoop.mapreduce.Counters(
      taskStatus.getCounters());
    // (other status fields are updated here; elided in this excerpt)
    // Dispatch a TaskAttemptStatusUpdateEvent carrying the new status.
    context.getEventHandler().handle(
        new TaskAttemptStatusUpdateEvent(taskAttemptStatus.id,
            taskAttemptStatus));
    return true;
  }

  /**
   * Hands the pending task to the JVM that asks for it.
   *
   * @param context identifies the asking JVM
   * @return {@code TASK_FOR_INVALID_JVM} when the JVM id is unknown,
   *         {@code null} when the id is known but its launch has not been
   *         registered yet, otherwise a {@code JvmTask} wrapping the task
   *         (which is then removed from both bookkeeping collections)
   */
  @Override
  public JvmTask getTask(JvmContext context) throws IOException {
    // A rough imitation of code from TaskTracker.
    JVMId jvmId = context.jvmId;
    WrappedJvmID wJvmID = new WrappedJvmID(jvmId.getJobId(), jvmId.isMap,
        jvmId.getId());
    // Result holder; stays null when the launch has not been registered yet.
    JvmTask jvmTask = null;
    // Look the task up in jvmIDToActiveAttemptMap.
    if (!jvmIDToActiveAttemptMap.containsKey(wJvmID)) {
      LOG.info("JVM with ID: " + jvmId + " is invalid and will be killed.");
      jvmTask = TASK_FOR_INVALID_JVM;
    } else {
      if (!launchedJVMs.contains(wJvmID)) {
        jvmTask = null;
        LOG.info("JVM with ID: " + jvmId
            + " asking for task before AM launch registered. Given null task");
      } else {
        // remove the task as it is no more needed and free up the memory.
        // Also we have already told the JVM to process a task, so it is no
        // longer pending, and further request should ask it to exit.
        org.apache.hadoop.mapred.Task task =
            jvmIDToActiveAttemptMap.remove(wJvmID);
        launchedJVMs.remove(wJvmID);
        LOG.info("JVM with ID: " + jvmId + " given task: " + task.getTaskID());
        jvmTask = new JvmTask(task, false);
      }
    }
    return jvmTask;
  }

  /**
   * Stores the jvmID -> task mapping for a task that is waiting to be fetched.
   */
  @Override
  public void registerPendingTask(
      org.apache.hadoop.mapred.Task task, WrappedJvmID jvmID) {
    jvmIDToActiveAttemptMap.put(jvmID, task);
  }

  /**
   * Marks a JVM id as launched and starts heartbeat monitoring for the
   * corresponding attempt.
   */
  @Override
  public void registerLaunchedTask(
      org.apache.hadoop.mapreduce.v2.api.records.TaskAttemptId attemptID,
      WrappedJvmID jvmId) {
    launchedJVMs.add(jvmId);
    taskHeartbeatHandler.register(attemptID);
  }
}

在taskAttemptListener对象中，可以看到其内部比较重要的对象以及对应的重要的函数方法，其内部会持有一个rpc服务端，用于响应TaskUmbilicalProtocol协议接口，并且其会保存对应的任务以及接受来自yarnchild中task任务运行状态的rpc statusUpdate()汇报并更新其心跳信息。

之后便会通过createContainerLaunchContext()构造ContainerLaunchContext对应的上下文，比较重要的是其所构建的启动具体container的任务命令cmds，在createContainerLaunchContext()方法中，其调用MapReduceChildJVM.getVMCommand()来构造具体的启动指令，其内部比较重要的部分如下：

// Add main class and its arguments: the child JVM's entry point is YarnChild,
// followed by four positional arguments in this order.
vargs.add(YarnChild.class.getName());  // main of Child
// pass TaskAttemptListener's address (host address, then port)
vargs.add(taskAttemptListenerAddr.getAddress().getHostAddress()); 
vargs.add(Integer.toString(taskAttemptListenerAddr.getPort())); 
vargs.add(attemptID.toString());                      // pass task identifier

// Finally add the jvmID (its numeric id, as a string)
vargs.add(String.valueOf(jvmID.getId()));

其会指定对应的运行主类为YarnChild，并且会添加taskAttemptListener对象中运行的TaskUmbilicalProtocol协议的RPC服务端的请求地址addr与端口port、以及对应的attemptID、jvmId等；之后便会通过ContainerLauncherImpl调用rpc协议接口ContainerManagementProtocol.startContainers()与对应的NodeManager通信，以启动一个container，之后便会触发TaskAttempt状态机的转移：

// State-machine arc: ASSIGNED -> RUNNING, fired by the TA_CONTAINER_LAUNCHED
// event and handled by LaunchedContainerTransition.
.addTransition(TaskAttemptStateInternal.ASSIGNED, TaskAttemptStateInternal.RUNNING,
    TaskAttemptEventType.TA_CONTAINER_LAUNCHED,
    new LaunchedContainerTransition())

在触发函数中，其会向TaskAttemptListener进行对应任务的注册，以便启动对其的心跳超时信息监测；

// Register the attempt (by attempt id and JVM id) with the TaskAttemptListener
// so that it can start monitoring it.
taskAttempt.taskAttemptListener
  .registerLaunchedTask(taskAttempt.attemptId, taskAttempt.jvmID);

YarnChild

我们知道实际应用程序在该container上，最终运行的是YarnChild进程，其进程内部就是执行具体的mapTask、reduceTask任务。接下来看具体YarnChild进程的启动以及对于map、reduce任务的具体运行，其YarnChild#main方法如下：

/**
 * YarnChild#main (abridged): entry point of the child JVM that runs one task.
 *
 * <p>Expected arguments, as read below: {@code args[0]} host and
 * {@code args[1]} port of the umbilical RPC server, {@code args[2]} the task
 * attempt id, {@code args[3]} the numeric JVM id.
 *
 * <p>Flow shown here: load the job configuration, build the umbilical RPC
 * proxy, poll {@code getTask} until a task is returned, configure it, then run
 * it inside {@code childUGI.doAs}.
 */
public static void main(String[] args) throws Throwable {
  LOG.debug("Child starting");
  
  // Job configuration, loaded from MRJobConfig.JOB_CONF_FILE
  final JobConf job = new JobConf(MRJobConfig.JOB_CONF_FILE);
  // Initing with our JobConf allows us to avoid loading confs twice
  Limits.init(job);

  // Host and port of the TaskUmbilicalProtocol RPC server, taken from args
  String host = args[0];
  int port = Integer.parseInt(args[1]);
  final InetSocketAddress address =
      NetUtils.createSocketAddrForHost(host, port);
      
  // Task attempt id (args[2]) and numeric JVM id (args[3]); the JVMId combines
  // the attempt's job id, whether it is a MAP task, and that numeric id.
  final TaskAttemptID firstTaskid = TaskAttemptID.forName(args[2]);
  long jvmIdLong = Long.parseLong(args[3]);
  JVMId jvmId = new JVMId(firstTaskid.getJobID(),
      firstTaskid.getTaskType() == TaskType.MAP, jvmIdLong);

  // ...... metrics and token setup elided in this excerpt
  // NOTE(review): taskOwner, credentials and jt used below are presumably
  // defined in the elided part - confirm against the full source.

  // Create TaskUmbilicalProtocol as actual task owner.
  // This RPC client proxy is what the task uses to report to the
  // task-attempt listener.
  final TaskUmbilicalProtocol umbilical =
    taskOwner.doAs(new PrivilegedExceptionAction<TaskUmbilicalProtocol>() {
    @Override
    public TaskUmbilicalProtocol run() throws Exception {
      return (TaskUmbilicalProtocol)RPC.getProxy(TaskUmbilicalProtocol.class,
          TaskUmbilicalProtocol.versionID, address, job);
    }
  });

  // report non-pid to application master
  JvmContext context = new JvmContext(jvmId, "-1000");
  LOG.debug("PID: " + System.getenv().get("JVM_PID"));
  Task task = null;
  UserGroupInformation childUGI = null;
  ScheduledExecutorService logSyncer = null;

  try {
    JvmTask myTask = null;;
    // poll for new task; the sleep grows by 500 ms per attempt, capped at
    // 1500 ms (the first iteration sleeps 0 ms)
    for (int idle = 0; null == myTask; ++idle) {
      long sleepTimeMilliSecs = Math.min(idle * 500, 1500);
      LOG.info("Sleeping for " + sleepTimeMilliSecs
          + "ms before retrying again. Got null now.");
      MILLISECONDS.sleep(sleepTimeMilliSecs);
      // Ask the listener, over the umbilical RPC, for the task to execute
      myTask = umbilical.getTask(context);
    }

    task = myTask.getTask();
    YarnChild.taskid = task.getTaskID();

    // Create the job-conf and set credentials
    // (per the original note: mainly runtime directories and output file
    // names/paths for the task)
    configureTask(job, task, credentials, jt);

    // Initiate Java VM metrics
    JvmMetrics.initSingleton(jvmId.toString(), job.getSessionId());
    childUGI = UserGroupInformation.createRemoteUser(System
        .getenv(ApplicationConstants.Environment.USER.toString()));
    // Add tokens to new user so that it may execute its task correctly.
    childUGI.addCredentials(credentials);

    // set job classloader if configured before invoking the task
    MRApps.setJobClassLoader(job);

    logSyncer = TaskLog.createLogSyncer();

    // Create a final reference to the task for the doAs block.
    // This is where the map/reduce task actually runs: task.run() receives
    // the umbilical proxy created above, through which (per the original
    // note) progress, state and phase are reported while it runs.
    final Task taskFinal = task;
    childUGI.doAs(new PrivilegedExceptionAction<Object>() {
      @Override
      public Object run() throws Exception {
        // use job-specified working directory
        FileSystem.get(job).setWorkingDirectory(job.getWorkingDirectory());
        taskFinal.run(job, umbilical); // run the task
        return null;
      }
    });
  } catch (FSError e) {
    // Failure handling: different umbilical RPC calls report the error.
    // NOTE(review): this handler is condensed in the excerpt ("......");
    // as written, taskFinal and cause are not in scope here - consult the
    // full source for the real catch blocks.
      umbilical.fsError(taskid, e.getMessage());
      taskFinal.taskCleanup(umbilical);
      umbilical.fatalError(taskid, cause);
  }
}

在YarnChild#main()函数执行中，首先其会初始化配置任务运行的配置文件job.xml，之后便会从java进程启动命令参数args中获取到taskAttemptListener对象内部的TaskUmbilicalProtocol RPC协议服务端server的host、port，以及从args参数中取出该container实例运行的任务id(TaskAttemptID)和对应的jvmId(由jobId、任务类型以及args中传入的数值id构成)；之后便会创建与TaskUmbilicalProtocol RPC协议通信的RPC客户端代理umbilical。在代理创建好后，便可以与对应的TaskUmbilicalProtocol RPC Server服务端通信，以便通过rpc方法getTask(context)根据对应的jvmId(jobId、containerId)获取到对应的Task任务。(对应的任务是在TaskAttempt实例分配到container资源、触发其状态机的TaskAttemptEventType.TA_ASSIGNED事件后，由转移函数ContainerAssignedTransition将jvmId(jobId、containerId)和task实例保存在taskAttemptListener对象中的jvmIDToActiveAttemptMap中的)。在获取到实际需要运行的task任务之后，便可以调用taskFinal.run(job, umbilical)函数运行该实际的任务，并在运行中将其实时的任务状态、进度等信息通过RPC客户端代理umbilical向MRAppMaster汇报，并更新在MRAppMaster上任务运行的心跳。

其MapTask、ReduceTask任务的run()方法分别如下：

MapTask：

/**
 * MapTask#run (abridged): starts the status reporter, initializes the task,
 * runs the mapper through the new or old API depending on the job setting,
 * then calls {@code done}.
 *
 * @param job       the job configuration
 * @param umbilical RPC proxy handed to the reporter, the mapper runner and
 *                  {@code done}
 */
public void run(final JobConf job, final TaskUmbilicalProtocol umbilical)
  throws IOException, ClassNotFoundException, InterruptedException {
  this.umbilical = umbilical;

  // (elided) set how much of the overall progress each map phase accounts for
  
  // Start the task status reporter (per the original note it runs a periodic
  // thread for status reports and heartbeats)
  TaskReporter reporter = startReporter(umbilical);

  boolean useNewApi = job.getUseNewMapper();
  initialize(job, getJobID(), reporter, useNewApi);

  // check if it is a cleanupJobTask (elided)

  if (useNewApi) { // entry point for mappers written against the new API
    runNewMapper(job, splitMetaInfo, umbilical, reporter);
  } else {
    runOldMapper(job, splitMetaInfo, umbilical, reporter);
  }
  done(umbilical, reporter);
}

/**
 * Runs a new-API mapper (abridged): initializes the input, runs the mapper,
 * moves the task to the SORT phase and reports it, then closes input and
 * output.
 */
private <INKEY,INVALUE,OUTKEY,OUTVALUE>
void runNewMapper(final JobConf job,
                  final TaskSplitIndex splitIndex,
                  final TaskUmbilicalProtocol umbilical,
                  TaskReporter reporter
                  ) throws IOException, ClassNotFoundException,
                           InterruptedException {
  // (elided) build the task context and the Mapper instance, the input
  // format, the input split and the output collector
  // make a task context so we can get the classes
  // .......

  try {
    input.initialize(split, mapperContext);
    // Upper-level API: the user-written map function runs inside this call
    mapper.run(mapperContext); 
    // Mark the map phase complete, enter the next phase and report it
    mapPhase.complete();
    setPhase(TaskStatus.Phase.SORT);
    statusUpdate(umbilical);
    // Close normally, then null the references so the finally block below
    // receives null for anything already closed.
    input.close();
    input = null;
    output.close(mapperContext);
    output = null;
  } finally {
    closeQuietly(input);
    closeQuietly(output, mapperContext);
  }
}

/**
 * Mapper.run(): drives the mapper over its whole input.
 *
 * <p>Calls {@code setup}, then {@code map} once for every key/value pair the
 * context yields, and always finishes with {@code cleanup}, even when the
 * loop throws.
 */
public void run(Context context) throws IOException, InterruptedException {
  setup(context);
  try {
    for (boolean hasRecord = context.nextKeyValue();
         hasRecord;
         hasRecord = context.nextKeyValue()) {
      // The user-written map function is invoked here.
      map(context.getCurrentKey(), context.getCurrentValue(), context);
    }
  } finally {
    cleanup(context);
  }
}

ReduceTask：

/**
 * ReduceTask#run (abridged): starts the status reporter, initializes the task,
 * runs the shuffle plugin, enters the REDUCE phase, runs the reducer through
 * the new or old API, then closes the shuffle plugin and calls {@code done}.
 *
 * @param job       the job configuration
 * @param umbilical RPC proxy used for status updates and passed on to the
 *                  reducer runner and {@code done}
 */
public void run(JobConf job, final TaskUmbilicalProtocol umbilical)
  throws IOException, InterruptedException, ClassNotFoundException {
  job.setBoolean(JobContext.SKIP_RECORDS, isSkipping());

  // (elided) set up the phases of the reduce task
  
  // Start the task status reporter (per the original note it runs a periodic
  // thread for status reports and heartbeats)
  TaskReporter reporter = startReporter(umbilical);
  
  boolean useNewApi = job.getUseNewReducer();
  initialize(job, getJobID(), reporter, useNewApi);

  // check if it is a cleanupJobTask (elided)
  
  // Initialize the codec
  // (elided) configure the shuffle plugin and the combine collector.
  // Run the shuffle, i.e. the remote fetch of map output. Per the original
  // author's note, the fetch also uses the umbilical RPC proxy to ask the
  // task-attempt listener which map tasks have completed.
  rIter = shuffleConsumerPlugin.run();

  // free up the data structures
  mapOutputFilesOnDisk.clear();
  
  // Enter the next phase and report it
  sortPhase.complete();                         // sort is complete
  setPhase(TaskStatus.Phase.REDUCE); 
  statusUpdate(umbilical);

  if (useNewApi) { // entry point for reducers written against the new API
    runNewReducer(job, umbilical, reporter, rIter, comparator, 
                  keyClass, valueClass);
  } else {
    runOldReducer(job, umbilical, reporter, rIter, comparator, 
                  keyClass, valueClass);
  }

  shuffleConsumerPlugin.close();
  done(umbilical, reporter);
}

private <INKEY,INVALUE,OUTKEY,OUTVALUE>
  void runNewReducer(JobConf job,
                     final TaskUmbilicalProtocol umbilical,
                     final TaskReporter reporter,
                     RawKeyValueIterator rIter,
                     RawComparator<INKEY> comparator,
                     Class<INKEY> keyClass,
                     Class<INVALUE> valueClass
                     ) throws IOException,InterruptedException, 
                              ClassNotFoundException {
    // wrap value iterator to report progress.
    // 对shuffle拉取到的中间map结果进行排序，
    // 一边排序一边将结果输出给reduce进行执行
    
    try {
      // 提供的上层api，用户编写的reduce函数在此处运行
      reducer.run(reducerContext);
    } finally {
      trackedRW.close(reducerContext);
    }
  }
}

/**
 * Reducer.run(): drives the reducer over every key of its input.
 *
 * <p>Calls {@code setup}, then {@code reduce} once per key; after each key the
 * backup store is reset when the value iterator is a
 * {@code ReduceContext.ValueIterator}. {@code cleanup} always runs last.
 */
public void run(Context context) throws IOException, InterruptedException {
  setup(context);
  try {
    while (context.nextKey()) {
      // The user-written reduce function is invoked here.
      reduce(context.getCurrentKey(), context.getValues(), context);

      final Iterator<VALUEIN> values = context.getValues().iterator();
      if (!(values instanceof ReduceContext.ValueIterator)) {
        continue;
      }
      // If a back up store is used, reset it
      ((ReduceContext.ValueIterator<VALUEIN>) values).resetBackupStore();
    }
  } finally {
    cleanup(context);
  }
}