在此前对Yarn上MRAppMaster组件以及任务资源申请、启动流程的源码分析中可以知道,真正用于执行MapTask、ReduceTask任务的进程容器为YarnChild进程。接下来对YarnChild进程运行Task任务的过程进行分析:
TaskAttempt状态机
我们知道具体Map、Reduce任务在YarnChild进程中的调度触发是在TaskAttempt的状态机流转过程中触发实现的,在TaskAttempt的状态机流转过程中:
- New --> UNASSIGNED状态:会触发ContainerAllocator进行container资源的申请
- UNASSIGNED --> ASSIGNED状态:ContainerAllocator将申请到的container资源分配给对应的TaskAttempt
- ASSIGNED --> RUNNING状态:触发ContainerLauncher通过RPC要求NM启动对应的container,也就是启动对应的YarnChild进程来执行具体的task任务
首先来看一下TaskAttempt中比较重要的几个对象以及状态转移函数:
// State-machine arc: UNASSIGNED -> ASSIGNED on TA_ASSIGNED, handled by
// ContainerAssignedTransition.
.addTransition(TaskAttemptStateInternal.UNASSIGNED,
TaskAttemptStateInternal.ASSIGNED, TaskAttemptEventType.TA_ASSIGNED,
new ContainerAssignedTransition())
// Transition executed when a container has been assigned to a task attempt.
// It records the container, creates the remote Task, registers it with the
// taskAttemptListener under a WrappedJvmID, builds the container launch
// context and emits the launch and speculator events.
private static class ContainerAssignedTransition implements
SingleArcTransition<TaskAttemptImpl, TaskAttemptEvent> {
@SuppressWarnings({ "unchecked" })
@Override
public void transition(final TaskAttemptImpl taskAttempt,
TaskAttemptEvent event) {
final TaskAttemptContainerAssignedEvent cEvent =
(TaskAttemptContainerAssignedEvent) event;
Container container = cEvent.getContainer();
taskAttempt.container = container;
// this is a _real_ Task (classic Hadoop mapred flavor):
// create the Task object that will actually be executed
taskAttempt.remoteTask = taskAttempt.createRemoteTask();
// Build the WrappedJvmID from the job id, the map/reduce flag and the
// container id of the assigned container.
taskAttempt.jvmID =
new WrappedJvmID(taskAttempt.remoteTask.getTaskID().getJobID(),
taskAttempt.remoteTask.isMapTask(),
taskAttempt.container.getId().getContainerId());
// Register the task together with its jvmID as a pending task on the
// taskAttemptListener.
taskAttempt.taskAttemptListener.registerPendingTask(
taskAttempt.remoteTask, taskAttempt.jvmID);
taskAttempt.computeRackAndLocality();
//launch the container
//create the container object to be launched for a given Task attempt
// Build the ContainerLaunchContext and hand a ContainerRemoteLaunchEvent to
// the event handler; per the surrounding article this is what leads to the
// container being started on the NodeManager.
ContainerLaunchContext launchContext = createContainerLaunchContext(
cEvent.getApplicationACLs(), taskAttempt.conf, taskAttempt.jobToken,
taskAttempt.remoteTask, taskAttempt.oldJobId, taskAttempt.jvmID,
taskAttempt.taskAttemptListener, taskAttempt.credentials);
taskAttempt.eventHandler
.handle(new ContainerRemoteLaunchEvent(taskAttempt.attemptId,
launchContext, container, taskAttempt.remoteTask));
// send event to speculator that our container needs are satisfied
taskAttempt.eventHandler.handle
(new SpeculatorEvent(taskAttempt.getID().getTaskId(), -1));
}
}
在ContainerAssignedTransition钩子函数中,可以看到其会创建真正用于任务执行的Task对象,并为该对象映射一个jvmId(jobId、containerId),保存在taskAttemptListener对象的jvmIDToActiveAttemptMap中;之后便会构造ContainerLaunchContext启动上下文(其中包括启动该container的cmd指令),并发出ContainerRemoteLaunchEvent事件,由ContainerLauncher通知对应的NodeManager来启动对应的container任务。
在taskAttemptListener对象中,其主要的作用为:
- 保存jvmID与task之间的映射关系,并记录哪些jvmId已经被ContainerLauncher通过rpc在NM上调度启动;
- 接受来自YarnChild进程的任务运行状态信息的汇报,包括更新任务运行心跳、任务运行进度progress、运行阶段phase(Map、shuffle、sort、reduce等)以及counters计数器、map、shuffle、sort等完成时间;
- 监控对应TaskAttempt所触发实际任务的YarnChild的心跳,其会将心跳超时的任务所对应的TaskAttempt状态机触发TaskAttemptEventType.TA_TIMED_OUT超时事件来触发对应container的关闭清理;
// Abridged excerpt of TaskAttemptListenerImpl: the TaskUmbilicalProtocol
// endpoint that child JVMs talk to. It keeps the jvmID -> Task mapping for
// pending tasks, tracks which JVMs have been launched, hands tasks out via
// getTask() and receives status updates via statusUpdate().
// NOTE(review): fields such as `address`, `context`, `LOG` and the local
// `jvmTask` declaration are not shown in this excerpt.
public class TaskAttemptListenerImpl extends CompositeService
implements TaskUmbilicalProtocol, TaskAttemptListener {
private Server server; // RPC server for the TaskUmbilicalProtocol
// Heartbeat handler for the tasks running in child JVMs
protected TaskHeartbeatHandler taskHeartbeatHandler;
// Mapping from jvmId (job id + container id) to the Task registered for it
private ConcurrentMap<WrappedJvmID, org.apache.hadoop.mapred.Task>
jvmIDToActiveAttemptMap
= new ConcurrentHashMap<WrappedJvmID, org.apache.hadoop.mapred.Task>();
// jvmIds that have been registered as launched (see registerLaunchedTask)
private Set<WrappedJvmID> launchedJVMs = Collections
.newSetFromMap(new ConcurrentHashMap<WrappedJvmID, Boolean>());
// Builds and starts the RPC server for the TaskUmbilicalProtocol and records
// the address it listens on. An IOException is rethrown as
// YarnRuntimeException.
protected void startRpcServer() {
Configuration conf = getConfig();
try {
server = new RPC.Builder(conf).setProtocol(TaskUmbilicalProtocol.class)
.setInstance(this).setBindAddress().setPort(0).build();
server.start();
this.address = NetUtils.createSocketAddrForHost( // remember the RPC server address
context.getNMHostname(),
server.getListenerAddress().getPort());
} catch (IOException e) {
throw new YarnRuntimeException(e);
}
}
// Status update from a running task: refreshes the heartbeat and copies
// progress, state string, phase and counters into a TaskAttemptStatus that is
// dispatched as a TaskAttemptStatusUpdateEvent. Always returns true in this
// excerpt. (The article also mentions map/shuffle/sort finish times; that
// code is elided here.)
@Override
public boolean statusUpdate(TaskAttemptID taskAttemptID,
TaskStatus taskStatus) throws IOException, InterruptedException {
org.apache.hadoop.mapreduce.v2.api.records.TaskAttemptId yarnAttemptID =
TypeConverter.toYarn(taskAttemptID);
taskHeartbeatHandler.progressing(yarnAttemptID); // refresh this attempt's heartbeat in taskHeartbeatHandler
TaskAttemptStatus taskAttemptStatus =
new TaskAttemptStatus();
taskAttemptStatus.id = yarnAttemptID;
// Task sends the updated progress to the TT.
taskAttemptStatus.progress = taskStatus.getProgress();
LOG.info("Progress of TaskAttempt " + taskAttemptID + " is : "
+ taskStatus.getProgress());
// Task sends the updated state-string to the TT.
taskAttemptStatus.stateString = taskStatus.getStateString();
// Task sends the updated phase to the TT.
taskAttemptStatus.phase = TypeConverter.toYarn(taskStatus.getPhase());
// Counters are updated by the task. Convert counters into new format as
// that is the primary storage format inside the AM to avoid multiple
// conversions and unnecessary heap usage.
taskAttemptStatus.counters = new org.apache.hadoop.mapreduce.Counters(
taskStatus.getCounters());
// (updates of the remaining status fields are elided in this excerpt)
// Dispatch a TaskAttemptStatusUpdateEvent carrying the new status.
context.getEventHandler().handle(
new TaskAttemptStatusUpdateEvent(taskAttemptStatus.id,
taskAttemptStatus));
return true;
}
// Hands the registered task to the child JVM that asks for it.
// - jvmId unknown to jvmIDToActiveAttemptMap: returns TASK_FOR_INVALID_JVM.
// - jvmId known but not yet in launchedJVMs: returns null.
// - otherwise: removes the entry from both collections and returns the task
//   wrapped in a JvmTask.
@Override
public JvmTask getTask(JvmContext context) throws IOException {
// A rough imitation of code from TaskTracker.
JVMId jvmId = context.jvmId;
WrappedJvmID wJvmID = new WrappedJvmID(jvmId.getJobId(), jvmId.isMap,
jvmId.getId());
// Look the task up in jvmIDToActiveAttemptMap
if (!jvmIDToActiveAttemptMap.containsKey(wJvmID)) {
LOG.info("JVM with ID: " + jvmId + " is invalid and will be killed.");
jvmTask = TASK_FOR_INVALID_JVM;
} else {
if (!launchedJVMs.contains(wJvmID)) {
jvmTask = null;
LOG.info("JVM with ID: " + jvmId
+ " asking for task before AM launch registered. Given null task");
} else {
// remove the task as it is no more needed and free up the memory.
// Also we have already told the JVM to process a task, so it is no
// longer pending, and further request should ask it to exit.
// Return the task registered for wJvmID
org.apache.hadoop.mapred.Task task =
jvmIDToActiveAttemptMap.remove(wJvmID);
launchedJVMs.remove(wJvmID);
LOG.info("JVM with ID: " + jvmId + " given task: " + task.getTaskID());
jvmTask = new JvmTask(task, false);
}
}
return jvmTask;
}
// Records a task as pending for the given jvmID.
@Override
public void registerPendingTask(
org.apache.hadoop.mapred.Task task, WrappedJvmID jvmID) {
// Simply store the jvmID -> task mapping
jvmIDToActiveAttemptMap.put(jvmID, task);
}
// Marks the JVM as launched and registers the attempt with the heartbeat
// handler.
@Override
public void registerLaunchedTask(
org.apache.hadoop.mapreduce.v2.api.records.TaskAttemptId attemptID,
WrappedJvmID jvmId) {
// Per the article this is invoked once the container has been launched:
// the jvmId is added to launchedJVMs and the attempt is registered with
// taskHeartbeatHandler.
launchedJVMs.add(jvmId);
taskHeartbeatHandler.register(attemptID);
}
}
在taskAttemptListener对象中,可以看到其内部比较重要的对象以及对应的重要方法:其内部持有一个rpc服务端,用于响应TaskUmbilicalProtocol协议接口;它会保存对应的任务,并接收来自YarnChild中task任务通过rpc方法statusUpdate()汇报的运行状态,同时更新其心跳信息。
之后便会通过createContainerLaunchContext()构造ContainerLaunchContext对应的上下文,比较重要的是其所构建的启动具体container的任务命令cmds,在createContainerLaunchContext()方法中,其调用MapReduceChildJVM.getVMCommand()来构造具体的启动指令,其内部比较重要的部分如下:
// Excerpt from the child-JVM command construction: appends the main class
// followed by four arguments (listener host address, listener port,
// attempt id, jvm id) to vargs.
// Add main class and its arguments
vargs.add(YarnChild.class.getName()); // main of Child
// pass TaskAttemptListener's address
vargs.add(taskAttemptListenerAddr.getAddress().getHostAddress());
vargs.add(Integer.toString(taskAttemptListenerAddr.getPort()));
vargs.add(attemptID.toString()); // pass task identifier
// Finally add the jvmID
vargs.add(String.valueOf(jvmID.getId()));
其会指定对应的运行主类为YarnChild,并且会依次添加taskAttemptListener对象中运行的TaskUmbilicalProtocol协议RPC服务端的地址addr与端口port,以及对应的attemptID、jvmId等参数;之后便会通过ContainerLauncherImpl调用rpc协议接口ContainerManagementProtocol.startContainers()与对应的NodeManager通信,以启动一个container,随后触发TaskAttempt状态机的转移:
// State-machine arc: ASSIGNED -> RUNNING on TA_CONTAINER_LAUNCHED, handled by
// LaunchedContainerTransition.
.addTransition(TaskAttemptStateInternal.ASSIGNED, TaskAttemptStateInternal.RUNNING,
TaskAttemptEventType.TA_CONTAINER_LAUNCHED,
new LaunchedContainerTransition())
在触发函数中,其会向TaskAttemptListener进行对应任务的注册,以便启动对其的心跳超时信息监测;
// Excerpt from the launched-container transition:
// register it to TaskAttemptListener so that it can start monitoring it.
taskAttempt.taskAttemptListener
.registerLaunchedTask(taskAttempt.attemptId, taskAttempt.jvmID);
YarnChild
我们知道应用程序在该container上最终运行的是YarnChild进程,该进程内部执行的就是具体的MapTask、ReduceTask任务。接下来看YarnChild进程的启动以及map、reduce任务的具体运行过程,其YarnChild#main方法如下:
// YarnChild#main
// Abridged excerpt. Expected arguments, as read below:
//   args[0] host and args[1] port of the TaskUmbilicalProtocol server,
//   args[2] the TaskAttemptID string, args[3] the numeric JVM id.
// The method creates the umbilical RPC proxy, polls it until a task is
// handed out, configures that task and runs it as the child user.
// NOTE(review): `taskOwner`, `credentials`, `jt`, `taskid` and `cause` are set
// up in code elided from this excerpt, and the single catch block below is a
// condensed stand-in for the original error handling.
public static void main(String[] args) throws Throwable {
LOG.debug("Child starting");
// Job configuration loaded from the job conf file (job.xml)
final JobConf job = new JobConf(MRJobConfig.JOB_CONF_FILE);
// Initing with our JobConf allows us to avoid loading confs twice
Limits.init(job);
// Read host and port of the TaskUmbilicalProtocol RPC server from args
String host = args[0];
int port = Integer.parseInt(args[1]);
final InetSocketAddress address =
NetUtils.createSocketAddrForHost(host, port);
// Read the TaskAttemptID (args[2]) and the numeric JVM id (args[3]); the
// JVMId is built from the attempt's job id, whether it is a map task, and
// that number.
final TaskAttemptID firstTaskid = TaskAttemptID.forName(args[2]);
long jvmIdLong = Long.parseLong(args[3]);
JVMId jvmId = new JVMId(firstTaskid.getJobID(),
firstTaskid.getTaskType() == TaskType.MAP, jvmIdLong);
// ...... metric and token
// Create TaskUmbilicalProtocol as actual task owner.
// This proxy is the RPC client used to talk to the TaskUmbilicalProtocol
// server at `address` (fetching the task and, per the article, reporting
// task status).
final TaskUmbilicalProtocol umbilical =
taskOwner.doAs(new PrivilegedExceptionAction<TaskUmbilicalProtocol>() {
@Override
public TaskUmbilicalProtocol run() throws Exception {
return (TaskUmbilicalProtocol)RPC.getProxy(TaskUmbilicalProtocol.class,
TaskUmbilicalProtocol.versionID, address, job);
}
});
// report non-pid to application master
JvmContext context = new JvmContext(jvmId, "-1000");
LOG.debug("PID: " + System.getenv().get("JVM_PID"));
Task task = null;
UserGroupInformation childUGI = null;
ScheduledExecutorService logSyncer = null;
try {
JvmTask myTask = null;;
// poll for new task
// Sleeps min(idle * 500, 1500) ms before each attempt and loops until
// getTask() returns a non-null JvmTask.
for (int idle = 0; null == myTask; ++idle) {
long sleepTimeMilliSecs = Math.min(idle * 500, 1500);
LOG.info("Sleeping for " + sleepTimeMilliSecs
+ "ms before retrying again. Got null now.");
MILLISECONDS.sleep(sleepTimeMilliSecs);
// Ask the umbilical RPC endpoint for the task to execute
myTask = umbilical.getTask(context);
}
task = myTask.getTask();
YarnChild.taskid = task.getTaskID();
// Create the job-conf and set credentials
// Per the article, this initializes the task's configuration (runtime
// directories, output file names/paths, and so on).
configureTask(job, task, credentials, jt);
// Initiate Java VM metrics
JvmMetrics.initSingleton(jvmId.toString(), job.getSessionId());
childUGI = UserGroupInformation.createRemoteUser(System
.getenv(ApplicationConstants.Environment.USER.toString()));
// Add tokens to new user so that it may execute its task correctly.
childUGI.addCredentials(credentials);
// set job classloader if configured before invoking the task
MRApps.setJobClassLoader(job);
logSyncer = TaskLog.createLogSyncer();
// Create a final reference to the task for the doAs block
// The map/reduce task is actually executed here via task.run(), which
// receives the umbilical RPC client created above; per the article the
// task uses it to report progress, state and phase to the MRAppMaster.
final Task taskFinal = task;
childUGI.doAs(new PrivilegedExceptionAction<Object>() {
@Override
public Object run() throws Exception {
// use job-specified working directory
FileSystem.get(job).setWorkingDirectory(job.getWorkingDirectory());
taskFinal.run(job, umbilical); // run the task
return null;
}
});
} catch (FSError e) {
// Error handling: the failure is reported through the umbilical RPC
// methods shown below (condensed from the original catch blocks).
// ......
umbilical.fsError(taskid, e.getMessage());
taskFinal.taskCleanup(umbilical);
umbilical.fatalError(taskid, cause);
}
}
在YarnChild#main()函数执行中,首先会加载任务运行的配置文件job.xml;之后从java进程启动命令参数args中获取taskAttemptListener对象内部TaskUmbilicalProtocol RPC协议服务端server的host、port,并从args中取出该container实例运行的任务标识TaskAttemptID以及jvmId(其中jobId由TaskAttemptID得到,二者一起构造出JVMId);随后创建与TaskUmbilicalProtocol RPC服务端通信的RPC客户端代理umbilical。代理创建好后,便可以与对应的RPC Server服务端通信,通过rpc方法getTask(context)根据对应的jvmId(jobId、containerId)获取到对应的Task任务。(该任务是在TaskAttempt实例分配到container资源、状态机处理TaskAttemptEventType.TA_ASSIGNED事件并执行转移函数ContainerAssignedTransition时注册的:其将jvmId(jobId、containerId)和task实例保存在taskAttemptListener对象的jvmIDToActiveAttemptMap中。)在获取到实际需要运行的task任务之后,便调用taskFinal.run(job, umbilical)运行该任务,并在运行中通过RPC客户端代理umbilical向MRAppMaster实时汇报任务状态、进度等信息,同时更新该任务在MRAppMaster上的心跳。
其MapTask、ReduceTask任务的run()方法分别如下:
MapTask:
// MapTask#run
// Abridged excerpt: starts the reporter, initializes the task, runs the
// mapper through the new or old API depending on job.getUseNewMapper(), then
// calls done().
public void run(final JobConf job, final TaskUmbilicalProtocol umbilical)
throws IOException, ClassNotFoundException, InterruptedException {
this.umbilical = umbilical;
// (elided) configure how much of the overall progress each map phase accounts for
// Start the task reporter (described in the article as running a periodic
// status/heartbeat reporting thread)
TaskReporter reporter = startReporter(umbilical);
boolean useNewApi = job.getUseNewMapper();
initialize(job, getJobID(), reporter, useNewApi);
// check if it is a cleanupJobTask
if (useNewApi) { // entry point for the new-API mapper
runNewMapper(job, splitMetaInfo, umbilical, reporter);
} else {
runOldMapper(job, splitMetaInfo, umbilical, reporter);
}
done(umbilical, reporter);
}
// Abridged excerpt of the new-API mapper driver: initializes the input, runs
// the mapper, marks the map phase complete, switches to the SORT phase and
// sends a status update, then closes input and output.
// NOTE(review): construction of `input`, `output`, `split`, `mapper` and
// `mapperContext` is elided from this excerpt.
private <INKEY,INVALUE,OUTKEY,OUTVALUE>
void runNewMapper(final JobConf job,
final TaskSplitIndex splitIndex,
final TaskUmbilicalProtocol umbilical,
TaskReporter reporter
) throws IOException, ClassNotFoundException,
InterruptedException {
// (elided) build the task context, the Mapper instance, the input format,
// the input split and the output collector
// make a task context so we can get the classes
// .......
try {
input.initialize(split, mapperContext);
// Public API entry point: the user-written map function runs inside this call
mapper.run(mapperContext);
// Move on to the next phase and report the status through the umbilical
mapPhase.complete();
setPhase(TaskStatus.Phase.SORT);
statusUpdate(umbilical);
input.close();
input = null;
output.close(mapperContext);
output = null;
} finally {
// input/output are set to null after a successful close above, so these
// calls receive null on the normal path
closeQuietly(input);
closeQuietly(output, mapperContext);
}
}
// Mapper.run()
// Calls setup(), then map() once for every key/value pair the context
// yields, and always calls cleanup() afterwards (finally block).
public void run(Context context) throws IOException, InterruptedException {
setup(context);
try {
while (context.nextKeyValue()) {
// The user-written map function is invoked here
map(context.getCurrentKey(), context.getCurrentValue(), context);
}
} finally {
cleanup(context);
}
}
ReduceTask:
// Abridged excerpt of the reduce task's run(): starts the reporter,
// initializes the task, runs the shuffle plugin to obtain the input
// iterator, switches to the REDUCE phase, runs the reducer through the new
// or old API, then closes the shuffle plugin and calls done().
// NOTE(review): declarations of `rIter`, `shuffleConsumerPlugin`,
// `comparator`, `keyClass` and `valueClass` are elided from this excerpt.
public void run(JobConf job, final TaskUmbilicalProtocol umbilical)
throws IOException, InterruptedException, ClassNotFoundException {
job.setBoolean(JobContext.SKIP_RECORDS, isSkipping());
// (elided) set up the phases of the reduce task
// Start the task reporter (described in the article as running a periodic
// status/heartbeat reporting thread)
TaskReporter reporter = startReporter(umbilical);
boolean useNewApi = job.getUseNewReducer();
initialize(job, getJobID(), reporter, useNewApi);
// check if it is a cleanupJobTask
// Initialize the codec
// (elided) configure the shuffle plugin and the combine collector used by the reduce side
// Run the shuffle, which fetches the remote map output; per the article the
// list of completed map tasks is also obtained through the umbilical RPC
// client from the taskAttemptListener.
rIter = shuffleConsumerPlugin.run();
// free up the data structures
mapOutputFilesOnDisk.clear();
// Move on to the next phase and report the status through the umbilical
sortPhase.complete(); // sort is complete
setPhase(TaskStatus.Phase.REDUCE);
statusUpdate(umbilical);
if (useNewApi) { // entry point for the new-API reducer
runNewReducer(job, umbilical, reporter, rIter, comparator,
keyClass, valueClass);
} else {
runOldReducer(job, umbilical, reporter, rIter, comparator,
keyClass, valueClass);
}
shuffleConsumerPlugin.close();
done(umbilical, reporter);
}
// Abridged excerpt of the new-API reducer driver: runs the reducer and
// always closes the tracked record writer afterwards.
// NOTE(review): construction of `reducer`, `reducerContext` and `trackedRW`
// is elided from this excerpt.
private <INKEY,INVALUE,OUTKEY,OUTVALUE>
void runNewReducer(JobConf job,
final TaskUmbilicalProtocol umbilical,
final TaskReporter reporter,
RawKeyValueIterator rIter,
RawComparator<INKEY> comparator,
Class<INKEY> keyClass,
Class<INVALUE> valueClass
) throws IOException,InterruptedException,
ClassNotFoundException {
// wrap value iterator to report progress.
// Per the article: the intermediate map output fetched by the shuffle is
// sorted, and the sorted records are fed to the reducer as they are produced.
try {
// Public API entry point: the user-written reduce function runs inside this call
reducer.run(reducerContext);
} finally {
trackedRW.close(reducerContext);
}
}
}
// Reducer-side run loop (presumably Reducer#run, the method invoked through
// reducer.run(reducerContext)): calls setup(), then reduce() once per key
// the context yields, and always calls cleanup() afterwards (finally block).
public void run(Context context) throws IOException, InterruptedException {
setup(context);
try {
while (context.nextKey()) {
// The user-written reduce function is invoked here
reduce(context.getCurrentKey(), context.getValues(), context);
// If a back up store is used, reset it
Iterator<VALUEIN> iter = context.getValues().iterator();
if(iter instanceof ReduceContext.ValueIterator) {
((ReduceContext.ValueIterator<VALUEIN>)iter).resetBackupStore();
}
}
} finally {
cleanup(context);
}
}