ExpireLaunchingTasks 是 JobTracker 的后台线程,用于跟踪已分配任务的启动时间。每当 TaskTracker 发送心跳时,JobTracker 会为其分配合适的任务,并将 TaskAttemptID 与分配时间以键值对的形式记录到 launchingTasks 集合中。如果之后的心跳表明该任务已成功启动,则将其从 launchingTasks 中删除;如果超过 TASKTRACKER_EXPIRY_INTERVAL 仍未收到该任务的状态反馈,则将其标记为启动失败并从集合中移除。我们可以把这个数据结构看作“待启动任务”的集合。
/**
 * Background sweeper for task attempts that were assigned to a task tracker
 * but have not yet been seen in a status report.
 *
 * <p>Attempts are registered with {@link #addNewTask} and dropped with
 * {@link #removeTask}. The {@link #run} loop periodically fails and removes
 * every attempt that has been waiting longer than
 * {@code TASKTRACKER_EXPIRY_INTERVAL}.
 */
private class ExpireLaunchingTasks implements Runnable {
  /**
   * This is a map of the tasks that have been assigned to task trackers,
   * but that have not yet been seen in a status report.
   * map: task-id -> time-assigned
   *
   * <p>Declared final because every access synchronizes on this object; a
   * reassignable field would let threads lock on different monitors.
   */
  private final Map<TaskAttemptID, Long> launchingTasks =
    new LinkedHashMap<TaskAttemptID, Long>();

  /**
   * Sweeps {@code launchingTasks} until the thread is interrupted, failing
   * attempts whose age exceeds {@code TASKTRACKER_EXPIRY_INTERVAL}.
   */
  public void run() {
    while (true) {
      try {
        // Sweep three times per expiry interval. The original note cites
        // mapred.tasktracker.expiry.interval as the related setting.
        Thread.sleep(TASKTRACKER_EXPIRY_INTERVAL/3);
        long now = clock.getTime();
        if (LOG.isDebugEnabled()) {
          LOG.debug("Starting launching task sweep");
        }
        // Lock order: the JobTracker first, then the map.
        synchronized (JobTracker.this) {
          synchronized (launchingTasks) {
            // Walk the entries in insertion order.
            Iterator<Map.Entry<TaskAttemptID, Long>> itr =
              launchingTasks.entrySet().iterator();
            while (itr.hasNext()) {
              Map.Entry<TaskAttemptID, Long> pair = itr.next();
              TaskAttemptID taskId = pair.getKey();
              // Time elapsed since the attempt was registered.
              long age = now - pair.getValue().longValue();
              LOG.info(taskId + " is " + age + " ms debug.");
              if (age <= TASKTRACKER_EXPIRY_INTERVAL) {
                // the tasks are sorted by start time, so once we find
                // one that we want to keep, we are done for this cycle.
                break;
              }
              // Timed out: mark the attempt as failed.
              LOG.info("Launching task " + taskId + " timed out.");
              TaskInProgress tip = taskidToTIPMap.get(taskId);
              if (tip != null) {
                JobInProgress job = tip.getJob();
                String trackerName = getAssignedTracker(taskId);
                TaskTrackerStatus trackerStatus =
                  getTaskTrackerStatus(trackerName);
                // This might happen when the tasktracker has already
                // expired and this thread tries to call failedtask
                // again. expire tasktracker should have called failed
                // task!
                if (trackerStatus != null) {
                  job.failedTask(tip, taskId, "Error launching task",
                                 tip.isMapTask() ? TaskStatus.Phase.MAP
                                                 : TaskStatus.Phase.STARTING,
                                 TaskStatus.State.FAILED,
                                 trackerName);
                }
              }
              // Drop the expired entry whether or not it could be failed.
              itr.remove();
            }
          }
        }
      } catch (InterruptedException ie) {
        // all done; restore the interrupt status so it is not lost
        Thread.currentThread().interrupt();
        break;
      } catch (Exception e) {
        // Keep the sweeper alive: log and retry on the next cycle.
        LOG.error("Expire Launching Task Thread got exception: " +
                  StringUtils.stringifyException(e));
      }
    }
  }

  /**
   * Records an attempt as launching, stamped with the current clock time.
   * Per the original note, this is called when the JobTracker answers a
   * heartbeat with a newly assigned task.
   *
   * @param taskName the attempt that was just assigned
   */
  public void addNewTask(TaskAttemptID taskName) {
    synchronized (launchingTasks) {
      launchingTasks.put(taskName,
                         clock.getTime());
    }
  }

  /**
   * Stops tracking an attempt. Per the original note, this is called once
   * the task is no longer in the launching state.
   *
   * @param taskName the attempt to forget; removing an unknown id is a no-op
   */
  public void removeTask(TaskAttemptID taskName) {
    synchronized (launchingTasks) {
      launchingTasks.remove(taskName);
    }
  }
}
记录任务启动失败的操作(即上文超时检测中调用的 job.failedTask)如下:
/**
 * Fails a task attempt for the given reason when no status object is
 * available, by synthesizing one and feeding it to
 * {@code updateTaskStatus}.
 *
 * <p>The synthesized status carries zero progress, fresh {@code Counters},
 * and {@code reason} for both string arguments of
 * {@code TaskStatus.createTaskStatus}. Its start time is taken from the
 * attempt's existing status when there is one, otherwise from the current
 * clock; its finish time is always the current clock. If the tip was
 * complete before the update and is not afterwards, the failure is also
 * written to {@code JobHistory}.
 *
 * <p>Assuming {@link JobTracker} is locked on entry.
 *
 * @param tip The task's tip
 * @param taskid The task id
 * @param reason The reason that the task failed
 * @param phase The phase to record in the synthesized status
 * @param state The state to record in the synthesized status
 * @param trackerName The task tracker the task failed on
 */
public void failedTask(TaskInProgress tip, TaskAttemptID taskid, String reason,
                       TaskStatus.Phase phase, TaskStatus.State state,
                       String trackerName) {
  final boolean mapTask = tip.isMapTask();
  TaskStatus failure = TaskStatus.createTaskStatus(
      mapTask,
      taskid,
      0.0f,
      mapTask ? numSlotsPerMap : numSlotsPerReduce,
      state,
      reason,
      reason,
      trackerName,
      phase,
      new Counters());

  // Keep the attempt's real start time if it already reported one.
  TaskStatus previous = tip.getTaskStatus(taskid);
  long startedAt;
  if (previous != null) {
    startedAt = previous.getStartTime();
  } else {
    startedAt = jobtracker.getClock().getTime();
  }
  failure.setStartTime(startedAt);
  failure.setFinishTime(jobtracker.getClock().getTime());

  // Sample completion on both sides of the update to detect a flip.
  boolean completeBefore = tip.isComplete();
  updateTaskStatus(tip, failure);
  boolean completeAfter = tip.isComplete();
  if (!completeBefore || completeAfter) {
    return;
  }

  // A previously successful tip has just been marked as failed.
  String kind = getTaskType(tip);
  JobHistory.Task.logFailed(tip.getTIPId(), kind,
                            tip.getExecFinishTime(), reason, taskid);
}