1 TaskTracker main函数
public static void main(String argv[]) throws Exception {
StringUtils.startupShutdownMessage(TaskTracker.class, argv, LOG);
if (argv.length != 0) {
System.out.println("usage: TaskTracker");
System.exit(-1);
}
try {
JobConf conf=new JobConf();
// enable the server to track time spent waiting on locks
ReflectionUtils.setContentionTracing
(conf.getBoolean("tasktracker.contention.tracking", false));
new TaskTracker(conf).run();
} catch (Throwable e) {
LOG.error("Can not start task tracker because "+
StringUtils.stringifyException(e));
System.exit(-1);
}
}
可知主要是new TaskTracker(conf).run();这句代码,创建一个新的TaskTracker并执行该线程。
2 TaskTracker的构造函数
public TaskTracker(JobConf conf) throws IOException {
originalConf = conf;
maxCurrentMapTasks = conf.getInt(
"mapred.tasktracker.map.tasks.maximum", 2);
maxCurrentReduceTasks = conf.getInt(
"mapred.tasktracker.reduce.tasks.maximum", 2);
this.jobTrackAddr = JobTracker.getAddress(conf);
String infoAddr =
NetUtils.getServerAddress(conf,
"tasktracker.http.bindAddress",
"tasktracker.http.port",
"mapred.task.tracker.http.address");
InetSocketAddress infoSocAddr = NetUtils.createSocketAddr(infoAddr);
String httpBindAddress = infoSocAddr.getHostName();
int httpPort = infoSocAddr.getPort();
this.server = new HttpServer("task", httpBindAddress, httpPort,
httpPort == 0, conf);
workerThreads = conf.getInt("tasktracker.http.threads", 40);
this.shuffleServerMetrics = new ShuffleServerMetrics(conf);
server.setThreads(1, workerThreads);
// let the jsp pages get to the task tracker, config, and other relevant
// objects
FileSystem local = FileSystem.getLocal(conf);
this.localDirAllocator = new LocalDirAllocator("mapred.local.dir");
server.setAttribute("task.tracker", this);
server.setAttribute("local.file.system", local);
server.setAttribute("conf", conf);
server.setAttribute("log", LOG);
server.setAttribute("localDirAllocator", localDirAllocator);
server.setAttribute("shuffleServerMetrics", shuffleServerMetrics);
server.addInternalServlet("mapOutput", "/mapOutput", MapOutputServlet.class);
server.addInternalServlet("taskLog", "/tasklog", TaskLogServlet.class);
server.start();
this.httpPort = server.getPort();
checkJettyPort(httpPort);
initialize();
}
该函数的主要功能有两个,一个是创建HttpServer,另外一个是initialize,initialize其实才是完成TaskTracker初始化的主要工作
3 initialize函数
synchronized void initialize() throws IOException {
// use configured nameserver & interface to get local hostname
this.fConf = new JobConf(originalConf);
if (fConf.get("slave.host.name") != null) {
this.localHostname = fConf.get("slave.host.name");
}
if (localHostname == null) {
this.localHostname =
DNS.getDefaultHost
(fConf.get("mapred.tasktracker.dns.interface","default"),
fConf.get("mapred.tasktracker.dns.nameserver","default"));
}
//check local disk
checkLocalDirs(this.fConf.getLocalDirs());
fConf.deleteLocalFiles(SUBDIR);
// Clear out state tables
this.tasks.clear();
this.runningTasks = new LinkedHashMap<TaskAttemptID, TaskInProgress>();
this.runningJobs = new TreeMap<JobID, RunningJob>();
this.mapTotal = 0;
this.reduceTotal = 0;
this.acceptNewTasks = true;
this.status = null;
this.minSpaceStart = this.fConf.getLong("mapred.local.dir.minspacestart", 0L);
this.minSpaceKill = this.fConf.getLong("mapred.local.dir.minspacekill", 0L);
//tweak the probe sample size (make it a function of numCopiers)
probe_sample_size = this.fConf.getInt("mapred.tasktracker.events.batchsize", 500);
Class<? extends TaskTrackerInstrumentation> metricsInst = getInstrumentationClass(fConf);
try {
java.lang.reflect.Constructor<? extends TaskTrackerInstrumentation> c =
metricsInst.getConstructor(new Class[] {TaskTracker.class} );
this.myInstrumentation = c.newInstance(this);
} catch(Exception e) {
//Reflection can throw lots of exceptions -- handle them all by
//falling back on the default.
LOG.error("failed to initialize taskTracker metrics", e);
this.myInstrumentation = new TaskTrackerMetricsInst(this);
}
// bind address
String address =
NetUtils.getServerAddress(fConf,
"mapred.task.tracker.report.bindAddress",
"mapred.task.tracker.report.port",
"mapred.task.tracker.report.address");
InetSocketAddress socAddr = NetUtils.createSocketAddr(address);
String bindAddress = socAddr.getHostName();
int tmpPort = socAddr.getPort();
this.jvmManager = new JvmManager(this);
// Set service-level authorization security policy
if (this.fConf.getBoolean(
ServiceAuthorizationManager.SERVICE_AUTHORIZATION_CONFIG, false)) {
PolicyProvider policyProvider =
(PolicyProvider)(ReflectionUtils.newInstance(
this.fConf.getClass(PolicyProvider.POLICY_PROVIDER_CONFIG,
MapReducePolicyProvider.class, PolicyProvider.class),
this.fConf));
SecurityUtil.setPolicy(new ConfiguredPolicy(this.fConf, policyProvider));
}
// RPC initialization
int max = maxCurrentMapTasks > maxCurrentReduceTasks ?
maxCurrentMapTasks : maxCurrentReduceTasks;
//set the num handlers to max*2 since canCommit may wait for the duration
//of a heartbeat RPC
this.taskReportServer =
RPC.getServer(this, bindAddress, tmpPort, 2 * max, false, this.fConf);
this.taskReportServer.start();
// get the assigned address
this.taskReportAddress = taskReportServer.getListenerAddress();
this.fConf.set("mapred.task.tracker.report.address",
taskReportAddress.getHostName() + ":" + taskReportAddress.getPort());
LOG.info("TaskTracker up at: " + this.taskReportAddress);
this.taskTrackerName = "tracker_" + localHostname + ":" + taskReportAddress;
LOG.info("Starting tracker " + taskTrackerName);
// Clear out temporary files that might be lying around
DistributedCache.purgeCache(this.fConf);
cleanupStorage();
this.jobClient = (InterTrackerProtocol)
RPC.waitForProxy(InterTrackerProtocol.class,
InterTrackerProtocol.versionID,
jobTrackAddr, this.fConf);
this.justInited = true;
this.running = true;
// start the thread that will fetch map task completion events
this.mapEventsFetcher = new MapEventsFetcherThread();
mapEventsFetcher.setDaemon(true);
mapEventsFetcher.setName(
"Map-events fetcher for all reduce tasks " + "on " +
taskTrackerName);
mapEventsFetcher.start();
initializeMemoryManagement();
this.indexCache = new IndexCache(this.fConf);
mapLauncher = new TaskLauncher(maxCurrentMapTasks);
reduceLauncher = new TaskLauncher(maxCurrentReduceTasks);
mapLauncher.start();
reduceLauncher.start();
}
其主要功能是初始化一般属性,创建一个taskReportServer并通过rpc获得一个jobClient引用,创建jobClient用到的地址在构造函数中已经得到初始化 this.jobTrackAddr = JobTracker.getAddress(conf);。还有创建一个mapLauncher和reduceLauncher并启动。
4 mapLauncher.run()
public void run() {
while (!Thread.interrupted()) {
try {
TaskInProgress tip;
synchronized (tasksToLaunch) {
while (tasksToLaunch.isEmpty()) {
tasksToLaunch.wait();
}
//get the TIP
tip = tasksToLaunch.remove(0);
LOG.info("Trying to launch : " + tip.getTask().getTaskID());
}
//wait for a slot to run
synchronized (numFreeSlots) {
while (numFreeSlots.get() == 0) {
numFreeSlots.wait();
}
LOG.info("In TaskLauncher, current free slots : " + numFreeSlots.get()+
" and trying to launch "+tip.getTask().getTaskID());
numFreeSlots.set(numFreeSlots.get() - 1);
assert (numFreeSlots.get() >= 0);
}
synchronized (tip) {
//to make sure that there is no kill task action for this
if (tip.getRunState() != TaskStatus.State.UNASSIGNED &&
tip.getRunState() != TaskStatus.State.FAILED_UNCLEAN &&
tip.getRunState() != TaskStatus.State.KILLED_UNCLEAN) {
//got killed externally while still in the launcher queue
addFreeSlot();
continue;
}
tip.slotTaken = true;
}
//got a free slot. launch the task
startNewTask(tip);
} catch (InterruptedException e) {
return; // ALL DONE
} catch (Throwable th) {
LOG.error("TaskLauncher error " +
StringUtils.stringifyException(th));
}
}
}
它先等待任务的到来,然后等待空闲的slot,等两者兼备时选择合适状态的任务执行startNewTask(tip);
5 startNewTask
private void startNewTask(TaskInProgress tip) {
try {
localizeJob(tip);
} catch (Throwable e) {
String msg = ("Error initializing " + tip.getTask().getTaskID() +
":\n" + StringUtils.stringifyException(e));
LOG.warn(msg);
tip.reportDiagnosticInfo(msg);
try {
tip.kill(true);
tip.cleanup(true);
} catch (IOException ie2) {
LOG.info("Error cleaning up " + tip.getTask().getTaskID() + ":\n" +
StringUtils.stringifyException(ie2));
}
// Careful!
// This might not be an 'Exception' - don't handle 'Error' here!
if (e instanceof Error) {
throw ((Error) e);
}
}
}
直接进入localizeJob
6 localizeJob
private void localizeJob(TaskInProgress tip) throws IOException {
Path localJarFile = null;
Task t = tip.getTask();
JobID jobId = t.getJobID();
Path jobFile = new Path(t.getJobFile());
// Get sizes of JobFile and JarFile
// sizes are -1 if they are not present.
FileStatus status = null;
long jobFileSize = -1;
try {
status = systemFS.getFileStatus(jobFile);
jobFileSize = status.getLen();
} catch(FileNotFoundException fe) {
jobFileSize = -1;
}
Path localJobFile = lDirAlloc.getLocalPathForWrite(
getLocalJobDir(jobId.toString())
+ Path.SEPARATOR + "job.xml",
jobFileSize, fConf);
RunningJob rjob = addTaskToJob(jobId, tip);
synchronized (rjob) {
if (!rjob.localized) {
FileSystem localFs = FileSystem.getLocal(fConf);
// this will happen on a partial execution of localizeJob.
// Sometimes the job.xml gets copied but copying job.jar
// might throw out an exception
// we should clean up and then try again
Path jobDir = localJobFile.getParent();
if (localFs.exists(jobDir)){
localFs.delete(jobDir, true);
boolean b = localFs.mkdirs(jobDir);
if (!b)
throw new IOException("Not able to create job directory "
+ jobDir.toString());
}
systemFS.copyToLocalFile(jobFile, localJobFile);
JobConf localJobConf = new JobConf(localJobFile);
// create the 'work' directory
// job-specific shared directory for use as scratch space
Path workDir = lDirAlloc.getLocalPathForWrite(
(getLocalJobDir(jobId.toString())
+ Path.SEPARATOR + "work"), fConf);
if (!localFs.mkdirs(workDir)) {
throw new IOException("Mkdirs failed to create "
+ workDir.toString());
}
System.setProperty("job.local.dir", workDir.toString());
localJobConf.set("job.local.dir", workDir.toString());
// copy Jar file to the local FS and unjar it.
String jarFile = localJobConf.getJar();
long jarFileSize = -1;
if (jarFile != null) {
Path jarFilePath = new Path(jarFile);
try {
status = systemFS.getFileStatus(jarFilePath);
jarFileSize = status.getLen();
} catch(FileNotFoundException fe) {
jarFileSize = -1;
}
// Here we check for and we check five times the size of jarFileSize
// to accommodate for unjarring the jar file in work directory
localJarFile = new Path(lDirAlloc.getLocalPathForWrite(
getLocalJobDir(jobId.toString())
+ Path.SEPARATOR + "jars",
5 * jarFileSize, fConf), "job.jar");
if (!localFs.mkdirs(localJarFile.getParent())) {
throw new IOException("Mkdirs failed to create jars directory ");
}
systemFS.copyToLocalFile(jarFilePath, localJarFile);
localJobConf.setJar(localJarFile.toString());
OutputStream out = localFs.create(localJobFile);
try {
localJobConf.writeXml(out);
} finally {
out.close();
}
// also unjar the job.jar files
RunJar.unJar(new File(localJarFile.toString()),
new File(localJarFile.getParent().toString()));
}
rjob.keepJobFiles = ((localJobConf.getKeepTaskFilesPattern() != null) ||
localJobConf.getKeepFailedTaskFiles());
rjob.localized = true;
rjob.jobConf = localJobConf;
}
}
launchTaskForJob(tip, new JobConf(rjob.jobConf));
}
前面的主要功能是将作业本地化, systemFS.copyToLocalFile(jobFile, localJobFile);systemFS.copyToLocalFile(jarFilePath, localJarFile);这两句完成程序和数据的拷贝。
最后launchTaskForJob(tip, new JobConf(rjob.jobConf));
7 launchTaskForJob
private void launchTaskForJob(TaskInProgress tip, JobConf jobConf) throws IOException{
synchronized (tip) {
tip.setJobConf(jobConf);
tip.launchTask();
}
}
8 tip.launchTask
public synchronized void launchTask() throws IOException {
if (this.taskStatus.getRunState() == TaskStatus.State.UNASSIGNED ||
this.taskStatus.getRunState() == TaskStatus.State.FAILED_UNCLEAN ||
this.taskStatus.getRunState() == TaskStatus.State.KILLED_UNCLEAN) {
localizeTask(task);
if (this.taskStatus.getRunState() == TaskStatus.State.UNASSIGNED) {
this.taskStatus.setRunState(TaskStatus.State.RUNNING);
}
this.runner = task.createRunner(TaskTracker.this, this);
this.runner.start();
this.taskStatus.setStartTime(System.currentTimeMillis());
} else {
LOG.info("Not launching task: " + task.getTaskID() +
" since it's state is " + this.taskStatus.getRunState());
}
}
9 TaskRunner.run()
public final void run() {
try {
//before preparing the job localize
//all the archives
TaskAttemptID taskid = t.getTaskID();
LocalDirAllocator lDirAlloc = new LocalDirAllocator("mapred.local.dir");
File jobCacheDir = null;
if (conf.getJar() != null) {
jobCacheDir = new File(
new Path(conf.getJar()).getParent().toString());
}
File workDir = new File(lDirAlloc.getLocalPathToRead(
TaskTracker.getLocalTaskDir(
t.getJobID().toString(),
t.getTaskID().toString(),
t.isTaskCleanupTask())
+ Path.SEPARATOR + MRConstants.WORKDIR,
conf). toString());
URI[] archives = DistributedCache.getCacheArchives(conf);
URI[] files = DistributedCache.getCacheFiles(conf);
FileStatus fileStatus;
FileSystem fileSystem;
Path localPath;
String baseDir;
if ((archives != null) || (files != null)) {
if (archives != null) {
String[] archivesTimestamps =
DistributedCache.getArchiveTimestamps(conf);
Path[] p = new Path[archives.length];
for (int i = 0; i < archives.length;i++){
fileSystem = FileSystem.get(archives[i], conf);
fileStatus = fileSystem.getFileStatus(
new Path(archives[i].getPath()));
String cacheId = DistributedCache.makeRelative(archives[i],conf);
String cachePath = TaskTracker.getCacheSubdir() +
Path.SEPARATOR + cacheId;
localPath = lDirAlloc.getLocalPathForWrite(cachePath,
fileStatus.getLen(), conf);
baseDir = localPath.toString().replace(cacheId, "");
p[i] = DistributedCache.getLocalCache(archives[i], conf,
new Path(baseDir),
fileStatus,
true, Long.parseLong(
archivesTimestamps[i]),
new Path(workDir.
getAbsolutePath()),
false);
}
DistributedCache.setLocalArchives(conf, stringifyPathArray(p));
}
if ((files != null)) {
String[] fileTimestamps = DistributedCache.getFileTimestamps(conf);
Path[] p = new Path[files.length];
for (int i = 0; i < files.length;i++){
fileSystem = FileSystem.get(files[i], conf);
fileStatus = fileSystem.getFileStatus(
new Path(files[i].getPath()));
String cacheId = DistributedCache.makeRelative(files[i], conf);
String cachePath = TaskTracker.getCacheSubdir() +
Path.SEPARATOR + cacheId;
localPath = lDirAlloc.getLocalPathForWrite(cachePath,
fileStatus.getLen(), conf);
baseDir = localPath.toString().replace(cacheId, "");
p[i] = DistributedCache.getLocalCache(files[i], conf,
new Path(baseDir),
fileStatus,
false, Long.parseLong(
fileTimestamps[i]),
new Path(workDir.
getAbsolutePath()),
false);
}
DistributedCache.setLocalFiles(conf, stringifyPathArray(p));
}
Path localTaskFile = new Path(t.getJobFile());
FileSystem localFs = FileSystem.getLocal(conf);
localFs.delete(localTaskFile, true);
OutputStream out = localFs.create(localTaskFile);
try {
conf.writeXml(out);
} finally {
out.close();
}
}
if (!prepare()) {
return;
}
String sep = System.getProperty("path.separator");
StringBuffer classPath = new StringBuffer();
// start with same classpath as parent process
classPath.append(System.getProperty("java.class.path"));
classPath.append(sep);
if (!workDir.mkdirs()) {
if (!workDir.isDirectory()) {
LOG.fatal("Mkdirs failed to create " + workDir.toString());
}
}
String jar = conf.getJar();
if (jar != null) {
// if jar exists, it into workDir
File[] libs = new File(jobCacheDir, "lib").listFiles();
if (libs != null) {
for (int i = 0; i < libs.length; i++) {
classPath.append(sep); // add libs from jar to classpath
classPath.append(libs[i]);
}
}
classPath.append(sep);
classPath.append(new File(jobCacheDir, "classes"));
classPath.append(sep);
classPath.append(jobCacheDir);
}
// include the user specified classpath
//archive paths
Path[] archiveClasspaths = DistributedCache.getArchiveClassPaths(conf);
if (archiveClasspaths != null && archives != null) {
Path[] localArchives = DistributedCache
.getLocalCacheArchives(conf);
if (localArchives != null){
for (int i=0;i<archives.length;i++){
for(int j=0;j<archiveClasspaths.length;j++){
if (archives[i].getPath().equals(
archiveClasspaths[j].toString())){
classPath.append(sep);
classPath.append(localArchives[i]
.toString());
}
}
}
}
}
//file paths
Path[] fileClasspaths = DistributedCache.getFileClassPaths(conf);
if (fileClasspaths!=null && files != null) {
Path[] localFiles = DistributedCache
.getLocalCacheFiles(conf);
if (localFiles != null) {
for (int i = 0; i < files.length; i++) {
for (int j = 0; j < fileClasspaths.length; j++) {
if (files[i].getPath().equals(
fileClasspaths[j].toString())) {
classPath.append(sep);
classPath.append(localFiles[i].toString());
}
}
}
}
}
classPath.append(sep);
classPath.append(workDir);
// Build exec child jmv args.
Vector<String> vargs = new Vector<String>(8);
File jvm = // use same jvm as parent
new File(new File(System.getProperty("java.home"), "bin"), "java");
vargs.add(jvm.toString());
// Add child (task) java-vm options.
//
// The following symbols if present in mapred.child.java.opts value are
// replaced:
// + @taskid@ is interpolated with value of TaskID.
// Other occurrences of @ will not be altered.
//
// Example with multiple arguments and substitutions, showing
// jvm GC logging, and start of a passwordless JVM JMX agent so can
// connect with jconsole and the likes to watch child memory, threads
// and get thread dumps.
//
// <property>
// <name>mapred.child.java.opts</name>
// <value>-verbose:gc -Xloggc:/tmp/@taskid@.gc \
// -Dcom.sun.management.jmxremote.authenticate=false \
// -Dcom.sun.management.jmxremote.ssl=false \
// </value>
// </property>
//
String javaOpts = conf.get("mapred.child.java.opts", "-Xmx200m");
javaOpts = javaOpts.replace("@taskid@", taskid.toString());
String [] javaOptsSplit = javaOpts.split(" ");
// Add java.library.path; necessary for loading native libraries.
//
// 1. To support native-hadoop library i.e. libhadoop.so, we add the
// parent processes' java.library.path to the child.
// 2. We also add the 'cwd' of the task to it's java.library.path to help
// users distribute native libraries via the DistributedCache.
// 3. The user can also specify extra paths to be added to the
// java.library.path via mapred.child.java.opts.
//
String libraryPath = System.getProperty("java.library.path");
if (libraryPath == null) {
libraryPath = workDir.getAbsolutePath();
} else {
libraryPath += sep + workDir;
}
boolean hasUserLDPath = false;
for(int i=0; i<javaOptsSplit.length ;i++) {
if(javaOptsSplit[i].startsWith("-Djava.library.path=")) {
javaOptsSplit[i] += sep + libraryPath;
hasUserLDPath = true;
break;
}
}
if(!hasUserLDPath) {
vargs.add("-Djava.library.path=" + libraryPath);
}
for (int i = 0; i < javaOptsSplit.length; i++) {
vargs.add(javaOptsSplit[i]);
}
// add java.io.tmpdir given by mapred.child.tmp
String tmp = conf.get("mapred.child.tmp", "./tmp");
Path tmpDir = new Path(tmp);
// if temp directory path is not absolute
// prepend it with workDir.
if (!tmpDir.isAbsolute()) {
tmpDir = new Path(workDir.toString(), tmp);
}
FileSystem localFs = FileSystem.getLocal(conf);
if (!localFs.mkdirs(tmpDir) && !localFs.getFileStatus(tmpDir).isDir()) {
throw new IOException("Mkdirs failed to create " + tmpDir.toString());
}
vargs.add("-Djava.io.tmpdir=" + tmpDir.toString());
// Add classpath.
vargs.add("-classpath");
vargs.add(classPath.toString());
// Setup the log4j prop
long logSize = TaskLog.getTaskLogLength(conf);
vargs.add("-Dhadoop.log.dir=" +
new File(System.getProperty("hadoop.log.dir")
).getAbsolutePath());
vargs.add("-Dhadoop.root.logger=INFO,TLA");
vargs.add("-Dhadoop.tasklog.taskid=" + taskid);
vargs.add("-Dhadoop.tasklog.totalLogFileSize=" + logSize);
if (conf.getProfileEnabled()) {
if (conf.getProfileTaskRange(t.isMapTask()
).isIncluded(t.getPartition())) {
File prof = TaskLog.getTaskLogFile(taskid, TaskLog.LogName.PROFILE);
vargs.add(String.format(conf.getProfileParams(), prof.toString()));
}
}
// Add main class and its arguments
vargs.add(Child.class.getName()); // main of Child
// pass umbilical address
InetSocketAddress address = tracker.getTaskTrackerReportAddress();
vargs.add(address.getAddress().getHostAddress());
vargs.add(Integer.toString(address.getPort()));
vargs.add(taskid.toString()); // pass task identifier
String pidFile = lDirAlloc.getLocalPathForWrite(
(TaskTracker.getPidFile(t.getJobID().toString(),
taskid.toString(), t.isTaskCleanupTask())),
this.conf).toString();
t.setPidFile(pidFile);
tracker.addToMemoryManager(t.getTaskID(), t.isMapTask(), conf, pidFile);
// set memory limit using ulimit if feasible and necessary ...
String[] ulimitCmd = Shell.getUlimitMemoryCommand(conf);
List<String> setup = null;
if (ulimitCmd != null) {
setup = new ArrayList<String>();
for (String arg : ulimitCmd) {
setup.add(arg);
}
}
// Set up the redirection of the task's stdout and stderr streams
File stdout = TaskLog.getTaskLogFile(taskid, TaskLog.LogName.STDOUT);
File stderr = TaskLog.getTaskLogFile(taskid, TaskLog.LogName.STDERR);
stdout.getParentFile().mkdirs();
tracker.getTaskTrackerInstrumentation().reportTaskLaunch(taskid, stdout, stderr);
Map<String, String> env = new HashMap<String, String>();
StringBuffer ldLibraryPath = new StringBuffer();
ldLibraryPath.append(workDir.toString());
String oldLdLibraryPath = null;
oldLdLibraryPath = System.getenv("LD_LIBRARY_PATH");
if (oldLdLibraryPath != null) {
ldLibraryPath.append(sep);
ldLibraryPath.append(oldLdLibraryPath);
}
env.put("LD_LIBRARY_PATH", ldLibraryPath.toString());
jvmManager.launchJvm(this,
jvmManager.constructJvmEnv(setup,vargs,stdout,stderr,logSize,
workDir, env, pidFile, conf));
synchronized (lock) {
while (!done) {
lock.wait();
}
}
tracker.getTaskTrackerInstrumentation().reportTaskEnd(t.getTaskID());
if (exitCodeSet) {
if (!killed && exitCode != 0) {
if (exitCode == 65) {
tracker.getTaskTrackerInstrumentation().taskFailedPing(t.getTaskID());
}
throw new IOException("Task process exit with nonzero status of " +
exitCode + ".");
}
}
} catch (FSError e) {
LOG.fatal("FSError", e);
try {
tracker.fsError(t.getTaskID(), e.getMessage());
} catch (IOException ie) {
LOG.fatal(t.getTaskID()+" reporting FSError", ie);
}
} catch (Throwable throwable) {
LOG.warn(t.getTaskID()+" Child Error", throwable);
ByteArrayOutputStream baos = new ByteArrayOutputStream();
throwable.printStackTrace(new PrintStream(baos));
try {
tracker.reportDiagnosticInfo(t.getTaskID(), baos.toString());
} catch (IOException e) {
LOG.warn(t.getTaskID()+" Reporting Diagnostics", e);
}
} finally {
try{
URI[] archives = DistributedCache.getCacheArchives(conf);
URI[] files = DistributedCache.getCacheFiles(conf);
if (archives != null){
for (int i = 0; i < archives.length; i++){
DistributedCache.releaseCache(archives[i], conf);
}
}
if (files != null){
for(int i = 0; i < files.length; i++){
DistributedCache.releaseCache(files[i], conf);
}
}
}catch(IOException ie){
LOG.warn("Error releasing caches : Cache files might not have been cleaned up");
}
tip.reportTaskFinished();
}
}
10 jvmManager.launchJvm
public void launchJvm(TaskRunner t, JvmEnv env) {
if (t.getTask().isMapTask()) {
mapJvmManager.reapJvm(t, env);
} else {
reduceJvmManager.reapJvm(t, env);
}
}
11 mapJvmManager.reapJvm()
private synchronized void reapJvm(
TaskRunner t, JvmEnv env) {
if (t.getTaskInProgress().wasKilled()) {
//the task was killed in-flight
//no need to do the rest of the operations
return;
}
boolean spawnNewJvm = false;
JobID jobId = t.getTask().getJobID();
//Check whether there is a free slot to start a new JVM.
//,or, Kill a (idle) JVM and launch a new one
//When this method is called, we *must*
// (1) spawn a new JVM (if we are below the max)
// (2) find an idle JVM (that belongs to the same job), or,
// (3) kill an idle JVM (from a different job)
// (the order of return is in the order above)
int numJvmsSpawned = jvmIdToRunner.size();
JvmRunner runnerToKill = null;
if (numJvmsSpawned >= maxJvms) {
//go through the list of JVMs for all jobs.
Iterator<Map.Entry<JVMId, JvmRunner>> jvmIter =
jvmIdToRunner.entrySet().iterator();
while (jvmIter.hasNext()) {
JvmRunner jvmRunner = jvmIter.next().getValue();
JobID jId = jvmRunner.jvmId.getJobId();
//look for a free JVM for this job; if one exists then just break
if (jId.equals(jobId) && !jvmRunner.isBusy() && !jvmRunner.ranAll()){
setRunningTaskForJvm(jvmRunner.jvmId, t); //reserve the JVM
LOG.info("No new JVM spawned for jobId/taskid: " +
jobId+"/"+t.getTask().getTaskID() +
". Attempting to reuse: " + jvmRunner.jvmId);
return;
}
//Cases when a JVM is killed:
// (1) the JVM under consideration belongs to the same job
// (passed in the argument). In this case, kill only when
// the JVM ran all the tasks it was scheduled to run (in terms
// of count).
// (2) the JVM under consideration belongs to a different job and is
// currently not busy
//But in both the above cases, we see if we can assign the current
//task to an idle JVM (hence we continue the loop even on a match)
if ((jId.equals(jobId) && jvmRunner.ranAll()) ||
(!jId.equals(jobId) && !jvmRunner.isBusy())) {
runnerToKill = jvmRunner;
spawnNewJvm = true;
}
}
} else {
spawnNewJvm = true;
}
if (spawnNewJvm) {
if (runnerToKill != null) {
LOG.info("Killing JVM: " + runnerToKill.jvmId);
runnerToKill.kill();
}
spawnNewJvm(jobId, env, t);
return;
}
//*MUST* never reach this
throw new RuntimeException("Inconsistent state!!! " +
"JVM Manager reached an unstable state " +
"while reaping a JVM for task: " + t.getTask().getTaskID()+
" " + getDetails());
}
12 spawnNewJvm(jobId, env, t);
private void spawnNewJvm(JobID jobId, JvmEnv env,
TaskRunner t) {
JvmRunner jvmRunner = new JvmRunner(env,jobId);
jvmIdToRunner.put(jvmRunner.jvmId, jvmRunner);
//spawn the JVM in a new thread. Note that there will be very little
//extra overhead of launching the new thread for a new JVM since
//most of the cost is involved in launching the process. Moreover,
//since we are going to be using the JVM for running many tasks,
//the thread launch cost becomes trivial when amortized over all
//tasks. Doing it this way also keeps code simple.
jvmRunner.setDaemon(true);
jvmRunner.setName("JVM Runner " + jvmRunner.jvmId + " spawned.");
setRunningTaskForJvm(jvmRunner.jvmId, t);
LOG.info(jvmRunner.getName());
jvmRunner.start();
}
13 jvmRunner.run()
public void run() {
runChild(env);
}
public void runChild(JvmEnv env) {
try {
env.vargs.add(Integer.toString(jvmId.getId()));
List<String> wrappedCommand =
TaskLog.captureOutAndError(env.setup, env.vargs, env.stdout, env.stderr,
env.logSize, env.pidFile);
shexec = new ShellCommandExecutor(wrappedCommand.toArray(new String[0]),
env.workDir, env.env);
shexec.execute();
} catch (IOException ioe) {
// do nothing
// error and output are appropriately redirected
} finally { // handle the exit code
if (shexec == null) {
return;
}
int exitCode = shexec.getExitCode();
updateOnJvmExit(jvmId, exitCode, killed);
LOG.info("JVM : " + jvmId +" exited. Number of tasks it ran: " +
numTasksRan);
try {
// In case of jvm-reuse,
//the task jvm cleans up the common workdir for every
//task at the beginning of each task in the task JVM.
//For the last task, we do it here.
if (env.conf.getNumTasksToExecutePerJvm() != 1) {
FileUtil.fullyDelete(env.workDir);
}
} catch (IOException ie){}
}
}
14 shexec.execute();
public void execute() throws IOException {
this.run();
}
protected void run() throws IOException {
if (lastTime + interval > System.currentTimeMillis())
return;
exitCode = 0; // reset for next run
runCommand();
}
private void runCommand() throws IOException {
ProcessBuilder builder = new ProcessBuilder(getExecString());
boolean completed = false;
if (environment != null) {
builder.environment().putAll(this.environment);
}
if (dir != null) {
builder.directory(this.dir);
}
process = builder.start();
final BufferedReader errReader =
new BufferedReader(new InputStreamReader(process
.getErrorStream()));
BufferedReader inReader =
new BufferedReader(new InputStreamReader(process
.getInputStream()));
final StringBuffer errMsg = new StringBuffer();
// read error and input streams as this would free up the buffers
// free the error stream buffer
Thread errThread = new Thread() {
@Override
public void run() {
try {
String line = errReader.readLine();
while((line != null) && !isInterrupted()) {
errMsg.append(line);
errMsg.append(System.getProperty("line.separator"));
line = errReader.readLine();
}
} catch(IOException ioe) {
LOG.warn("Error reading the error stream", ioe);
}
}
};
try {
errThread.start();
} catch (IllegalStateException ise) { }
try {
parseExecResult(inReader); // parse the output
// clear the input stream buffer
String line = inReader.readLine();
while(line != null) {
line = inReader.readLine();
}
// wait for the process to finish and check the exit code
exitCode = process.waitFor();
try {
// make sure that the error thread exits
errThread.join();
} catch (InterruptedException ie) {
LOG.warn("Interrupted while reading the error stream", ie);
}
completed = true;
if (exitCode != 0) {
throw new ExitCodeException(exitCode, errMsg.toString());
}
} catch (InterruptedException ie) {
throw new IOException(ie.toString());
} finally {
// close the input stream
try {
inReader.close();
} catch (IOException ioe) {
LOG.warn("Error while closing the input stream", ioe);
}
if (!completed) {
errThread.interrupt();
}
try {
errReader.close();
} catch (IOException ioe) {
LOG.warn("Error while closing the error stream", ioe);
}
process.destroy();
lastTime = System.currentTimeMillis();
}
}