Job Submission
The JobClient asks the JobTracker for a new job ID, creates the job's staging directory on HDFS, computes the input split information, produces the job.jar, job.xml, and job.split files, copies them to the JobTracker's filesystem, and finally submits the job through an RPC call to the JobTracker's submitJob method.
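For orientation, here is a minimal client-side sketch of where this whole sequence starts (0.20.x new API). The class name SubmitDemo and the input/output paths are hypothetical; the default identity Mapper/Reducer are used so the sketch stays self-contained.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class SubmitDemo {                                    // hypothetical driver
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    Job job = new Job(conf, "submit-demo");                  // 0.20.x constructor
    job.setJarByClass(SubmitDemo.class);
    // no Mapper/Reducer set: the identity defaults keep the sketch minimal
    job.setOutputKeyClass(LongWritable.class);
    job.setOutputValueClass(Text.class);
    FileInputFormat.addInputPath(job, new Path(args[0]));    // hypothetical input
    FileOutputFormat.setOutputPath(job, new Path(args[1]));  // must not exist yet
    // the call analyzed throughout the rest of this section:
    System.exit(job.waitForCompletion(true) ? 0 : 1);
  }
}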
The job.waitForCompletion method:
public boolean waitForCompletion(boolean verbose
                                 ) throws IOException, InterruptedException,
                                          ClassNotFoundException {
  if (state == JobState.DEFINE) {
    submit();  // submit the job
  }
  if (verbose) {
    // poll the job's progress every second and report to the console
    // whenever it has changed since the last report
    jobClient.monitorAndPrintJob(conf, info);
  } else {
    info.waitForCompletion();
  }
  return isSuccessful();
}
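The polling in the verbose branch amounts to something like the following sketch (a hedged paraphrase, not the actual monitorAndPrintJob source, which additionally prints task-completion events and, at the end, the job counters):

import java.io.IOException;
import org.apache.hadoop.mapred.RunningJob;

// simplified progress-polling loop behind monitorAndPrintJob
static void monitorSketch(RunningJob rj) throws IOException, InterruptedException {
  String lastReport = null;
  while (!rj.isComplete()) {
    Thread.sleep(1000);  // poll roughly once per second
    String report = "map " + Math.round(rj.mapProgress() * 100) + "% reduce "
        + Math.round(rj.reduceProgress() * 100) + "%";
    if (!report.equals(lastReport)) {  // report only when progress changed
      System.out.println(report);
      lastReport = report;
    }
  }
}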
The submit method:
public void submit() throws IOException, InterruptedException,
                            ClassNotFoundException {
  ensureState(JobState.DEFINE);  // a job can only be submitted once
  setUseNewAPI();                // choose between the old and new MapReduce APIs
  info = jobClient.submitJobInternal(conf);
  state = JobState.RUNNING;
}
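setUseNewAPI is what later makes submitJobInternal take the getUseNewMapper()/getUseNewReducer() branches. Roughly, it does the following (a hedged paraphrase of the 0.20 source; the property names are from memory):

// if no old-API mapper/reducer class is configured, mark the job as
// using the new API, so getUseNewMapper()/getUseNewReducer() return true
conf.setBooleanIfUnset("mapred.mapper.new-api",
                       conf.get("mapred.mapper.class") == null);
conf.setBooleanIfUnset("mapred.reducer.new-api",
                       conf.get("mapred.reducer.class") == null);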
The core of the jobClient.submitJobInternal method:
public
RunningJob submitJobInternal(JobConf job
                             ) throws FileNotFoundException,
                                      ClassNotFoundException,
                                      InterruptedException,
                                      IOException {
  /*
   * configure the command line options correctly on the submitting dfs
   */
  JobID jobId = jobSubmitClient.getNewJobId();
  Path submitJobDir = new Path(getSystemDir(), jobId.toString());
  Path submitJarFile = new Path(submitJobDir, "job.jar");
  Path submitSplitFile = new Path(submitJobDir, "job.split");
  // copy the job jar to the JobTracker's filesystem
  configureCommandLineOptions(job, submitJobDir, submitJarFile);
  Path submitJobFile = new Path(submitJobDir, "job.xml");
  int reduces = job.getNumReduceTasks();
  JobContext context = new JobContext(job, jobId);

  // Check the output specification
  if (reduces == 0 ? job.getUseNewMapper() : job.getUseNewReducer()) {
    org.apache.hadoop.mapreduce.OutputFormat<?,?> output =
      ReflectionUtils.newInstance(context.getOutputFormatClass(), job);
    // e.g. FileOutputFormat.checkOutputSpecs throws if the output directory
    // is unset or already exists (sketched after this listing)
    output.checkOutputSpecs(context);
  } else {
    job.getOutputFormat().checkOutputSpecs(fs, job);
  }

  // Create the splits for the job
  LOG.debug("Creating splits at " + fs.makeQualified(submitSplitFile));
  int maps;
  if (job.getUseNewMapper()) {
    // new MapReduce API: compute the split information and write it to
    // the JobTracker's filesystem
    maps = writeNewSplits(context, submitSplitFile);
  } else {
    // old MapReduce API
    maps = writeOldSplits(job, submitSplitFile);
  }
  job.set("mapred.job.split.file", submitSplitFile.toString());
  job.setNumMapTasks(maps);

  // Write job file to JobTracker's fs
  FSDataOutputStream out =
    FileSystem.create(fs, submitJobFile,
                      new FsPermission(JOB_FILE_PERMISSION));
  try {
    job.writeXml(out);  // copy job.xml to the JobTracker's filesystem
  } finally {
    out.close();
  }

  //
  // Now, actually submit the job (using the submit name)
  //
  // the actual submission: an RPC call to the JobTracker's submitJob method
  JobStatus status = jobSubmitClient.submitJob(jobId);
  if (status != null) {
    return new NetworkedJob(status);
  } else {
    throw new IOException("Could not launch job");
  }
}
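The output check referenced above behaves roughly like this (a hedged paraphrase of FileOutputFormat.checkOutputSpecs in the new API, not the exact source):

import java.io.IOException;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.FileAlreadyExistsException;
import org.apache.hadoop.mapred.InvalidJobConfException;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

// reject a job whose output directory is unset or already present
static void checkOutputSpecsSketch(JobContext context) throws IOException {
  Path outDir = FileOutputFormat.getOutputPath(context);
  if (outDir == null) {
    throw new InvalidJobConfException("Output directory not set.");
  }
  if (outDir.getFileSystem(context.getConfiguration()).exists(outDir)) {
    throw new FileAlreadyExistsException(
        "Output directory " + outDir + " already exists");
  }
}

Failing fast here is deliberate: it is cheap to validate the output specification on the client before any jar, split, or xml file is shipped to the JobTracker.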
The writeNewSplits method:
private <T extends org.apache.hadoop.mapreduce.InputSplit>
int writeNewSplits(JobContext job, Path submitSplitFile
                   ) throws IOException, InterruptedException,
                            ClassNotFoundException {
  JobConf conf = job.getJobConf();
  org.apache.hadoop.mapreduce.InputFormat<?,?> input =
    ReflectionUtils.newInstance(job.getInputFormatClass(), job.getJobConf());
  // ask the configured InputFormat to compute the input splits
  List<org.apache.hadoop.mapreduce.InputSplit> splits = input.getSplits(job);
  // the split list as an array
  T[] array = (T[])
    splits.toArray(new org.apache.hadoop.mapreduce.InputSplit[splits.size()]);

  // sort the splits into order based on size, so that the biggest
  // go first
  Arrays.sort(array, new NewSplitComparator());
  // create the split file, write its header, and return the output stream
  DataOutputStream out = writeSplitsFileHeader(conf, submitSplitFile,
                                               array.length);
  try {
    if (array.length != 0) {
      DataOutputBuffer buffer = new DataOutputBuffer();
      RawSplit rawSplit = new RawSplit();
      SerializationFactory factory = new SerializationFactory(conf);
      Serializer<T> serializer =
        factory.getSerializer((Class<T>) array[0].getClass());
      serializer.open(buffer);
      for (T split : array) {
        rawSplit.setClassName(split.getClass().getName());
        buffer.reset();
        // serialize the split into the buffer; for Writable splits this ends
        // up in org.apache.hadoop.io.serializer.WritableSerialization's
        // serialize method
        serializer.serialize(split);
        rawSplit.setDataLength(split.getLength());
        // copy the buffered bytes into the RawSplit
        rawSplit.setBytes(buffer.getData(), 0, buffer.getLength());
        rawSplit.setLocations(split.getLocations());
        // append the RawSplit to job.split on the JobTracker's filesystem
        rawSplit.write(out);
      }
      serializer.close();
    }
  } finally {
    out.close();
  }
  return array.length;
}
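For the common file-based case, input.getSplits(job) above lands in org.apache.hadoop.mapreduce.lib.input.FileInputFormat, whose split sizing boils down to this formula:

// split size as computed by FileInputFormat.computeSplitSize
static long computeSplitSize(long blockSize, long minSize, long maxSize) {
  return Math.max(minSize, Math.min(maxSize, blockSize));
}

With the default minimum (1) and maximum (Long.MAX_VALUE) split sizes, each split is one HDFS block; for example, a 200 MB file on 64 MB blocks yields 4 splits and hence 4 map tasks. Each split also records its block locations, which is what rawSplit.setLocations preserves so the JobTracker can later schedule map tasks close to the data.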