MapReduce作业提交查看源码
1、判断job的状态
//判断private Job.JobState state的值是否为DEFINE,如果是则执行submit()方法;
/**
 * Submits the job when it is still in the DEFINE state, then waits for it:
 * either through monitorAndPrintJob() or by polling isComplete().
 *
 * @param verbose when true, monitorAndPrintJob() is used; otherwise
 *                isComplete() is polled with a sleep between checks
 * @return the value of isSuccessful() after waiting
 */
public boolean waitForCompletion(boolean verbose) throws IOException, InterruptedException, ClassNotFoundException {
    if (this.state == Job.JobState.DEFINE) {
        this.submit();
    }
    if (verbose) {
        this.monitorAndPrintJob();
        return this.isSuccessful();
    }
    // Sleep interval between completion checks, read from the cluster configuration.
    final long pollIntervalMillis = getCompletionPollInterval(this.cluster.getConf());
    while (!this.isComplete()) {
        try {
            Thread.sleep(pollIntervalMillis);
        } catch (InterruptedException ignored) {
            // Interruptions are ignored here; the loop simply re-checks isComplete().
        }
    }
    return this.isSuccessful();
}
2、确认job的状态是否是DEFINE
//执行this.ensureState(Job.JobState.DEFINE);代码判断job的状态;
/**
 * Submits this job: verifies the state is DEFINE, runs setUseNewAPI() and
 * connect(), hands the job to a JobSubmitter under ugi.doAs(...), then marks
 * the job RUNNING and logs its tracking URL.
 */
public void submit() throws IOException, InterruptedException, ClassNotFoundException {
    this.ensureState(Job.JobState.DEFINE);
    this.setUseNewAPI();
    this.connect();
    final JobSubmitter jobSubmitter = this.getJobSubmitter(this.cluster.getFileSystem(), this.cluster.getClient());
    // The actual submission is wrapped in an action so it runs through ugi.doAs(...).
    PrivilegedExceptionAction<JobStatus> submitAction = new PrivilegedExceptionAction<JobStatus>() {
        public JobStatus run() throws IOException, InterruptedException, ClassNotFoundException {
            return jobSubmitter.submitJobInternal(Job.this, Job.this.cluster);
        }
    };
    this.status = (JobStatus) this.ugi.doAs(submitAction);
    this.state = Job.JobState.RUNNING;
    LOG.info("The url to track the job: " + this.getTrackingURL());
}
3、执行ensureState方法确认job的状态是DEFINE
/**
 * Verifies that the job is currently in the expected state.
 *
 * @param state the state the job is required to be in
 * @throws IllegalStateException if the current state differs from {@code state},
 *         or if {@code state} is RUNNING while no cluster is attached
 */
private void ensureState(Job.JobState state) throws IllegalStateException {
    if (this.state != state) {
        throw new IllegalStateException("Job in state " + this.state + " instead of " + state);
    }
    if (state == Job.JobState.RUNNING && this.cluster == null) {
        throw new IllegalStateException("Job in state " + this.state + ", but it isn't attached to any job tracker!");
    }
}
4、执行setUseNewAPI方法切换新的API
/**
 * Defaults the "new-api" flags for the mapper (and, when there are reduce tasks,
 * the reducer) based on whether the old-style class keys are set, then checks via
 * ensureNotSet(...) that keys belonging to the other API are not configured.
 *
 * @throws IOException declared by this method; presumably raised by ensureNotSet
 *         (its body is not shown here)
 */
private void setUseNewAPI() throws IOException {
    final boolean hasReducers = this.conf.getNumReduceTasks() != 0;
    final String legacyMapperKey = "mapred.mapper.class";
    final String legacyReducerKey = "mapred.reducer.class";

    // Map side: new API is the default unless an old-style mapper class is configured.
    this.conf.setBooleanIfUnset("mapred.mapper.new-api", this.conf.get(legacyMapperKey) == null);
    if (this.conf.getUseNewMapper()) {
        final String mode = "new map API";
        this.ensureNotSet("mapred.input.format.class", mode);
        this.ensureNotSet(legacyMapperKey, mode);
        this.ensureNotSet(hasReducers ? "mapred.partitioner.class" : "mapred.output.format.class", mode);
    } else {
        final String mode = "map compatibility";
        this.ensureNotSet("mapreduce.job.inputformat.class", mode);
        this.ensureNotSet("mapreduce.job.map.class", mode);
        this.ensureNotSet(hasReducers ? "mapreduce.job.partitioner.class" : "mapreduce.job.outputformat.class", mode);
    }

    // Reduce side is only examined when the job has reduce tasks.
    if (!hasReducers) {
        return;
    }
    this.conf.setBooleanIfUnset("mapred.reducer.new-api", this.conf.get(legacyReducerKey) == null);
    if (this.conf.getUseNewReducer()) {
        final String mode = "new reduce API";
        this.ensureNotSet("mapred.output.format.class", mode);
        this.ensureNotSet(legacyReducerKey, mode);
    } else {
        final String mode = "reduce compatibility";
        this.ensureNotSet("mapreduce.job.outputformat.class", mode);
        this.ensureNotSet("mapreduce.job.reduce.class", mode);
    }
}
//执行getNumReduceTasks方法获取ReduceTask的个数
/**
 * Reads the configured number of reduce tasks.
 *
 * @return the value of "mapreduce.job.reduces", with 1 passed as the default
 */
public int getNumReduceTasks() {
    final int reduceTasks = this.getInt("mapreduce.job.reduces", 1);
    return reduceTasks;
}
5、执行connect方法
/**
 * Creates the Cluster for this job on first use; does nothing when a cluster
 * is already attached. The Cluster is constructed through ugi.doAs(...).
 */
private synchronized void connect() throws IOException, InterruptedException, ClassNotFoundException {
    if (this.cluster != null) {
        return;
    }
    PrivilegedExceptionAction<Cluster> createCluster = new PrivilegedExceptionAction<Cluster>() {
        public Cluster run() throws IOException, InterruptedException, ClassNotFoundException {
            return new Cluster(Job.this.getConfiguration());
        }
    };
    this.cluster = (Cluster) this.ugi.doAs(createCluster);
}
/**
 * Builds a Cluster handle for the given address and configuration.
 *
 * @param jobTrackAddr address forwarded to initialize(...); may be null, in which
 *                     case initialize(...) uses the address-less provider call
 * @param conf         configuration stored on this instance and forwarded to initialize(...)
 * @throws IOException declared by this constructor
 */
public Cluster(InetSocketAddress jobTrackAddr, Configuration conf) throws IOException {
    // Keep the caller's configuration.
    this.conf = conf;

    // These fields start out unset.
    this.providerList = null;
    this.jobHistoryDir = null;
    this.stagingAreaDir = null;
    this.sysDir = null;
    this.fs = null;

    // Capture the current user (the one submitting the job) via UserGroupInformation.
    this.ugi = UserGroupInformation.getCurrentUser();
    this.initialize(jobTrackAddr, conf);
}
Cluster构造器里面conf的值:
/**
 * Walks the provider list and adopts the first ClientProtocolProvider that
 * yields a non-null ClientProtocol.
 *
 * @param jobTrackAddr when null, provider.create(conf) is used; otherwise
 *                     provider.create(jobTrackAddr, conf)
 * @param conf         configuration handed to each provider
 * @throws IOException if no provider produced a usable client
 */
private void initialize(InetSocketAddress jobTrackAddr, Configuration conf) throws IOException {
    this.initProviderList();
    for (Object candidate : this.providerList) {
        ClientProtocolProvider provider = (ClientProtocolProvider) candidate;
        LOG.debug("Trying ClientProtocolProvider : " + provider.getClass().getName());
        try {
            ClientProtocol clientProtocol = (jobTrackAddr == null)
                    ? provider.create(conf)
                    : provider.create(jobTrackAddr, conf);
            if (clientProtocol == null) {
                LOG.debug("Cannot pick " + provider.getClass().getName() + " as the ClientProtocolProvider - returned null protocol");
                continue;
            }
            this.clientProtocolProvider = provider;
            this.client = clientProtocol;
            LOG.debug("Picked " + provider.getClass().getName() + " as the ClientProtocolProvider");
            break;
        } catch (Exception var7) {
            // A failing provider is logged and the next one is tried.
            LOG.info("Failed to use " + provider.getClass().getName() + " due to error: ", var7);
        }
    }
    if (this.clientProtocolProvider == null || this.client == null) {
        throw new IOException("Cannot initialize Cluster. Please check your configuration for mapreduce.framework.name and the correspond server addresses.");
    }
}
通过ClientProtocolProvider能看到有两个子类一个是本地的一个是集群的:
6、创建job的提交路径
/**
 * Returns the FileSystem for the client's system directory, resolving and
 * caching it in {@code fs} on the first call.
 *
 * @return the cached FileSystem
 * @throws RuntimeException wrapping an InterruptedException raised by ugi.doAs(...)
 */
public synchronized FileSystem getFileSystem() throws IOException, InterruptedException {
    if (this.fs != null) {
        return this.fs;
    }
    PrivilegedExceptionAction<FileSystem> resolveFs = new PrivilegedExceptionAction<FileSystem>() {
        public FileSystem run() throws IOException, InterruptedException {
            Path systemDirPath = new Path(Cluster.this.client.getSystemDir());
            return systemDirPath.getFileSystem(Cluster.this.getConf());
        }
    };
    try {
        this.fs = (FileSystem) this.ugi.doAs(resolveFs);
    } catch (InterruptedException var2) {
        throw new RuntimeException(var2);
    }
    return this.fs;
}
/**
 * Returns the system directory as a string, qualified against {@code fs}.
 * The directory is read from "mapreduce.jobtracker.system.dir", with
 * "/tmp/hadoop/mapred/system" passed as the default.
 */
public String getSystemDir() {
    String configuredDir = this.conf.get("mapreduce.jobtracker.system.dir", "/tmp/hadoop/mapred/system");
    Path qualifiedDir = this.fs.makeQualified(new Path(configuredDir));
    return qualifiedDir.toString();
}
通过getSystemDir方法拿到系统的提交job的路径
//将MR框架添加到分布式缓存,生成jobID、写出切片和配置文件后提交作业,最后将作业状态JobStatus返回出去;
/**
 * Internal submission routine: validates the job via checkSpecs, prepares the
 * per-job submit directory and configuration, writes the input splits and the
 * job configuration file, and finally calls submitClient.submitJob(...).
 *
 * @param job     the job being submitted; its configuration and credentials are updated
 * @param cluster passed to JobSubmissionFiles.getStagingDir to locate the staging area
 * @return the JobStatus returned by submitClient.submitJob(...)
 * @throws IOException if the shuffle key cannot be generated or if submitJob returns null
 */
JobStatus submitJobInternal(Job job, Cluster cluster) throws ClassNotFoundException, InterruptedException, IOException {
this.checkSpecs(job);
Configuration conf = job.getConfiguration();
// Add the MR framework to the distributed cache (going by the helper's name; its body is not shown here)
addMRFrameworkToDistributedCache(conf);
Path jobStagingArea = JobSubmissionFiles.getStagingDir(cluster, conf);
// Resolve the local host and record its name/address in the configuration
InetAddress ip = InetAddress.getLocalHost();
if (ip != null) {
this.submitHostAddress = ip.getHostAddress();
this.submitHostName = ip.getHostName();
conf.set("mapreduce.job.submithostname", this.submitHostName);
conf.set("mapreduce.job.submithostaddress", this.submitHostAddress);
}
// Ask the submit client for a new job ID
JobID jobId = this.submitClient.getNewJobID();
// Record the new job ID on the job
job.setJobID(jobId);
// Per-job submit directory: <staging area>/<jobId>
Path submitJobDir = new Path(jobStagingArea, jobId.toString());
JobStatus status = null;
JobStatus var24;
try {
// Record submission-related settings (user name, filter initializer, job dir) in the configuration
conf.set("mapreduce.job.user.name", UserGroupInformation.getCurrentUser().getShortUserName());
conf.set("hadoop.http.filter.initializers", "org.apache.hadoop.yarn.server.webproxy.amfilter.AmFilterInitializer");
conf.set("mapreduce.job.dir", submitJobDir.toString());
LOG.debug("Configuring job " + jobId + " with " + submitJobDir + " as the submit dir");
TokenCache.obtainTokensForNamenodes(job.getCredentials(), new Path[]{submitJobDir}, conf);
this.populateTokenCache(conf, job.getCredentials());
// Generate a shuffle secret key only when the credentials do not already carry one
if (TokenCache.getShuffleSecretKey(job.getCredentials()) == null) {
KeyGenerator keyGen;
try {
keyGen = KeyGenerator.getInstance("HmacSHA1");
keyGen.init(64);
} catch (NoSuchAlgorithmException var19) {
throw new IOException("Error generating shuffle secret key", var19);
}
SecretKey shuffleKey = keyGen.generateKey();
TokenCache.setShuffleSecretKey(shuffleKey.getEncoded(), job.getCredentials());
}
if (CryptoUtils.isEncryptedSpillEnabled(conf)) {
conf.setInt("mapreduce.am.max-attempts", 1);
LOG.warn("Max job attempts set to 1 since encrypted intermediatedata spill is enabled");
}
this.copyAndConfigureFiles(job, submitJobDir);
Path submitJobFile = JobSubmissionFiles.getJobConfPath(submitJobDir);
LOG.debug("Creating splits at " + this.jtFs.makeQualified(submitJobDir));
// The count returned by writeSplits is stored as the number of map tasks (mapreduce.job.maps)
int maps = this.writeSplits(job, submitJobDir);
conf.setInt("mapreduce.job.maps", maps);
// In the author's walkthrough run, maps was 1
LOG.info("number of splits:" + maps);
String queue = conf.get("mapreduce.job.queuename", "default");
AccessControlList acl = this.submitClient.getQueueAdmins(queue);
conf.set(QueueManager.toFullPropertyName(queue, QueueACL.ADMINISTER_JOBS.getAclName()), acl.getAclString());
TokenCache.cleanUpTokenReferral(conf);
// Optionally publish the tracking IDs of all tokens held in the job's credentials
if (conf.getBoolean("mapreduce.job.token.tracking.ids.enabled", false)) {
ArrayList<String> trackingIds = new ArrayList();
Iterator i$ = job.getCredentials().getAllTokens().iterator();
while(i$.hasNext()) {
Token<? extends TokenIdentifier> t = (Token)i$.next();
trackingIds.add(t.decodeIdentifier().getTrackingId());
}
conf.setStrings("mapreduce.job.token.tracking.ids", (String[])trackingIds.toArray(new String[trackingIds.size()]));
}
ReservationId reservationId = job.getReservationId();
if (reservationId != null) {
conf.set("mapreduce.job.reservation.id", reservationId.toString());
}
// Write the configuration to the job conf path (job.xml, per the author's note)
this.writeConf(conf, submitJobFile);
this.printTokens(jobId, job.getCredentials());
status = this.submitClient.submitJob(jobId, submitJobDir.toString(), job.getCredentials());
if (status == null) {
throw new IOException("Could not launch job");
}
var24 = status;
} finally {
// status is still null if anything above failed: remove the submit directory in that case
if (status == null) {
LOG.info("Cleaning up the staging area " + submitJobDir);
if (this.jtFs != null && submitJobDir != null) {
this.jtFs.delete(submitJobDir, true);
}
}
}
return var24;
}
/**
 * Computes the input splits for the job with its configured InputFormat, sorts
 * them with JobSubmitter.SplitComparator, writes the split files under
 * {@code jobSubmitDir}, and returns how many splits there are.
 *
 * @param job          job context supplying the configuration and InputFormat class
 * @param jobSubmitDir directory the split files are written to
 * @return the number of input splits
 */
private <T extends InputSplit> int writeNewSplits(JobContext job, Path jobSubmitDir) throws IOException, InterruptedException, ClassNotFoundException {
    Configuration conf = job.getConfiguration();
    InputFormat<?, ?> input = (InputFormat<?, ?>) ReflectionUtils.newInstance(job.getInputFormatClass(), conf);
    List<InputSplit> splits = input.getSplits(job);
    // The decompiled form cast to InputSplit[], which is not assignable to T[] and
    // does not compile. Casting to T[] is safe here: the array really is an
    // InputSplit[] and T is bounded by InputSplit.
    @SuppressWarnings("unchecked")
    T[] array = (T[]) splits.toArray(new InputSplit[splits.size()]);
    Arrays.sort(array, new JobSubmitter.SplitComparator());
    JobSplitWriter.createSplitFiles(jobSubmitDir, conf, jobSubmitDir.getFileSystem(conf), array);
    return array.length;
}
//待处理的文件是一个小于128M的文件所以切片数是1:
生成的切片文件信息提交到相关路径
生成的job.xml的信息:
7、更新job的状态为RUNNING
/**
 * Submits this job: verifies the state is DEFINE, runs setUseNewAPI() and
 * connect(), hands the job to a JobSubmitter under ugi.doAs(...), then switches
 * the state to RUNNING and logs the tracking URL.
 */
public void submit() throws IOException, InterruptedException, ClassNotFoundException {
    this.ensureState(Job.JobState.DEFINE);
    this.setUseNewAPI();
    this.connect();
    final JobSubmitter jobSubmitter = this.getJobSubmitter(this.cluster.getFileSystem(), this.cluster.getClient());
    PrivilegedExceptionAction<JobStatus> submitAction = new PrivilegedExceptionAction<JobStatus>() {
        public JobStatus run() throws IOException, InterruptedException, ClassNotFoundException {
            return jobSubmitter.submitJobInternal(Job.this, Job.this.cluster);
        }
    };
    this.status = (JobStatus) this.ugi.doAs(submitAction);
    // Submission returned: the job state moves from DEFINE to RUNNING.
    this.state = Job.JobState.RUNNING;
    LOG.info("The url to track the job: " + this.getTrackingURL());
}
将state从DEFINE更新为RUNNING;
/**
 * Returns the tracking URL held in the {@code trackingUrl} field.
 */
public synchronized String getTrackingUrl() {
    final String url = this.trackingUrl;
    return url;
}
获取到URL:
8、执行mapreduce任务
/**
 * Submits the job when it is still in the DEFINE state, then waits for it:
 * either through monitorAndPrintJob() or by polling isComplete().
 *
 * @param verbose when true, monitorAndPrintJob() is used; otherwise
 *                isComplete() is polled with a sleep between checks
 * @return the value of isSuccessful() after waiting
 */
public boolean waitForCompletion(boolean verbose) throws IOException, InterruptedException, ClassNotFoundException {
    // At this point in the walkthrough the state is already RUNNING, so this block is skipped.
    if (this.state == Job.JobState.DEFINE) {
        this.submit();
    }
    // verbose is true in the walkthrough, so monitorAndPrintJob() is the branch taken.
    if (verbose) {
        this.monitorAndPrintJob();
        return this.isSuccessful();
    }
    final long pollIntervalMillis = getCompletionPollInterval(this.cluster.getConf());
    while (!this.isComplete()) {
        try {
            Thread.sleep(pollIntervalMillis);
        } catch (InterruptedException ignored) {
            // Interruptions are ignored here; the loop simply re-checks isComplete().
        }
    }
    return this.isSuccessful();
}