// Starting point of this walkthrough: the client submits the job via waitForCompletion.
// Passing true selects the verbose branch of waitForCompletion.
job.waitForCompletion(true);
waitForCompletion分析:
/**
 * Submits the job to the cluster (if it has not been submitted yet) and
 * blocks until it completes.
 *
 * <p>In the non-verbose branch the calling thread polls {@code isComplete()}.
 * If the thread is interrupted while sleeping between polls, the wait
 * continues as before, but the interrupt status is restored before this
 * method returns so the caller can still observe it.
 *
 * @param verbose whether to print progress information while waiting
 * @return {@code true} if the job succeeded
 * @throws IOException thrown if the communication with the
 *         <code>JobTracker</code> is lost
 * @throws InterruptedException declared by {@code submit()}
 * @throws ClassNotFoundException declared by {@code submit()}
 */
public boolean waitForCompletion(boolean verbose
                                 ) throws IOException, InterruptedException,
                                          ClassNotFoundException {
  if (state == JobState.DEFINE) {
    // The job has only been defined so far; submit it first.
    submit();
  }
  if (verbose) {
    // Prints progress information while the job runs (details omitted in these notes).
    monitorAndPrintJob();
    return isSuccessful();
  }

  // Interval between two completion checks.
  int completionPollIntervalMillis =
      Job.getCompletionPollInterval(cluster.getConf());
  boolean interrupted = false;
  try {
    while (!isComplete()) {
      try {
        // Not finished yet: sleep for the configured interval before polling again.
        Thread.sleep(completionPollIntervalMillis);
      } catch (InterruptedException ie) {
        // Keep waiting (original best-effort behavior), but remember the
        // interrupt instead of discarding it.
        interrupted = true;
      }
    }
    return isSuccessful();
  } finally {
    if (interrupted) {
      // Re-assert the interrupt only after the result has been read, so the
      // calls above are not disturbed by a pending interrupt.
      Thread.currentThread().interrupt();
    }
  }
}
submit分析:
/**
 * Submits the job to the cluster and returns immediately, without waiting
 * for the job to finish.
 *
 * @throws IOException declared by the submission steps below
 * @throws InterruptedException declared by the submission steps below
 * @throws ClassNotFoundException declared by the submission steps below
 */
public void submit()
       throws IOException, InterruptedException, ClassNotFoundException {
  // Check that the job is still in the DEFINE state before submitting.
  ensureState(JobState.DEFINE);
  // Configure which MapReduce API is in use (see setUseNewAPI).
  setUseNewAPI();
  // Make sure the cluster handle exists (see connect()).
  connect();

  // The submitter object that carries out the submission is built from the cluster.
  final JobSubmitter jobSubmitter =
      getJobSubmitter(cluster.getFileSystem(), cluster.getClient());

  final PrivilegedExceptionAction<JobStatus> submitAction =
      new PrivilegedExceptionAction<JobStatus>() {
        @Override
        public JobStatus run() throws IOException, InterruptedException,
                                      ClassNotFoundException {
          // Internal submission routine; it returns the job's status.
          return jobSubmitter.submitJobInternal(Job.this, cluster);
        }
      };

  // Run the submission through ugi.doAs and keep the returned status.
  status = ugi.doAs(submitAction);
  state = JobState.RUNNING;
  LOG.info("The url to track the job: " + getTrackingURL());
}
connect分析:
/**
 * Creates the {@code cluster} object if it has not been created yet.
 *
 * @throws IOException declared by {@code ugi.doAs} / the action below
 * @throws InterruptedException declared by {@code ugi.doAs} / the action below
 * @throws ClassNotFoundException declared by the action below
 */
private synchronized void connect()
        throws IOException, InterruptedException, ClassNotFoundException {
  if (cluster != null) {
    // Already connected; nothing to do.
    return;
  }
  // PrivilegedExceptionAction is an interface with a single run() method;
  // whatever run() returns is handed back by doAs and stored in cluster.
  cluster = ugi.doAs(new PrivilegedExceptionAction<Cluster>() {
    @Override
    public Cluster run()
           throws IOException, InterruptedException,
                  ClassNotFoundException {
      return new Cluster(getConfiguration());
    }
  });
}
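doAs分析(注意:此处列出的是接收PrivilegedAction的重载;上文submit/connect传入的是PrivilegedExceptionAction,实际调用的是与之对应的另一个重载,执行流程类似):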
/**
 * Runs the given action via {@code Subject.doAs} using this object's
 * {@code subject}, after logging it with {@code logPrivilegedAction}.
 *
 * @param <T> the type of the value returned by the action
 * @param action the action to execute
 * @return the value returned by the action's {@code run} method
 */
@InterfaceAudience.Public
@InterfaceStability.Evolving
public <T> T doAs(PrivilegedAction<T> action) {
  logPrivilegedAction(subject, action);
  // The inner workings of Subject.doAs are not traced further in these notes.
  final T result = Subject.doAs(subject, action);
  return result;
}
submitJobInternal分析:
/**
 * Internal method that performs the client-side steps of submitting a job:
 * it prepares the configuration and the per-job submit directory, writes the
 * splits and the job configuration file, and finally hands the job to
 * {@code submitClient}.
 *
 * <p>If no status was obtained (including when an exception is thrown), the
 * per-job submit directory is deleted again in the {@code finally} block.
 *
 * @param job the job being submitted
 * @param cluster the cluster object
 * @return the status returned by {@code submitClient.submitJob}; never {@code null}
 * @throws ClassNotFoundException declared by the steps below
 * @throws InterruptedException declared by the steps below
 * @throws IOException if the shuffle secret key cannot be generated, if the
 *         submit call returns no status ("Could not launch job"), or as
 *         declared by the steps below
 */
JobStatus submitJobInternal(Job job, Cluster cluster)
    throws ClassNotFoundException, InterruptedException, IOException {

  // Validate the job specification before doing any work (see checkSpecs);
  // per the original notes this includes the output check, e.g. failing when
  // the output path already exists.
  checkSpecs(job);

  Configuration conf = job.getConfiguration();
  // Adds the MapReduce framework to the distributed cache
  // (the original notes mark this step as needing further analysis).
  addMRFrameworkToDistributedCache(conf);

  // Staging directory under which the client-side job data will be placed.
  Path jobStagingArea = JobSubmissionFiles.getStagingDir(cluster, conf);
  //configure the command line options correctly on the submitting dfs
  InetAddress ip = InetAddress.getLocalHost();
  if (ip != null) {
    submitHostAddress = ip.getHostAddress();
    submitHostName = ip.getHostName();
    conf.set(MRJobConfig.JOB_SUBMITHOST,submitHostName);
    conf.set(MRJobConfig.JOB_SUBMITHOSTADDR,submitHostAddress);
  }

  // Ask submitClient for a new job id and record it on the job.
  JobID jobId = submitClient.getNewJobID();
  job.setJobID(jobId);
  // Staging directory + job id gives the per-job directory that holds the client data.
  Path submitJobDir = new Path(jobStagingArea, jobId.toString());
  JobStatus status = null;
  try {
    // Record user, HTTP filter initializer and job directory in the configuration.
    conf.set(MRJobConfig.USER_NAME,
        UserGroupInformation.getCurrentUser().getShortUserName());
    conf.set("hadoop.http.filter.initializers",
        "org.apache.hadoop.yarn.server.webproxy.amfilter.AmFilterInitializer");
    conf.set(MRJobConfig.MAPREDUCE_JOB_DIR, submitJobDir.toString());
    LOG.debug("Configuring job " + jobId + " with " + submitJobDir
        + " as the submit dir");

    // Obtain the tokens for the submit directory (token mechanism not covered in these notes).
    TokenCache.obtainTokensForNamenodes(job.getCredentials(),
        new Path[] { submitJobDir }, conf);
    populateTokenCache(conf, job.getCredentials());

    // generate a secret to authenticate shuffle transfers
    if (TokenCache.getShuffleSecretKey(job.getCredentials()) == null) {
      KeyGenerator keyGen;
      try {
        keyGen = KeyGenerator.getInstance(SHUFFLE_KEYGEN_ALGORITHM);
        keyGen.init(SHUFFLE_KEY_LENGTH);
      } catch (NoSuchAlgorithmException e) {
        throw new IOException("Error generating shuffle secret key", e);
      }
      SecretKey shuffleKey = keyGen.generateKey();
      TokenCache.setShuffleSecretKey(shuffleKey.getEncoded(),
          job.getCredentials());
    }
    if (CryptoUtils.isEncryptedSpillEnabled(conf)) {
      conf.setInt(MRJobConfig.MR_AM_MAX_ATTEMPTS, 1);
      // Fixed: the two literals were previously joined without a space
      // ("intermediatedata").
      LOG.warn("Max job attempts set to 1 since encrypted intermediate " +
          "data spill is enabled");
    }

    // Copy and configure the job's files under the submit directory.
    copyAndConfigureFiles(job, submitJobDir);

    // Path of the job configuration file inside the submit directory.
    Path submitJobFile = JobSubmissionFiles.getJobConfPath(submitJobDir);

    // Write the input splits; the returned count is stored as the number of maps.
    LOG.debug("Creating splits at " + jtFs.makeQualified(submitJobDir));
    int maps = writeSplits(job, submitJobDir);
    conf.setInt(MRJobConfig.NUM_MAPS, maps);
    LOG.info("number of splits:" + maps);

    // write "queue admins of the queue to which job is being submitted"
    // to job file.
    String queue = conf.get(MRJobConfig.QUEUE_NAME,
        JobConf.DEFAULT_QUEUE_NAME);
    AccessControlList acl = submitClient.getQueueAdmins(queue);
    conf.set(toFullPropertyName(queue,
        QueueACL.ADMINISTER_JOBS.getAclName()), acl.getAclString());

    // removing jobtoken referrals before copying the jobconf to HDFS
    // as the tasks don't need this setting, actually they may break
    // because of it if present as the referral will point to a
    // different job.
    TokenCache.cleanUpTokenReferral(conf);

    if (conf.getBoolean(
        MRJobConfig.JOB_TOKEN_TRACKING_IDS_ENABLED,
        MRJobConfig.DEFAULT_JOB_TOKEN_TRACKING_IDS_ENABLED)) {
      // Add HDFS tracking ids
      ArrayList<String> trackingIds = new ArrayList<String>();
      for (Token<? extends TokenIdentifier> t :
          job.getCredentials().getAllTokens()) {
        trackingIds.add(t.decodeIdentifier().getTrackingId());
      }
      conf.setStrings(MRJobConfig.JOB_TOKEN_TRACKING_IDS,
          trackingIds.toArray(new String[trackingIds.size()]));
    }

    // Set reservation info if it exists
    ReservationId reservationId = job.getReservationId();
    if (reservationId != null) {
      conf.set(MRJobConfig.RESERVATION_ID, reservationId.toString());
    }

    // Write the configuration to the job file inside the submit directory.
    writeConf(conf, submitJobFile);

    printTokens(jobId, job.getCredentials());
    // The actual submission. Per the original notes, submitClient is a
    // ClientProtocol with two implementations: YARNRunner (cluster mode)
    // and LocalJobRunner (local mode).
    status = submitClient.submitJob(
        jobId, submitJobDir.toString(), job.getCredentials());
    if (status == null) {
      throw new IOException("Could not launch job");
    }
    return status;
  } finally {
    if (status == null) {
      // No status was obtained: remove what was written to the submit directory.
      LOG.info("Cleaning up the staging area " + submitJobDir);
      if (jtFs != null && submitJobDir != null) {
        jtFs.delete(submitJobDir, true);
      }
    }
  }
}
YARNRunner.submitJob分析:
/**
 * Builds an application submission context for the job, submits it through
 * {@code resMgrDelegate}, and returns the job's status.
 *
 * @param jobId id of the job being submitted
 * @param jobSubmitDir the job's submit directory
 * @param ts credentials of the job
 * @return the job status obtained from the client cached for {@code jobId}
 * @throws IOException if the application report is missing or reports
 *         FAILED/KILLED, or wrapping any {@code YarnException}
 * @throws InterruptedException declared by the steps below
 */
@Override
public JobStatus submitJob(JobID jobId, String jobSubmitDir, Credentials ts)
    throws IOException, InterruptedException {

  // Add the history token to the credentials (see addHistoryToken).
  addHistoryToken(ts);

  // Build the submission context for this job.
  ApplicationSubmissionContext appContext =
      createApplicationSubmissionContext(conf, jobSubmitDir, ts);

  try {
    // Submit the context through resMgrDelegate (described in the notes as
    // a proxy for the ResourceManager).
    ApplicationId applicationId = resMgrDelegate.submitApplication(appContext);

    // Fetch the application report to learn the application's state.
    ApplicationReport appMaster =
        resMgrDelegate.getApplicationReport(applicationId);

    String diagnostics;
    if (appMaster == null) {
      diagnostics = "application report is null";
    } else {
      diagnostics = appMaster.getDiagnostics();
    }

    boolean launchFailed = appMaster == null
        || appMaster.getYarnApplicationState() == YarnApplicationState.FAILED
        || appMaster.getYarnApplicationState() == YarnApplicationState.KILLED;
    if (launchFailed) {
      throw new IOException("Failed to run job : " + diagnostics);
    }
    return clientCache.getClient(jobId).getJobStatus(jobId);
  } catch (YarnException e) {
    throw new IOException(e);
  }
}
ResourceMgrDelegate.submitApplication分析:
/**
 * Forwards the submission context to the wrapped {@code client}.
 *
 * @param appContext the application submission context
 * @return the application id returned by {@code client.submitApplication}
 * @throws YarnException declared by the delegated call
 * @throws IOException declared by the delegated call
 */
@Override
public ApplicationId
    submitApplication(ApplicationSubmissionContext appContext)
        throws YarnException, IOException {
  // Per the original notes, client is a YarnClient, so this call lands in
  // YarnClientImpl.submitApplication.
  final ApplicationId submittedId = client.submitApplication(appContext);
  return submittedId;
}
YarnClientImpl.submitApplication分析:
/**
 * Submits the application through {@code rmClient} and then polls its report
 * until it has left the NEW / NEW_SAVING / SUBMITTED states.
 *
 * <p>If the application is not found while polling, the same request is
 * submitted again. If the calling thread is interrupted while sleeping
 * between polls, its interrupt status is restored and a
 * {@code YarnException} is thrown instead of continuing to poll.
 *
 * @param appContext the submission context; it must carry an application id
 * @return the application id taken from {@code appContext}
 * @throws YarnException if no application id is provided, the application
 *         ends up FAILED or KILLED, the wait times out, or the wait is
 *         interrupted
 * @throws IOException declared by the RPC calls below
 */
@Override
public ApplicationId
    submitApplication(ApplicationSubmissionContext appContext)
        throws YarnException, IOException {
  ApplicationId applicationId = appContext.getApplicationId();
  if (applicationId == null) {
    throw new ApplicationIdNotProvidedException(
        "ApplicationId is not provided in ApplicationSubmissionContext");
  }
  // Wrap the context in a request; the request is what actually gets submitted.
  SubmitApplicationRequest request =
      Records.newRecord(SubmitApplicationRequest.class);
  request.setApplicationSubmissionContext(appContext);

  // Automatically add the timeline DT into the CLC
  // Only when the security and the timeline service are both enabled
  if (isSecurityEnabled() && timelineServiceEnabled) {
    addTimelineDelegationToken(appContext.getAMContainerSpec());
  }

  // Per the original notes, rmClient is an ApplicationClientProtocol whose
  // implementation ApplicationClientProtocolPBClientImpl forwards the
  // request to the ResourceManager.
  rmClient.submitApplication(request);

  int pollCount = 0;
  long startTime = System.currentTimeMillis();
  EnumSet<YarnApplicationState> waitingStates =
      EnumSet.of(YarnApplicationState.NEW,
          YarnApplicationState.NEW_SAVING,
          YarnApplicationState.SUBMITTED);
  EnumSet<YarnApplicationState> failToSubmitStates =
      EnumSet.of(YarnApplicationState.FAILED,
          YarnApplicationState.KILLED);
  // Poll the application report; re-submit the request if the application is not found.
  while (true) {
    try {
      ApplicationReport appReport = getApplicationReport(applicationId);
      YarnApplicationState state = appReport.getYarnApplicationState();
      if (!waitingStates.contains(state)) {
        if (failToSubmitStates.contains(state)) {
          throw new YarnException("Failed to submit " + applicationId +
              " to YARN : " + appReport.getDiagnostics());
        }
        LOG.info("Submitted application " + applicationId);
        break;
      }

      long elapsedMillis = System.currentTimeMillis() - startTime;
      if (enforceAsyncAPITimeout() &&
          elapsedMillis >= asyncApiPollTimeoutMillis) {
        throw new YarnException("Timed out while waiting for application " +
            applicationId + " to be submitted successfully");
      }

      // Notify the client through the log every 10 poll, in case the client
      // is blocked here too long.
      if (++pollCount % 10 == 0) {
        LOG.info("Application submission is not finished, " +
            "submitted application " + applicationId +
            " is still in " + state);
      }
      try {
        Thread.sleep(submitPollIntervalMillis);
      } catch (InterruptedException ie) {
        String msg = "Interrupted while waiting for application "
            + applicationId
            + " to be successfully submitted.";
        LOG.error(msg);
        // Do not lose the interrupt: restore the flag and stop polling.
        Thread.currentThread().interrupt();
        throw new YarnException(msg, ie);
      }
    } catch (ApplicationNotFoundException ex) {
      // FailOver or RM restart happens before RMStateStore saves
      // ApplicationState
      // Fixed: a space was missing before "with", gluing the id to the next word.
      LOG.info("Re-submit application " + applicationId + " with the " +
          "same ApplicationSubmissionContext");
      rmClient.submitApplication(request);
    }
  }

  return applicationId;
}
ApplicationClientProtocolPBClientImpl.submitApplication分析:
/**
 * Converts the request to its protobuf form, sends it through {@code proxy},
 * and wraps the protobuf reply in a {@code SubmitApplicationResponsePBImpl}.
 *
 * @param request the submit-application request; cast to
 *        {@code SubmitApplicationRequestPBImpl} to obtain its proto
 * @return the wrapped response; the {@code return null} after
 *         {@code RPCUtil.unwrapAndThrowException} is presumably unreachable
 *         in practice (assumes that helper always throws — TODO confirm)
 * @throws YarnException declared by this method
 * @throws IOException declared by this method
 */
@Override
public SubmitApplicationResponse submitApplication(
    SubmitApplicationRequest request) throws YarnException,
    IOException {
  SubmitApplicationRequestPBImpl pbRequest =
      (SubmitApplicationRequestPBImpl) request;
  SubmitApplicationRequestProto protoRequest = pbRequest.getProto();
  try {
    // Per the original notes, proxy is an ApplicationClientProtocolPB, an
    // inter-process communication interface; the client-side trace of job
    // submission ends here.
    return new SubmitApplicationResponsePBImpl(
        proxy.submitApplication(null, protoRequest));
  } catch (ServiceException e) {
    RPCUtil.unwrapAndThrowException(e);
    return null;
  }
}