I. Source code analysis of the job submission process
1. Source of job.waitForCompletion(true)
/**
 * 1. @param verbose: true means progress and other runtime information is printed to the user as the job runs; false means the call simply waits for the job to finish
 * 2. public static enum JobState {DEFINE, RUNNING}: job configuration (number of reduce tasks, InputFormat class, Mapper class, Reducer class, etc.)
 *    can only be set while the Job object is in the DEFINE state; once the job is submitted its state becomes RUNNING, the configuration can no longer
 *    be changed, and the job is in the scheduling/running phase, where a RUNNING job can report map task and reduce task progress
 * 3. The method consists of submit(), monitorAndPrintJob(), etc.
 */
public boolean waitForCompletion(boolean verbose) throws IOException, InterruptedException,
                                                         ClassNotFoundException {
  if (state == JobState.DEFINE) {
    submit(); // the core submission path
  }
  // once the job has been submitted above, run the monitoring code below
  if (verbose) {
    monitorAndPrintJob(); // monitor the job's status and print execution logs to the console
  } else {
    // get the completion poll interval from the client.
    int completionPollIntervalMillis =
      Job.getCompletionPollInterval(cluster.getConf());
    while (!isComplete()) { // isComplete: poll the cluster until the job has finished
      try {
        Thread.sleep(completionPollIntervalMillis);
      } catch (InterruptedException ie) {
      }
    }
  }
  return isSuccessful();
}
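For context, the sketch below shows where a driver typically calls waitForCompletion(true). It is a minimal, WordCount-style driver; TokenizerMapper, IntSumReducer and the input/output paths are illustrative assumptions, not taken from the source above.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class WordCountDriver {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    Job job = Job.getInstance(conf, "word count"); // the Job starts in the DEFINE state
    job.setJarByClass(WordCountDriver.class);
    job.setMapperClass(TokenizerMapper.class);     // hypothetical Mapper implementation
    job.setReducerClass(IntSumReducer.class);      // hypothetical Reducer implementation
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);
    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));
    // verbose = true: submit() runs first, then monitorAndPrintJob() prints progress until the job ends
    System.exit(job.waitForCompletion(true) ? 0 : 1);
  }
}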
2. Source of submit()
/**
 * 1. ensureState(JobState.DEFINE): verify the job is still in the DEFINE state (throws otherwise)
 * 2. connect(): build the connection between the client and the cluster (see 3)
 * 3. getJobSubmitter(...) creates the JobSubmitter (see 4) and submitter.submitJobInternal(...) performs the actual submission (see 5)
 */
public void submit() throws IOException, InterruptedException, ClassNotFoundException {
  ensureState(JobState.DEFINE); // verify the job is still in the DEFINE state
  setUseNewAPI(); // map old-API properties onto the new API for backward compatibility
  connect(); // create the client-to-cluster connection (remote or local), see 3
  // getJobSubmitter is covered in detail in 4
  final JobSubmitter submitter =
      getJobSubmitter(cluster.getFileSystem(), cluster.getClient());
  // the actual submission, submitJobInternal, is covered in detail in 5
  status = ugi.doAs(new PrivilegedExceptionAction<JobStatus>() {
    public JobStatus run() throws IOException, InterruptedException,
        ClassNotFoundException {
      return submitter.submitJobInternal(Job.this, cluster);
    }
  });
  state = JobState.RUNNING; // mark the job as RUNNING
  LOG.info("The url to track the job: " + getTrackingURL());
}
============================ensureState(JobState.DEFINE)===================
private void ensureState(JobState state) throws IllegalStateException {
if (state != this.state) {
throw new IllegalStateException("Job in state "+ this.state +
" instead of " + state);
}
if (state == JobState.RUNNING && cluster == null) {
throw new IllegalStateException
("Job in state " + this.state
+ ", but it isn't attached to any job tracker!");
}
}
===========================setUseNewAPI()=====================================
private void setUseNewAPI() throws IOException {
int numReduces = conf.getNumReduceTasks();
String oldMapperClass = "mapred.mapper.class";
String oldReduceClass = "mapred.reducer.class";
conf.setBooleanIfUnset("mapred.mapper.new-api",
conf.get(oldMapperClass) == null);
if (conf.getUseNewMapper()) {
String mode = "new map API";
ensureNotSet("mapred.input.format.class", mode);
ensureNotSet(oldMapperClass, mode);
if (numReduces != 0) {
ensureNotSet("mapred.partitioner.class", mode);
} else {
ensureNotSet("mapred.output.format.class", mode);
}
} else {
String mode = "map compatability";
ensureNotSet(INPUT_FORMAT_CLASS_ATTR, mode);
ensureNotSet(MAP_CLASS_ATTR, mode);
if (numReduces != 0) {
ensureNotSet(PARTITIONER_CLASS_ATTR, mode);
} else {
ensureNotSet(OUTPUT_FORMAT_CLASS_ATTR, mode);
}
}
if (numReduces != 0) {
conf.setBooleanIfUnset("mapred.reducer.new-api",
conf.get(oldReduceClass) == null);
if (conf.getUseNewReducer()) {
String mode = "new reduce API";
ensureNotSet("mapred.output.format.class", mode);
ensureNotSet(oldReduceClass, mode);
} else {
String mode = "reduce compatability";
ensureNotSet(OUTPUT_FORMAT_CLASS_ATTR, mode);
ensureNotSet(REDUCE_CLASS_ATTR, mode);
}
}
}
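setUseNewAPI() only flips mapred.mapper.new-api / mapred.reducer.new-api when the corresponding old-API keys are unset, and ensureNotSet() rejects configurations that mix the two APIs. As a rough illustration (a sketch, not part of the source above; the NoopMapper class is made up), setting the Mapper through the new org.apache.hadoop.mapreduce.Job API populates mapreduce.job.map.class and leaves mapred.mapper.class null, so the new-API flag ends up true at submit time:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;

public class NewApiProbe {
  // trivial mapper, only here so setMapperClass() has something to point at
  public static class NoopMapper extends Mapper<Object, Object, Object, Object> {}

  public static void main(String[] args) throws Exception {
    Job job = Job.getInstance(new Configuration());
    job.setMapperClass(NoopMapper.class); // new API: stores the class under mapreduce.job.map.class
    Configuration jc = job.getConfiguration();
    System.out.println(jc.get("mapred.mapper.class"));     // null -> mapred.mapper.new-api defaults to true
    System.out.println(jc.get("mapreduce.job.map.class")); // NewApiProbe$NoopMapper
  }
}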
3. Source of connect()
private synchronized void connect()
throws IOException, InterruptedException, ClassNotFoundException {
if (cluster == null) { // if no cluster proxy exists yet, create one
cluster =
ugi.doAs(new PrivilegedExceptionAction<Cluster>() {
public Cluster run()
throws IOException, InterruptedException,
ClassNotFoundException {
return new Cluster(getConfiguration());
}
});
}
}
===================================new Cluster(getConfiguration())==========================
public Cluster(Configuration conf) throws IOException {
this(null, conf);
}
================================== this(null, conf)========================================
public Cluster(InetSocketAddress jobTrackAddr, Configuration conf)
throws IOException {
this.conf = conf;
this.ugi = UserGroupInformation.getCurrentUser();
initialize(jobTrackAddr, conf); // initialization: decide whether the job runs locally or is submitted to a remote cluster
}
====================================initialize(jobTrackAddr, conf)========================
/**
 * jobTrackAddr: address of the job tracker (null when no explicit address is given)
 * conf: the job configuration
 */
private void initialize(InetSocketAddress jobTrackAddr, Configuration conf)
throws IOException {
synchronized (frameworkLoader) {
for (ClientProtocolProvider provider : frameworkLoader) { // iterate over the available client protocol providers
LOG.debug("Trying ClientProtocolProvider : "
+ provider.getClass().getName());
ClientProtocol clientProtocol = null;
try {
if (jobTrackAddr == null) { // no explicit address: the provider decides from conf (mapreduce.framework.name) whether to create a local or remote client
clientProtocol = provider.create(conf);
} else { // an explicit address was given: create a client for that remote address
clientProtocol = provider.create(jobTrackAddr, conf);
}
if (clientProtocol != null) {
clientProtocolProvider = provider;
client = clientProtocol;
LOG.debug("Picked " + provider.getClass().getName()
+ " as the ClientProtocolProvider");
break;
}
else {
LOG.debug("Cannot pick " + provider.getClass().getName()
+ " as the ClientProtocolProvider - returned null protocol");
}
}
catch (Exception e) {
LOG.info("Failed to use " + provider.getClass().getName()
+ " due to error: " + e.getMessage());
}
}
}
if (null == clientProtocolProvider || null == client) {
throw new IOException(
"Cannot initialize Cluster. Please check your configuration for "
+ MRConfig.FRAMEWORK_NAME
+ " and the correspond server addresses.");
}
}
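initialize() simply takes the first ClientProtocolProvider (loaded via ServiceLoader) that returns a non-null protocol; which provider answers is controlled by mapreduce.framework.name. Below is a minimal sketch of the two common cases, assuming the default Hadoop 2.x providers are on the classpath (the yarn case additionally needs a reachable ResourceManager):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.Cluster;

public class ClusterProbe {
  public static void main(String[] args) throws Exception {
    // "local" (also the default): LocalClientProtocolProvider answers, so the job
    // runs in-process through the LocalJobRunner and no daemons are required
    Configuration local = new Configuration();
    local.set("mapreduce.framework.name", "local");
    Cluster localCluster = new Cluster(local);

    // "yarn": YarnClientProtocolProvider answers with a YARNRunner, so submitJob()
    // later hands the job to the ResourceManager configured in yarn-site.xml
    Configuration yarn = new Configuration();
    yarn.set("mapreduce.framework.name", "yarn");
    Cluster yarnCluster = new Cluster(yarn);

    System.out.println(localCluster + " / " + yarnCluster);
  }
}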
4. Source of getJobSubmitter(cluster.getFileSystem(), cluster.getClient())
/**
 * Build the JobSubmitter (the object that performs the actual submission) from the cluster obtained during initialization
 */
public JobSubmitter getJobSubmitter(FileSystem fs,
ClientProtocol submitClient) throws IOException {
return new JobSubmitter(fs, submitClient);
}
==========================new JobSubmitter(fs, submitClient)================================
JobSubmitter(FileSystem submitFs, ClientProtocol submitClient)
throws IOException {
this.submitClient = submitClient;
this.jtFs = submitFs;
}
5. Submitting the job: submitter.submitJobInternal(Job.this, cluster)
/**
 * 1. Check the job's output specification: checkSpecs(job)
 * 2. Create the staging path the job data is submitted to: JobSubmissionFiles.getStagingDir(cluster, conf)
 * 3. Configure the command line options correctly (record the submitting host name and address)
 * 4. Get the job id via submitClient.getNewJobID() and create the job path: new Path(jobStagingArea, jobId.toString())
 * 5. Upload the job resources, write the input splits and the job.xml to the submit directory,
 *    and finally call submitClient.submitJob(...) to submit the job and return its status
 */
JobStatus submitJobInternal(Job job, Cluster cluster)
throws ClassNotFoundException, InterruptedException, IOException {
// 1. Check the job's output specification
checkSpecs(job);
Configuration conf = job.getConfiguration();
addMRFrameworkToDistributedCache(conf);
// 2. Create the staging path under which job data is submitted to the cluster
Path jobStagingArea = JobSubmissionFiles.getStagingDir(cluster, conf);
// 3. Configure the command line options correctly (record the submitting host name and address)
InetAddress ip = InetAddress.getLocalHost();
if (ip != null) {
submitHostAddress = ip.getHostAddress();
submitHostName = ip.getHostName();
conf.set(MRJobConfig.JOB_SUBMITHOST,submitHostName);
conf.set(MRJobConfig.JOB_SUBMITHOSTADDR,submitHostAddress);
}
// 4. Get a new job id and create the job's submit path
JobID jobId = submitClient.getNewJobID();
job.setJobID(jobId);
Path submitJobDir = new Path(jobStagingArea, jobId.toString());
JobStatus status = null;
// below: a series of configuration steps plus generation and validation of security tokens
try {
conf.set(MRJobConfig.USER_NAME,
UserGroupInformation.getCurrentUser().getShortUserName());
conf.set("hadoop.http.filter.initializers",
"org.apache.hadoop.yarn.server.webproxy.amfilter.AmFilterInitializer");
conf.set(MRJobConfig.MAPREDUCE_JOB_DIR, submitJobDir.toString());
LOG.debug("Configuring job " + jobId + " with " + submitJobDir
+ " as the submit dir");
// obtain delegation tokens for the job submit directory
TokenCache.obtainTokensForNamenodes(job.getCredentials(),
new Path[] { submitJobDir }, conf);
populateTokenCache(conf, job.getCredentials());
// generate a secret key to secure the shuffle data transfer
if (TokenCache.getShuffleSecretKey(job.getCredentials()) == null) {
KeyGenerator keyGen;
try {
keyGen = KeyGenerator.getInstance(SHUFFLE_KEYGEN_ALGORITHM);
keyGen.init(SHUFFLE_KEY_LENGTH);
} catch (NoSuchAlgorithmException e) {
throw new IOException("Error generating shuffle secret key", e);
}
SecretKey shuffleKey = keyGen.generateKey();
TokenCache.setShuffleSecretKey(shuffleKey.getEncoded(),
job.getCredentials());
}
// 5. Upload the job's resources (jar, files, archives) to the submit directory on the cluster
copyAndConfigureFiles(job, submitJobDir);
Path submitJobFile = JobSubmissionFiles.getJobConfPath(submitJobDir);
// 6. Compute the input splits and write the split files
LOG.debug("Creating splits at " + jtFs.makeQualified(submitJobDir));
int maps = writeSplits(job, submitJobDir);
conf.setInt(MRJobConfig.NUM_MAPS, maps);
LOG.info("number of splits:" + maps);
// 7. Write the queue administrators of the queue the job is submitted to into the job file
String queue = conf.get(MRJobConfig.QUEUE_NAME,
JobConf.DEFAULT_QUEUE_NAME);
AccessControlList acl = submitClient.getQueueAdmins(queue);
conf.set(toFullPropertyName(queue,
QueueACL.ADMINISTER_JOBS.getAclName()), acl.getAclString());
// removing jobtoken referrals before copying the jobconf to HDFS
// as the tasks don't need this setting, actually they may break
// because of it if present as the referral will point to a
// different job.
TokenCache.cleanUpTokenReferral(conf);
if (conf.getBoolean(
MRJobConfig.JOB_TOKEN_TRACKING_IDS_ENABLED,
MRJobConfig.DEFAULT_JOB_TOKEN_TRACKING_IDS_ENABLED)) {
// Add HDFS tracking ids
ArrayList<String> trackingIds = new ArrayList<String>();
for (Token<? extends TokenIdentifier> t :
job.getCredentials().getAllTokens()) {
trackingIds.add(t.decodeIdentifier().getTrackingId());
}
conf.setStrings(MRJobConfig.JOB_TOKEN_TRACKING_IDS,
trackingIds.toArray(new String[trackingIds.size()]));
}
// 8. Write the job configuration xml (job.xml) to the staging path
writeConf(conf, submitJobFile);
//
// Now, actually submit the job (using the submit name)
//
printTokens(jobId, job.getCredentials());
// 9. Submit the job and return its submission status
status = submitClient.submitJob(
jobId, submitJobDir.toString(), job.getCredentials());
if (status != null) {
return status;
} else {
throw new IOException("Could not launch job");
}
} finally {
if (status == null) {
LOG.info("Cleaning up the staging area " + submitJobDir);
if (jtFs != null && submitJobDir != null)
jtFs.delete(submitJobDir, true);
}
}
}
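Step 6 (writeSplits) delegates to the configured InputFormat's getSplits(); the number of splits returned becomes mapreduce.job.maps, i.e. the number of MapTasks. For file-based input the split size follows FileInputFormat's rule splitSize = max(minSize, min(maxSize, blockSize)). The sketch below is only an illustration with made-up numbers and ignores the roughly 10% slop FileInputFormat allows for the last split:

public class SplitSizeDemo {
  // same formula FileInputFormat.computeSplitSize() uses
  static long computeSplitSize(long blockSize, long minSize, long maxSize) {
    return Math.max(minSize, Math.min(maxSize, blockSize));
  }

  public static void main(String[] args) {
    long blockSize = 128L * 1024 * 1024; // 128 MB HDFS block size
    long minSize = 1L;                   // mapreduce.input.fileinputformat.split.minsize (effectively 1 by default)
    long maxSize = Long.MAX_VALUE;       // mapreduce.input.fileinputformat.split.maxsize (unlimited by default)
    long splitSize = computeSplitSize(blockSize, minSize, maxSize); // 128 MB: one split per block

    long fileLen = 300L * 1024 * 1024;   // hypothetical 300 MB input file
    long splits = (fileLen + splitSize - 1) / splitSize;            // 3 splits -> 3 map tasks
    System.out.println("splitSize=" + splitSize + " bytes, splits=" + splits);
  }
}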
II. Job submission process recap
1. The job submission flow, traced through the source
- waitForCompletion(verbose)
  - submit();
    - ensureState(JobState.DEFINE); // check the current job state
    - setUseNewAPI(); // map old-API properties onto the new API for compatibility
    - connect(); // 1. establish the connection
      - new Cluster(getConfiguration()); // 1) create the proxy used to submit the job
        - initialize(jobTrackAddr, conf); // (1) decide whether the job runs locally or on YARN
    - submitter.submitJobInternal(Job.this, cluster); // 2. submit the job
      - Path jobStagingArea = JobSubmissionFiles.getStagingDir(cluster, conf); // 1) create the staging path for submitting job data to the cluster
      - JobID jobId = submitClient.getNewJobID(); // 2) get the job id and create the job path
      - copyAndConfigureFiles(job, submitJobDir); // 3) copy the job's jars and resources to the cluster
        - rUploader.uploadFiles(job, jobSubmitDir);
      - writeSplits(job, submitJobDir); // 4) compute the input splits and write the split planning files
        - maps = writeNewSplits(job, jobSubmitDir);
      - writeConf(conf, submitJobFile); // 5) write the job.xml configuration to the staging path
      - status = submitClient.submitJob(jobId, submitJobDir.toString(), job.getCredentials()); // 6) submit the job and return its status
  - monitorAndPrintJob(): executed when verbose=true, i.e. progress and other information is reported to the user as the job runs
  - When verbose=false ("just wait for the job to finish"), the client instead uses Job.getCompletionPollInterval(cluster.getConf()) to poll the job status at a fixed interval until the job completes and only then returns (a minimal polling sketch follows this list)
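When a driver does not want the blocking behaviour of waitForCompletion, it can do what the verbose=false branch does itself: call submit() and then poll the job. A minimal sketch (the 5-second interval is made up; waitForCompletion(false) uses Job.getCompletionPollInterval instead):

import org.apache.hadoop.mapreduce.Job;

public class AsyncSubmitSketch {
  // job is assumed to be a fully configured Job still in the DEFINE state
  static boolean runAndPoll(Job job) throws Exception {
    job.submit();               // returns as soon as the job has been handed to the cluster
    while (!job.isComplete()) { // same polling loop waitForCompletion(false) runs internally
      Thread.sleep(5000);
      System.out.printf("map %.0f%%  reduce %.0f%%%n",
          job.mapProgress() * 100, job.reduceProgress() * 100);
    }
    return job.isSuccessful();
  }
}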