MapReduce Source Code Analysis: Job Submission, Initialization, Assignment, and Execution — The Submission Phase

Job Submission

The JobClient asks the JobTracker for a new job ID, creates the relevant HDFS directories, computes the input split information, produces the job.jar, job.xml, and job.split files, copies them to the JobTracker's file system, and finally invokes the JobTracker's submitJob method via a remote RPC call.
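For orientation, everything analyzed below is triggered from an ordinary driver program. A minimal sketch of such a driver (MyDriver, MyMapper, and MyReducer are placeholder names, not classes from the source being analyzed):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

// Minimal driver sketch; MyMapper and MyReducer are hypothetical user classes.
public class MyDriver {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    Job job = new Job(conf, "my job");          // 0.20-era constructor
    job.setJarByClass(MyDriver.class);
    job.setMapperClass(MyMapper.class);
    job.setReducerClass(MyReducer.class);
    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));
    // waitForCompletion(true) is the entry point of the submission flow below.
    System.exit(job.waitForCompletion(true) ? 0 : 1);
  }
}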

The job.waitForCompletion method

  public boolean waitForCompletion(boolean verbose
                                   ) throws IOException, InterruptedException,
                                            ClassNotFoundException {
    if (state == JobState.DEFINE) {
      submit();   // submit the job
    }
    if (verbose) {
      jobClient.monitorAndPrintJob(conf, info);   // poll job progress once per second and print any change since the last report to the console
    } else {
      info.waitForCompletion();
    }
    return isSuccessful();
  }
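On the verbose path, monitorAndPrintJob simply keeps polling the RunningJob handle and printing progress. A stripped-down sketch of that loop (pollProgress is my own name; the real JobClient code also prints task completion events and the final counters):

import org.apache.hadoop.mapred.RunningJob;

// Stripped-down sketch of the polling monitorAndPrintJob performs; the real
// code also reports task completion events and counters.
static void pollProgress(RunningJob rj) throws Exception {
  while (!rj.isComplete()) {
    System.out.printf("map %.0f%% reduce %.0f%%%n",
        rj.mapProgress() * 100, rj.reduceProgress() * 100);
    Thread.sleep(1000);   // roughly once per second
  }
  System.out.println("Job " + (rj.isSuccessful() ? "succeeded" : "failed"));
}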
  


The submit method:

  public void submit() throws IOException, InterruptedException, 
                              ClassNotFoundException {
    ensureState(JobState.DEFINE);
    setUseNewAPI();
    info = jobClient.submitJobInternal(conf);
    state = JobState.RUNNING;
   }
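setUseNewAPI decides whether the job runs with the old org.apache.hadoop.mapred API or the new org.apache.hadoop.mapreduce API; the job.getUseNewMapper() check later in submitJobInternal reads the result of this decision. A simplified sketch of the idea (chooseApi is my own name; the real method also validates that old- and new-API settings are not mixed):

import org.apache.hadoop.mapred.JobConf;

// Simplified sketch of the decision made by setUseNewAPI: use the new API
// unless an old-API mapper/reducer class has been explicitly configured.
static void chooseApi(JobConf conf) {
  conf.setBooleanIfUnset("mapred.mapper.new-api",
                         conf.get("mapred.mapper.class") == null);
  conf.setBooleanIfUnset("mapred.reducer.new-api",
                         conf.get("mapred.reducer.class") == null);
}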
  


The core of the jobClient.submitJobInternal method:

public 
  RunningJob submitJobInternal(JobConf job
                               ) throws FileNotFoundException, 
                                        ClassNotFoundException,
                                        InterruptedException,
                                        IOException {
    /*
     * configure the command line options correctly on the submitting dfs
     */
    
    JobID jobId = jobSubmitClient.getNewJobId();
    Path submitJobDir = new Path(getSystemDir(), jobId.toString());
    Path submitJarFile = new Path(submitJobDir, "job.jar");
    Path submitSplitFile = new Path(submitJobDir, "job.split");
    configureCommandLineOptions(job, submitJobDir, submitJarFile);   // copy the job jar to the JobTracker's file system
    Path submitJobFile = new Path(submitJobDir, "job.xml");
    int reduces = job.getNumReduceTasks();
    JobContext context = new JobContext(job, jobId);
    
    // Check the output specification
    if (reduces == 0 ? job.getUseNewMapper() : job.getUseNewReducer()) {
      org.apache.hadoop.mapreduce.OutputFormat<?,?> output =
        ReflectionUtils.newInstance(context.getOutputFormatClass(), job);
      output.checkOutputSpecs(context);   // calls FileOutputFormat's checkOutputSpecs: throws if the output directory is not set or already exists
    } else {
      job.getOutputFormat().checkOutputSpecs(fs, job);
    }

    // Create the splits for the job
    LOG.debug("Creating splits at " + fs.makeQualified(submitSplitFile));
    int maps;
    if (job.getUseNewMapper()) {   // distinguish between the old and the new MapReduce API
      maps = writeNewSplits(context, submitSplitFile);   // compute split info and write it to the JobTracker's file system
    } else {
      maps = writeOldSplits(job, submitSplitFile);
    }
    job.set("mapred.job.split.file", submitSplitFile.toString());
    job.setNumMapTasks(maps);
        
    // Write job file to JobTracker's fs        
    FSDataOutputStream out = 
      FileSystem.create(fs, submitJobFile,
                        new FsPermission(JOB_FILE_PERMISSION));

    try {
      job.writeXml(out);   // write job.xml to the JobTracker's file system
    } finally {
      out.close();
    }

    //
    // Now, actually submit the job (using the submit name)
    //
    JobStatus status = jobSubmitClient.submitJob(jobId);   // the actual submission: a remote RPC call to the JobTracker's submitJob method
    if (status != null) {
      return new NetworkedJob(status);
    } else {
      throw new IOException("Could not launch job");
    }
  }
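Note that jobSubmitClient is not a local object: it is an RPC proxy for the JobTracker's JobSubmissionProtocol, created when the JobClient is initialized, so getNewJobId() and submitJob(jobId) above are both remote calls. A simplified sketch of how such a proxy is obtained (the real createRPCProxy also passes the caller's UGI and a SocketFactory, and JobSubmissionProtocol is an internal interface rather than public API):

import java.net.InetSocketAddress;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.ipc.RPC;
import org.apache.hadoop.mapred.JobTracker;

// Simplified sketch of how JobClient wires up jobSubmitClient; assumes access
// to the internal JobSubmissionProtocol interface. The real createRPCProxy
// also passes the user's UGI and a SocketFactory.
static JobSubmissionProtocol createProxy(Configuration conf) throws Exception {
  InetSocketAddress addr = JobTracker.getAddress(conf);   // from mapred.job.tracker
  return (JobSubmissionProtocol) RPC.getProxy(
      JobSubmissionProtocol.class, JobSubmissionProtocol.versionID, addr, conf);
}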

 

The writeNewSplits method

private <T extends org.apache.hadoop.mapreduce.InputSplit> 
  int writeNewSplits(JobContext job, Path submitSplitFile
                     ) throws IOException, InterruptedException, 
                              ClassNotFoundException {
    JobConf conf = job.getJobConf();
    org.apache.hadoop.mapreduce.InputFormat<?,?> input =
      ReflectionUtils.newInstance(job.getInputFormatClass(), job.getJobConf());
    
    List<org.apache.hadoop.mapreduce.InputSplit> splits = input.getSplits(job);   // call the configured InputFormat to compute the splits
    T[] array = (T[])
      splits.toArray(new org.apache.hadoop.mapreduce.InputSplit[splits.size()]);  // the split list as an array

    // sort the splits into order based on size, so that the biggest
    // go first
    Arrays.sort(array, new NewSplitComparator());
    DataOutputStream out = writeSplitsFileHeader(conf, submitSplitFile,
                                                 array.length);   // create the split file and return its output stream
    try {
      if (array.length != 0) {
        DataOutputBuffer buffer = new DataOutputBuffer();
        RawSplit rawSplit = new RawSplit();
        SerializationFactory factory = new SerializationFactory(conf);
        Serializer<T> serializer =
          factory.getSerializer((Class<T>) array[0].getClass());
        serializer.open(buffer);
        for (T split : array) {
          rawSplit.setClassName(split.getClass().getName());
          buffer.reset();
          serializer.serialize(split);   // serialize the split into the buffer; for Writable splits this ends up in org.apache.hadoop.io.serializer.WritableSerialization's serialize method
          rawSplit.setDataLength(split.getLength());
          rawSplit.setBytes(buffer.getData(), 0, buffer.getLength());   // copy the buffered bytes into the rawSplit
          rawSplit.setLocations(split.getLocations());
          rawSplit.write(out);   // write the rawSplit out to the split file on the JobTracker's file system
        }
        serializer.close();
      }
    } finally {
      out.close();
    }
    return array.length;
  }
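What input.getSplits returns depends entirely on the configured InputFormat. For the common FileInputFormat family, each file is chopped into splits whose size is derived from the HDFS block size and the configured minimum/maximum split sizes; the core rule (the essence of FileInputFormat.computeSplitSize, while the real getSplits additionally handles unsplittable compressed files and the last partial block of each file) is:

// Core split-size rule used by FileInputFormat (new API, 0.20 era).
static long computeSplitSize(long blockSize, long minSize, long maxSize) {
  return Math.max(minSize, Math.min(maxSize, blockSize));
}
// With the defaults (minSize = 1, maxSize = Long.MAX_VALUE) this is just the
// block size, e.g. 64 MB, so the number of map tasks roughly equals the
// number of HDFS blocks in the input.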



 

 
