Hadoop Source Walkthrough: The MapReduce Execution Flow
1. Driver (the driver class that launches the MapReduce job)
public static void main(String[] args) throws Exception {
    args = new String[] { "D:\\git\\study\\BigDataPro\\hadoop\\src\\main\\resources\\input",
            "D:\\git\\study\\BigDataPro\\hadoop\\src\\main\\resources\\udoutput" };
    Configuration conf = new Configuration();
    Job job = Job.getInstance(conf);
    job.setJarByClass(WholeDriver.class);
    job.setMapperClass(WholeMapper.class);
    job.setReducerClass(WholeReducer.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(BytesWritable.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(BytesWritable.class);
    FileInputFormat.setInputPaths(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));
    job.setInputFormatClass(WholeFileInputformat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    boolean result = job.waitForCompletion(true);
    System.exit(result ? 0 : 1);
}
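The Driver above references a user-defined WholeMapper (and a WholeFileInputformat) whose code is not shown. Below is a minimal sketch of what such a mapper could look like, assuming the input format delivers one whole file per record and the file name is used as the output key; the class body is an assumption, only the name and the key/value types come from the Driver.
import java.io.IOException;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;

public class WholeMapper extends Mapper<Text, BytesWritable, Text, BytesWritable> {

    private final Text filenameKey = new Text();

    @Override
    protected void setup(Context context) throws IOException, InterruptedException {
        // Key every output record by the name of the file this split belongs to.
        FileSplit split = (FileSplit) context.getInputSplit();
        filenameKey.set(split.getPath().getName());
    }

    @Override
    protected void map(Text key, BytesWritable value, Context context)
            throws IOException, InterruptedException {
        // Forward the whole file content unchanged; the types match the Driver's
        // setMapOutputKeyClass(Text) / setMapOutputValueClass(BytesWritable).
        context.write(filenameKey, value);
    }
}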
2. waitForCompletion
public boolean waitForCompletion(boolean verbose) {
    if (state == JobState.DEFINE) {
        submit();
    }
    if (verbose) {
        monitorAndPrintJob();
    } else {
        int completionPollIntervalMillis =
                Job.getCompletionPollInterval(cluster.getConf());
        while (!isComplete()) {
            try {
                Thread.sleep(completionPollIntervalMillis);
            } catch (InterruptedException ie) {
            }
        }
    }
    return isSuccessful();
}
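When verbose is false, the client simply sleeps between isComplete() checks. The poll interval comes from the client configuration; a small hedged sketch of tuning it (the 10-second value is an arbitrary example):
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.Job;

public class PollIntervalExample {
    public static Job newJob() throws Exception {
        Configuration conf = new Configuration();
        // How often waitForCompletion(false) polls for completion, in milliseconds.
        conf.setInt("mapreduce.client.completion.pollinterval", 10000);
        return Job.getInstance(conf);
    }
}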
3. submit
public void submit() {
    ensureState(JobState.DEFINE);
    setUseNewAPI();
    connect();
    final JobSubmitter submitter =
            getJobSubmitter(cluster.getFileSystem(), cluster.getClient());
    status = ugi.doAs(new PrivilegedExceptionAction<JobStatus>() {
        public JobStatus run() throws IOException, InterruptedException,
                ClassNotFoundException {
            return submitter.submitJobInternal(Job.this, cluster);
        }
    });
    state = JobState.RUNNING;
    LOG.info("The url to track the job: " + getTrackingURL());
}
4. submitJobInternal
JobStatus submitJobInternal(Job job, Cluster cluster) {
    // Abridged: copy the job resources, compute the input splits, write job.xml,
    // then hand the job off to the submit client (LocalJobRunner here).
    copyAndConfigureFiles(job, submitJobDir);
    int maps = writeSplits(job, submitJobDir);
    writeConf(conf, submitJobFile);
    status = submitClient.submitJob(jobId, submitJobDir.toString(), job.getCredentials());
    return status;
}
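writeSplits() is where the number of map tasks is fixed: one task per input split. The split size, and therefore the map count, can be steered from the Driver; a hedged sketch (the 16 MB cap is an arbitrary example value):
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;

public class SplitSizeExample {
    public static void tuneSplits(Job job) {
        // Cap each split at 16 MB so large inputs fan out into more map tasks.
        FileInputFormat.setMaxInputSplitSize(job, 16 * 1024 * 1024L);
        FileInputFormat.setMinInputSplitSize(job, 1L);
    }
}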
5. submitJob
public org.apache.hadoop.mapreduce.JobStatus submitJob(
        org.apache.hadoop.mapreduce.JobID jobid, String jobSubmitDir,
        Credentials credentials) throws IOException {
    Job job = new Job(JobID.downgrade(jobid), jobSubmitDir);
    job.job.setCredentials(credentials);
    return job.status;
}
6. Job
public Job(JobID jobid, String jobSubmitDir) throws IOException {
    OutputStream out = localFs.create(localJobFile);
    try {
        conf.writeXml(out);
    } finally {
        out.close();
    }
    // LocalJobRunner.Job is a Thread; starting it kicks off the run() method below.
    this.start();
}
public void run() {
    org.apache.hadoop.mapreduce.OutputCommitter outputCommitter = null;
    try {
        outputCommitter = createOutputCommitter(conf.getUseNewMapper(), jobId, conf);
    } catch (Exception e) {
        LOG.info("Failed to createOutputCommitter", e);
        return;
    }
    try {
        // Read the split meta info that was written during job submission.
        TaskSplitMetaInfo[] taskSplitMetaInfos =
                SplitMetaInfoReader.readSplitMetaInfo(jobId, localFs, conf, systemJobDir);
        int numReduceTasks = job.getNumReduceTasks();
        outputCommitter.setupJob(jContext);
        status.setSetupProgress(1.0f);
        Map<TaskAttemptID, MapOutputFile> mapOutputFiles =
                Collections.synchronizedMap(new HashMap<TaskAttemptID, MapOutputFile>());
        // One runnable per input split.
        List<RunnableWithThrowable> mapRunnables = getMapTaskRunnables(
                taskSplitMetaInfos, jobId, mapOutputFiles);
        initCounters(mapRunnables.size(), numReduceTasks);
        ExecutorService mapService = createMapExecutor();
        runTasks(mapRunnables, mapService, "map");
        try {
            if (numReduceTasks > 0) {
                // Reduce tasks only start after all map tasks have finished.
                List<RunnableWithThrowable> reduceRunnables = getReduceTaskRunnables(
                        jobId, mapOutputFiles);
                ExecutorService reduceService = createReduceExecutor();
                runTasks(reduceRunnables, reduceService, "reduce");
            }
        } finally {
            for (MapOutputFile output : mapOutputFiles.values()) {
                output.removeAll();
            }
        }
    } catch (Throwable t) {
        // Abridged: the original also marks the job failed and performs cleanup here.
    }
}
7. runTasks
private void runTasks(List<RunnableWithThrowable> runnables,
        ExecutorService service, String taskType) throws Exception {
    // Submit every task runnable to the executor...
    for (Runnable r : runnables) {
        service.submit(r);
    }
    // ...then drain the queue and block until all of them have finished.
    service.shutdown();
    service.awaitTermination(Long.MAX_VALUE, TimeUnit.NANOSECONDS);
}
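In local mode the executor created by createMapExecutor() is sized from the LocalJobRunner's max-maps setting. A hedged sketch of raising it (property name per LocalJobRunner, the value of 4 is arbitrary):
import org.apache.hadoop.conf.Configuration;

public class LocalParallelismExample {
    public static Configuration localConf() {
        Configuration conf = new Configuration();
        // Number of map tasks the LocalJobRunner may run concurrently.
        conf.setInt("mapreduce.local.map.tasks.maximum", 4);
        return conf;
    }
}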
7.1 MapTaskRunnable
public void run() {
    try {
        TaskAttemptID mapId = new TaskAttemptID(new TaskID(
                jobId, TaskType.MAP, taskId), 0);
        LOG.info("Starting task: " + mapId);
        mapIds.add(mapId);
        MapTask map = new MapTask(systemJobFile.toString(), mapId, taskId,
                info.getSplitIndex(), 1);
        map.setUser(UserGroupInformation.getCurrentUser().getShortUserName());
        setupChildMapredLocalDirs(map, localConf);
        MapOutputFile mapOutput = new MROutputFiles();
        mapOutput.setConf(localConf);
        mapOutputFiles.put(mapId, mapOutput);
        map.setJobFile(localJobFile.toString());
        localConf.setUser(map.getUser());
        map.localizeConfiguration(localConf);
        map.setConf(localConf);
        try {
            map_tasks.getAndIncrement();
            myMetrics.launchMap(mapId);
            map.run(localConf, Job.this);
            myMetrics.completeMap(mapId);
        } finally {
            map_tasks.getAndDecrement();
        }
        LOG.info("Finishing task: " + mapId);
    } catch (Throwable e) {
        this.storedException = e;
    }
}
7.1.2 run
@Override
public void run(final JobConf job, final TaskUmbilicalProtocol umbilical) {
    this.umbilical = umbilical;
    if (useNewApi) {
        runNewMapper(job, splitMetaInfo, umbilical, reporter);
    } else {
        runOldMapper(job, splitMetaInfo, umbilical, reporter);
    }
    done(umbilical, reporter);
}
7.1.3 runNewMapper
private <INKEY, INVALUE, OUTKEY, OUTVALUE> void runNewMapper(final JobConf job,
        final TaskSplitIndex splitIndex,
        final TaskUmbilicalProtocol umbilical,
        TaskReporter reporter)
        throws IOException, ClassNotFoundException, InterruptedException {
    // The task context, the user Mapper and the InputFormat are created by
    // reflection from the classes configured in the Driver.
    org.apache.hadoop.mapreduce.TaskAttemptContext taskContext =
            new org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl(job,
                    getTaskID(), reporter);
    org.apache.hadoop.mapreduce.Mapper<INKEY, INVALUE, OUTKEY, OUTVALUE> mapper =
            (org.apache.hadoop.mapreduce.Mapper<INKEY, INVALUE, OUTKEY, OUTVALUE>)
                    ReflectionUtils.newInstance(taskContext.getMapperClass(), job);
    org.apache.hadoop.mapreduce.InputFormat<INKEY, INVALUE> inputFormat =
            (org.apache.hadoop.mapreduce.InputFormat<INKEY, INVALUE>)
                    ReflectionUtils.newInstance(taskContext.getInputFormatClass(), job);
    // Rebuild the InputSplit this task is responsible for.
    org.apache.hadoop.mapreduce.InputSplit split = null;
    split = getSplitDetails(new Path(splitIndex.getSplitLocation()),
            splitIndex.getStartOffset());
    org.apache.hadoop.mapreduce.RecordReader<INKEY, INVALUE> input =
            new NewTrackingRecordReader<INKEY, INVALUE>(split, inputFormat, reporter, taskContext);
    job.setBoolean(JobContext.SKIP_RECORDS, isSkipping());
    // With no reducers, map output goes straight to the OutputFormat;
    // otherwise it goes through the sorting/partitioning collector.
    org.apache.hadoop.mapreduce.RecordWriter output = null;
    if (job.getNumReduceTasks() == 0) {
        output = new NewDirectOutputCollector(taskContext, job, umbilical, reporter);
    } else {
        output = new NewOutputCollector(taskContext, job, umbilical, reporter);
    }
    org.apache.hadoop.mapreduce.MapContext<INKEY, INVALUE, OUTKEY, OUTVALUE> mapContext =
            new MapContextImpl<INKEY, INVALUE, OUTKEY, OUTVALUE>(job, getTaskID(),
                    input, output, committer, reporter, split);
    org.apache.hadoop.mapreduce.Mapper<INKEY, INVALUE, OUTKEY, OUTVALUE>.Context mapperContext =
            new WrappedMapper<INKEY, INVALUE, OUTKEY, OUTVALUE>().getMapContext(mapContext);
    try {
        input.initialize(split, mapperContext);
        mapper.run(mapperContext);
        mapPhase.complete();
        setPhase(TaskStatus.Phase.SORT);
        statusUpdate(umbilical);
        input.close();
        input = null;
        output.close(mapperContext);
        output = null;
    } finally {
        closeQuietly(input);
        closeQuietly(output, mapperContext);
    }
}
7.1.3.1 NewOutputCollector
NewOutputCollector(JobContext jobContext, JobConf job,
        TaskUmbilicalProtocol umbilical, TaskReporter reporter)
        throws IOException, ClassNotFoundException {
    collector = createSortingCollector(job, reporter);
    partitions = jobContext.getNumReduceTasks();
    if (partitions > 1) {
        partitioner = (org.apache.hadoop.mapreduce.Partitioner<K, V>)
                ReflectionUtils.newInstance(jobContext.getPartitionerClass(), job);
    } else {
        // With a single reducer, every record goes to partition 0.
        partitioner = new org.apache.hadoop.mapreduce.Partitioner<K, V>() {
            @Override
            public int getPartition(K key, V value, int numPartitions) {
                return partitions - 1;
            }
        };
    }
}
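Because the single-reducer branch always returns partition 0, a custom partitioner only takes effect when job.setNumReduceTasks() is greater than 1. A hedged sketch of a user partitioner (the class name and hashing rule are illustrative, not from the source):
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Partitioner;

public class PrefixPartitioner extends Partitioner<Text, Text> {
    @Override
    public int getPartition(Text key, Text value, int numPartitions) {
        // Route records by a stable hash of the first character of the key.
        String k = key.toString();
        int h = k.isEmpty() ? 0 : k.charAt(0);
        return (h & Integer.MAX_VALUE) % numPartitions;
    }
}
// In the Driver: job.setPartitionerClass(PrefixPartitioner.class);
//                job.setNumReduceTasks(3);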
7.1.3.2 run
public void run(Context context) throws IOException, InterruptedException {
    setup(context);
    try {
        while (context.nextKeyValue()) {
            map(context.getCurrentKey(), context.getCurrentValue(), context);
        }
    } finally {
        cleanup(context);
    }
}
7.1.3.2.1 context
public class Context extends Mapper<KEYIN, VALUEIN, KEYOUT, VALUEOUT>.Context {

    protected MapContext<KEYIN, VALUEIN, KEYOUT, VALUEOUT> mapContext;

    public Context(MapContext<KEYIN, VALUEIN, KEYOUT, VALUEOUT> mapContext) {
        this.mapContext = mapContext;
    }

    public InputSplit getInputSplit() {
        return mapContext.getInputSplit();
    }

    @Override
    public KEYIN getCurrentKey() throws IOException, InterruptedException {
        return mapContext.getCurrentKey();
    }

    @Override
    public VALUEIN getCurrentValue() throws IOException, InterruptedException {
        return mapContext.getCurrentValue();
    }

    @Override
    public boolean nextKeyValue() throws IOException, InterruptedException {
        return mapContext.nextKeyValue();
    }

    @Override
    public void write(KEYOUT key, VALUEOUT value) throws IOException, InterruptedException {
        mapContext.write(key, value);
    }
}
7.1.3.2.2 write
@Override
public void write(K key, V value) throws IOException, InterruptedException {
    collector.collect(key, value, partitioner.getPartition(key, value, partitions));
}

@Override
public void close(TaskAttemptContext context) throws IOException, InterruptedException {
    try {
        collector.flush();
    } catch (ClassNotFoundException cnf) {
        throw new IOException("can't find class ", cnf);
    }
    collector.close();
}
7.1.3.2.3 collect
public synchronized void collect(K key, V value, final int partition) throws IOException {
    if (bufferRemaining <= 0) {
        // The soft limit of the ring buffer has been reached: decide whether to
        // start spilling to disk before serializing this record.
        spillLock.lock();
        try {
            do {
                if (!spillInProgress) {
                    final int kvbidx = 4 * kvindex;
                    final int kvbend = 4 * kvend;
                    final int bUsed = distanceTo(kvbidx, bufindex);
                    final boolean bufsoftlimit = bUsed >= softLimit;
                    if ((kvbend + METASIZE) % kvbuffer.length !=
                            equator - (equator % METASIZE)) {
                        // A spill has finished; reclaim the spilled space.
                        resetSpill();
                        bufferRemaining = Math.min(
                                distanceTo(bufindex, kvbidx) - 2 * METASIZE,
                                softLimit - bUsed) - METASIZE;
                        continue;
                    } else if (bufsoftlimit && kvindex != kvend) {
                        // Soft limit exceeded and there are records to spill: start a spill
                        // and move the equator so collection can continue in parallel.
                        startSpill();
                        final int avgRec = (int)
                                (mapOutputByteCounter.getCounter() /
                                 mapOutputRecordCounter.getCounter());
                        final int distkvi = distanceTo(bufindex, kvbidx);
                        final int newPos = (bufindex +
                                Math.max(2 * METASIZE - 1,
                                        Math.min(distkvi / 2,
                                                distkvi / (METASIZE + avgRec) * METASIZE)))
                                % kvbuffer.length;
                        setEquator(newPos);
                        bufmark = bufindex = newPos;
                        final int serBound = 4 * kvend;
                        bufferRemaining = Math.min(
                                distanceTo(bufend, newPos),
                                Math.min(
                                        distanceTo(newPos, serBound),
                                        softLimit)) - 2 * METASIZE;
                    }
                }
            } while (false);
        } finally {
            spillLock.unlock();
        }
    }
}
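The kvbuffer, softLimit and spill decisions above are all driven by two ordinary job settings; a hedged sketch of tuning them (the values are examples; the defaults are 100 MB and 0.80):
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.Job;

public class SortBufferExample {
    public static Job newJob() throws Exception {
        Configuration conf = new Configuration();
        conf.setInt("mapreduce.task.io.sort.mb", 200);            // size of kvbuffer in MB
        conf.setFloat("mapreduce.map.sort.spill.percent", 0.80f); // softLimit as a fraction of the buffer
        return Job.getInstance(conf);
    }
}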
7.1.3.2.4 flush
public void flush() throws IOException, ClassNotFoundException, InterruptedException {
    sortAndSpill();
    mergeParts();
}
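sortAndSpill() and mergeParts() are also where an optional Combiner runs on map output before it is shuffled. A hedged sketch of enabling one from the Driver (reusing a reducer as the combiner, as below with the MR2_Reducer shown later, is only valid when its input and output types both match the map output types):
import org.apache.hadoop.mapreduce.Job;

public class CombinerExample {
    public static void enableCombiner(Job job) {
        // Pre-aggregate map output locally before it is written and shuffled.
        job.setCombinerClass(MR2_Reducer.class);
        // Only run the combiner during the final merge if at least this many spill files exist.
        job.getConfiguration().setInt("mapreduce.map.combine.minspills", 3);
    }
}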
7.2 ReduceTaskRunnable
public void run() {
    try {
        // Abridged: setup mirrors MapTaskRunnable; the key step is running the ReduceTask.
        reduce.run(localConf, Job.this);
    } catch (Throwable t) {
        this.storedException = t;
    }
}
7.2.1 run
public void run(JobConf job, final TaskUmbilicalProtocol umbilical) {
    job.setBoolean(JobContext.SKIP_RECORDS, isSkipping());
    if (isMapOrReduce()) {
        // A reduce task moves through three phases: copy (shuffle), sort, reduce.
        copyPhase = getProgress().addPhase("copy");
        sortPhase = getProgress().addPhase("sort");
        reducePhase = getProgress().addPhase("reduce");
    }
    TaskReporter reporter = startReporter(umbilical);
    boolean useNewApi = job.getUseNewReducer();
    initialize(job, getJobID(), reporter, useNewApi);
    if (jobCleanup) {
        runJobCleanupTask(umbilical, reporter);
        return;
    }
    if (jobSetup) {
        runJobSetupTask(umbilical, reporter);
        return;
    }
    if (taskCleanup) {
        runTaskCleanupTask(umbilical, reporter);
        return;
    }
    codec = initCodec();
    RawKeyValueIterator rIter = null;
    ShuffleConsumerPlugin shuffleConsumerPlugin = null;
    Class combinerClass = conf.getCombinerClass();
    CombineOutputCollector combineCollector =
            (null != combinerClass) ?
                    new CombineOutputCollector(reduceCombineOutputCounter, reporter, conf) : null;
    // The copy phase: fetch and merge map outputs via the configured ShuffleConsumerPlugin.
    Class<? extends ShuffleConsumerPlugin> clazz =
            job.getClass(MRConfig.SHUFFLE_CONSUMER_PLUGIN, Shuffle.class, ShuffleConsumerPlugin.class);
    shuffleConsumerPlugin = ReflectionUtils.newInstance(clazz, job);
    LOG.info("Using ShuffleConsumerPlugin: " + shuffleConsumerPlugin);
    ShuffleConsumerPlugin.Context shuffleContext =
            new ShuffleConsumerPlugin.Context(getTaskID(), job, FileSystem.getLocal(job), umbilical,
                    super.lDirAlloc, reporter, codec,
                    combinerClass, combineCollector,
                    spilledRecordsCounter, reduceCombineInputCounter,
                    shuffledMapsCounter,
                    reduceShuffleBytes, failedShuffleCounter,
                    mergedMapOutputsCounter,
                    taskStatus, copyPhase, sortPhase, this,
                    mapOutputFile, localMapFiles);
    shuffleConsumerPlugin.init(shuffleContext);
    rIter = shuffleConsumerPlugin.run();
    mapOutputFilesOnDisk.clear();
    sortPhase.complete();
    setPhase(TaskStatus.Phase.REDUCE);
    statusUpdate(umbilical);
    Class keyClass = job.getMapOutputKeyClass();
    Class valueClass = job.getMapOutputValueClass();
    // The grouping comparator decides which keys are fed to a single reduce() call.
    RawComparator comparator = job.getOutputValueGroupingComparator();
    if (useNewApi) {
        runNewReducer(job, umbilical, reporter, rIter, comparator, keyClass, valueClass);
    } else {
        runOldReducer(job, umbilical, reporter, rIter, comparator, keyClass, valueClass);
    }
    shuffleConsumerPlugin.close();
    done(umbilical, reporter);
}
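The comparator obtained from job.getOutputValueGroupingComparator() controls which consecutive sorted keys are grouped into one reduce() call. A hedged sketch of plugging in a custom grouping comparator (the class name and grouping rule are illustrative):
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;

public class FirstFieldGroupingComparator extends WritableComparator {

    public FirstFieldGroupingComparator() {
        super(Text.class, true); // create key instances so the object-based compare() is used
    }

    @Override
    public int compare(WritableComparable a, WritableComparable b) {
        // Group keys that share the text before the first tab into one reduce() call.
        String ka = a.toString().split("\t")[0];
        String kb = b.toString().split("\t")[0];
        return ka.compareTo(kb);
    }
}
// In the Driver: job.setGroupingComparatorClass(FirstFieldGroupingComparator.class);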
7.2.2 runNewReducer
private <INKEY, INVALUE, OUTKEY, OUTVALUE> void runNewReducer(JobConf job,
        final TaskUmbilicalProtocol umbilical,
        final TaskReporter reporter,
        RawKeyValueIterator rIter,
        RawComparator<INKEY> comparator,
        Class<INKEY> keyClass,
        Class<INVALUE> valueClass
        ) throws IOException, InterruptedException, ClassNotFoundException {
    org.apache.hadoop.mapreduce.TaskAttemptContext taskContext =
            new org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl(job,
                    getTaskID(), reporter);
    org.apache.hadoop.mapreduce.Reducer<INKEY, INVALUE, OUTKEY, OUTVALUE> reducer =
            (org.apache.hadoop.mapreduce.Reducer<INKEY, INVALUE, OUTKEY, OUTVALUE>)
                    ReflectionUtils.newInstance(taskContext.getReducerClass(), job);
    org.apache.hadoop.mapreduce.RecordWriter<OUTKEY, OUTVALUE> trackedRW =
            new NewTrackingRecordWriter<OUTKEY, OUTVALUE>(this, taskContext);
    job.setBoolean("mapred.skip.on", isSkipping());
    job.setBoolean(JobContext.SKIP_RECORDS, isSkipping());
    org.apache.hadoop.mapreduce.Reducer.Context reducerContext =
            createReduceContext(reducer, job, getTaskID(),
                    rIter, reduceInputKeyCounter,
                    reduceInputValueCounter,
                    trackedRW,
                    committer,
                    reporter, comparator, keyClass,
                    valueClass);
    try {
        reducer.run(reducerContext);
    } finally {
        trackedRW.close(reducerContext);
    }
}
7.2.2.1 run
public void run(Context context) throws IOException, InterruptedException {
    setup(context);
    try {
        while (context.nextKey()) {
            reduce(context.getCurrentKey(), context.getValues(), context);
            // If a backup store was used for the values, reset it.
            Iterator<VALUEIN> iter = context.getValues().iterator();
            if (iter instanceof ReduceContext.ValueIterator) {
                ((ReduceContext.ValueIterator<VALUEIN>) iter).resetBackupStore();
            }
        }
    } finally {
        cleanup(context);
    }
}
public class MR2_Reducer extends Reducer<Text, MR2_Writable, Text, MR2_Writable> {
    @Override
    protected void reduce(Text key, Iterable<MR2_Writable> values, Context context)
            throws IOException, InterruptedException {
        long sum_upFlow = 0;
        long sum_downFlow = 0;
        for (MR2_Writable flowBean : values) {
            sum_upFlow += flowBean.getUpFlow();
            sum_downFlow += flowBean.getDownFlow();
        }
        MR2_Writable resultBean = new MR2_Writable(sum_upFlow, sum_downFlow);
        context.write(key, resultBean);
    }
}
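The MR2_Reducer above depends on an MR2_Writable bean that is not shown. A hedged sketch of what it would need to provide (the getters and constructor are inferred from the reducer; everything else is an assumption):
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import org.apache.hadoop.io.Writable;

public class MR2_Writable implements Writable {

    private long upFlow;
    private long downFlow;

    public MR2_Writable() { }                    // no-arg constructor required for reflection

    public MR2_Writable(long upFlow, long downFlow) {
        this.upFlow = upFlow;
        this.downFlow = downFlow;
    }

    public long getUpFlow()   { return upFlow; }
    public long getDownFlow() { return downFlow; }

    @Override
    public void write(DataOutput out) throws IOException {
        out.writeLong(upFlow);
        out.writeLong(downFlow);
    }

    @Override
    public void readFields(DataInput in) throws IOException {
        // Fields must be read in the same order they were written.
        upFlow = in.readLong();
        downFlow = in.readLong();
    }

    @Override
    public String toString() {
        return upFlow + "\t" + downFlow;
    }
}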
7.2.2.2 write
private RecordWriter<KEYOUT, VALUEOUT> output;

public TaskInputOutputContextImpl(Configuration conf, TaskAttemptID taskid,
        RecordWriter<KEYOUT, VALUEOUT> output,
        OutputCommitter committer,
        StatusReporter reporter) {
    super(conf, taskid, reporter);
    this.output = output;
    this.committer = committer;
}

public void write(KEYOUT key, VALUEOUT value) throws IOException, InterruptedException {
    output.write(key, value);
}
7.2.2.2.1 NewTrackingRecordWriter
NewTrackingRecordWriter(ReduceTask reduce, TaskAttemptContext taskContext) {
    this.outputRecordCounter = reduce.reduceOutputCounter;
    this.fileOutputByteCounter = reduce.fileOutputByteCounter;
    List<Statistics> matchedStats = null;
    if (reduce.outputFormat instanceof org.apache.hadoop.mapreduce.lib.output.FileOutputFormat) {
        matchedStats = getFsStatistics(org.apache.hadoop.mapreduce.lib.output.FileOutputFormat
                .getOutputPath(taskContext), taskContext.getConfiguration());
    }
    fsStats = matchedStats;
    long bytesOutPrev = getOutputBytes(fsStats);
    // The wrapped writer comes from the OutputFormat configured in the Driver.
    this.real = (org.apache.hadoop.mapreduce.RecordWriter<K, V>) reduce.outputFormat
            .getRecordWriter(taskContext);
    long bytesOutCurr = getOutputBytes(fsStats);
    fileOutputByteCounter.increment(bytesOutCurr - bytesOutPrev);
}
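reduce.outputFormat.getRecordWriter(taskContext) ultimately calls the OutputFormat configured in the Driver (SequenceFileOutputFormat in section 1). For comparison, a hedged sketch of a minimal custom FileOutputFormat that would be picked up the same way (the class name and output layout are illustrative):
import java.io.IOException;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class SimpleTextOutputFormat extends FileOutputFormat<Text, Text> {

    @Override
    public RecordWriter<Text, Text> getRecordWriter(TaskAttemptContext context)
            throws IOException, InterruptedException {
        // One output file per reduce task, placed in the task's work directory.
        Path file = getDefaultWorkFile(context, ".txt");
        final FSDataOutputStream out =
                file.getFileSystem(context.getConfiguration()).create(file);
        return new RecordWriter<Text, Text>() {
            @Override
            public void write(Text key, Text value) throws IOException {
                out.writeBytes(key.toString() + "\t" + value.toString() + "\n");
            }

            @Override
            public void close(TaskAttemptContext c) throws IOException {
                out.close();
            }
        };
    }
}
// In the Driver: job.setOutputFormatClass(SimpleTextOutputFormat.class);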