A MapTask can be broken down into five phases: Read, Map, Collect, Spill, and Merge (Combine). (A minimal Mapper sketch follows this list.)
- Read phase: the MapTask uses a RecordReader (obtained from the job's InputFormat) to parse individual key/value pairs out of the InputSplit.
- Map phase: each parsed key/value pair is handed to the user-written map() function, which produces a series of new key/value pairs.
- Collect phase: once map() has processed a record, it normally calls OutputCollector.collect() (context.write() in the new API) to emit the result. Internally, the emitted key/value pair is assigned a partition (via the Partitioner) and written into a circular in-memory buffer.
- Spill phase: when the circular buffer reaches its soft limit (by default 80% full), MapReduce writes the buffered data to local disk as a temporary spill file. Before the data is written it is sorted locally, and, if configured, it is also combined and/or compressed.
- Merge (Combine) phase: after all input has been processed, the MapTask merges all temporary spill files so that exactly one output data file is produced.
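For orientation, here is a minimal new-API Mapper (the class name and tokenization are ours, not part of the MapTask source analyzed below). It shows the same phases from the user's side: the framework's RecordReader delivers each key/value pair (Read), map() is invoked per pair (Map), and context.write() hands results to the collector (Collect).

```java
import java.io.IOException;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

// Hypothetical example mapper; not part of the MapTask source discussed below.
public class WordCountMapper
    extends Mapper<LongWritable, Text, Text, IntWritable> {

  private static final IntWritable ONE = new IntWritable(1);
  private final Text word = new Text();

  @Override
  protected void map(LongWritable offset, Text line, Context context)
      throws IOException, InterruptedException {
    // The Read phase delivered (offset, line); the Map phase runs this method.
    for (String token : line.toString().split("\\s+")) {
      if (token.isEmpty()) {
        continue;
      }
      word.set(token);
      // Collect phase: context.write() ends up in NewOutputCollector.write(),
      // which partitions the pair and copies it into the ring buffer.
      context.write(word, ONE);
    }
  }
}
```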
The main() method of org.apache.hadoop.mapred.YarnChild launches the task:
```java
final Task taskFinal = task;
childUGI.doAs(new PrivilegedExceptionAction<Object>() {
  @Override
  public Object run() throws Exception {
    // use job-specified working directory
    FileSystem.get(job).setWorkingDirectory(job.getWorkingDirectory());
    // run the task: this invokes run() on the concrete subclass of
    // org.apache.hadoop.mapred.Task (MapTask or ReduceTask)
    taskFinal.run(job, umbilical);
    return null;
  }
});
```
org.apache.hadoop.mapred.Task is an abstract class whose concrete subclasses are MapTask and ReduceTask.
The run() method of MapTask:
```java
public void run(final JobConf job, final TaskUmbilicalProtocol umbilical)
    throws IOException, ClassNotFoundException, InterruptedException {
  this.umbilical = umbilical;

  if (isMapTask()) {
    // If there are no reducers then there won't be any sort. Hence the map
    // phase will govern the entire attempt's progress.
    if (conf.getNumReduceTasks() == 0) {
      mapPhase = getProgress().addPhase("map", 1.0f);
    } else {
      // If there are reducers then the entire attempt's progress will be
      // split between the map phase (67%) and the sort phase (33%).
      mapPhase = getProgress().addPhase("map", 0.667f);
      sortPhase = getProgress().addPhase("sort", 0.333f);
    }
  }
  TaskReporter reporter = startReporter(umbilical);

  // whether to use the new Mapper API; controlled by mapred.mapper.new-api
  boolean useNewApi = job.getUseNewMapper();
  // sets up the task's temporary output directory (FileOutputFormat.setWorkOutputPath)
  // and the process-tree monitor (ResourceCalculatorProcessTree)
  initialize(job, getJobID(), reporter, useNewApi);

  // check if it is a cleanupJobTask
  if (jobCleanup) {
    runJobCleanupTask(umbilical, reporter);   // clean up the job
    return;
  }
  if (jobSetup) {
    runJobSetupTask(umbilical, reporter);     // create the task's temporary output directory
    return;
  }
  if (taskCleanup) {
    runTaskCleanupTask(umbilical, reporter);  // clean up the task
    return;
  }

  if (useNewApi) {
    runNewMapper(job, splitMetaInfo, umbilical, reporter);  // run the new-API mapper
  } else {
    runOldMapper(job, splitMetaInfo, umbilical, reporter);  // run the old-API mapper
  }
  done(umbilical, reporter);  // update counters, state, and the reporter
}
```
With the old mapred API the flag defaults to false, so runOldMapper() is used; setting mapred.mapper.new-api to true selects runNewMapper(). Jobs written against the new org.apache.hadoop.mapreduce.Job API have this flag set for them during job submission.
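A small sketch of flipping the flag by hand (the class name is ours; with the new Job API you normally do not need to do this):

```java
import org.apache.hadoop.mapred.JobConf;

public class NewApiSwitchExample {
  public static void main(String[] args) {
    JobConf conf = new JobConf();
    // switch this JobConf-based job to the new mapper API
    conf.setUseNewMapper(true);                     // sets mapred.mapper.new-api = true
    // equivalent property form:
    conf.setBoolean("mapred.mapper.new-api", true);
    System.out.println("useNewMapper = " + conf.getUseNewMapper());
  }
}
```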
Now the runNewMapper() method:
```java
private <INKEY,INVALUE,OUTKEY,OUTVALUE>
void runNewMapper(final JobConf job,
                  final TaskSplitIndex splitIndex,
                  final TaskUmbilicalProtocol umbilical,
                  TaskReporter reporter
                  ) throws IOException, ClassNotFoundException,
                           InterruptedException {
  // make a task context so we can get the classes
  org.apache.hadoop.mapreduce.TaskAttemptContext taskContext =
    new org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl(job,
                                                                getTaskID(),
                                                                reporter);
  // make a mapper
  org.apache.hadoop.mapreduce.Mapper<INKEY,INVALUE,OUTKEY,OUTVALUE> mapper =
    (org.apache.hadoop.mapreduce.Mapper<INKEY,INVALUE,OUTKEY,OUTVALUE>)
      ReflectionUtils.newInstance(taskContext.getMapperClass(), job);
  // make the input format
  org.apache.hadoop.mapreduce.InputFormat<INKEY,INVALUE> inputFormat =
    (org.apache.hadoop.mapreduce.InputFormat<INKEY,INVALUE>)
      ReflectionUtils.newInstance(taskContext.getInputFormatClass(), job);
  // rebuild the input split
  org.apache.hadoop.mapreduce.InputSplit split = null;
  split = getSplitDetails(new Path(splitIndex.getSplitLocation()),
      splitIndex.getStartOffset());
  LOG.info("Processing split: " + split);

  // the RecordReader used in the Read phase
  org.apache.hadoop.mapreduce.RecordReader<INKEY,INVALUE> input =
    new NewTrackingRecordReader<INKEY,INVALUE>
      (split, inputFormat, reporter, taskContext);

  job.setBoolean(JobContext.SKIP_RECORDS, isSkipping());
  org.apache.hadoop.mapreduce.RecordWriter output = null;

  // get an output object (RecordWriter). The important case is NewOutputCollector:
  // NewOutputCollector.write() calls MapOutputBuffer.collect() to gather the map
  // output, and the SpillThread later spills it to disk.
  if (job.getNumReduceTasks() == 0) {
    // no reducers: output is written directly to HDFS;
    // otherwise it goes to the local temporary spill directory
    output = new NewDirectOutputCollector(taskContext, job, umbilical, reporter);
  } else {
    output = new NewOutputCollector(taskContext, job, umbilical, reporter);
  }

  // build the MapContextImpl
  org.apache.hadoop.mapreduce.MapContext<INKEY, INVALUE, OUTKEY, OUTVALUE>
  mapContext =
    new MapContextImpl<INKEY, INVALUE, OUTKEY, OUTVALUE>(job, getTaskID(),
        input, output,
        committer,
        reporter, split);

  // wrap mapContext into the Context subclass declared inside Mapper
  org.apache.hadoop.mapreduce.Mapper<INKEY,INVALUE,OUTKEY,OUTVALUE>.Context
      mapperContext =
        new WrappedMapper<INKEY, INVALUE, OUTKEY, OUTVALUE>().getMapContext(
            mapContext);

  try {
    // Read phase: initialize the RecordReader
    input.initialize(split, mapperContext);
    // Map phase: Mapper.run() iterates over the input and calls the user-overridden
    // map() method; when map() emits output, NewOutputCollector.write() is called
    mapper.run(mapperContext);
    mapPhase.complete();
    setPhase(TaskStatus.Phase.SORT);
    statusUpdate(umbilical);
    input.close();
    input = null;
    // Merge phase: close() calls flush(), and flush() calls mergeParts()
    output.close(mapperContext);
    output = null;
  } finally {
    closeQuietly(input);
    closeQuietly(output, mapperContext);
  }
}
```
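The mapper.run(mapperContext) call above drives the Map phase. For reference, the loop inside org.apache.hadoop.mapreduce.Mapper.run() looks essentially like the following simplified sketch (see the Hadoop source for the authoritative version): setup(), then map() once per input record, then cleanup().

```java
// Simplified sketch of org.apache.hadoop.mapreduce.Mapper.run().
public void run(Context context) throws IOException, InterruptedException {
  setup(context);
  try {
    while (context.nextKeyValue()) {
      // each iteration is one record delivered by the RecordReader (Read phase)
      map(context.getCurrentKey(), context.getCurrentValue(), context);
    }
  } finally {
    cleanup(context);
  }
}
```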
Next, let's look at NewOutputCollector:
```java
private class NewOutputCollector<K,V>
  extends org.apache.hadoop.mapreduce.RecordWriter<K,V> {
  private final MapOutputCollector<K,V> collector;   // the MapOutputCollector
  private final org.apache.hadoop.mapreduce.Partitioner<K,V> partitioner;
  private final int partitions;

  @SuppressWarnings("unchecked")
  NewOutputCollector(org.apache.hadoop.mapreduce.JobContext jobContext,
                     JobConf job,
                     TaskUmbilicalProtocol umbilical,
                     TaskReporter reporter
                     ) throws IOException, ClassNotFoundException {
    collector = createSortingCollector(job, reporter);
    partitions = jobContext.getNumReduceTasks();
    // obtain the partitioner
    if (partitions > 1) {
      partitioner = (org.apache.hadoop.mapreduce.Partitioner<K,V>)
        ReflectionUtils.newInstance(jobContext.getPartitionerClass(), job);
    } else {
      partitioner = new org.apache.hadoop.mapreduce.Partitioner<K,V>() {
        @Override
        public int getPartition(K key, V value, int numPartitions) {
          return partitions - 1;
        }
      };
    }
  }

  @Override
  public void write(K key, V value) throws IOException, InterruptedException {
    // Collect phase: delegates to MapTask.MapOutputBuffer.collect(),
    // the MapOutputCollector implementation
    collector.collect(key, value,
                      partitioner.getPartition(key, value, partitions));
  }

  @Override
  public void close(TaskAttemptContext context
                    ) throws IOException, InterruptedException {
    try {
      collector.flush();
    } catch (ClassNotFoundException cnf) {
      throw new IOException("can't find class ", cnf);
    }
    collector.close();
  }
}
```
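When there is more than one reduce task, getPartitionerClass() returns HashPartitioner unless the job configures its own. A custom partitioner is simply another Partitioner subclass registered with job.setPartitionerClass(...); the following hypothetical example (names are ours) buckets keys by their first character:

```java
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Partitioner;

// Hypothetical custom partitioner, registered via job.setPartitionerClass(...).
public class FirstCharPartitioner extends Partitioner<Text, IntWritable> {
  @Override
  public int getPartition(Text key, IntWritable value, int numPartitions) {
    if (key.getLength() == 0) {
      return 0;
    }
    // mask off the sign bit so the result is non-negative, then bucket by modulo
    return (key.charAt(0) & Integer.MAX_VALUE) % numPartitions;
  }
}
```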
Next, the collect() method of MapTask.MapOutputBuffer:
```java
/**
 * Serialize the key, value to intermediate storage.
 * When this method returns, kvindex must refer to sufficient unused
 * storage to store one METADATA.
 */
public synchronized void collect(K key, V value, final int partition
                                 ) throws IOException {
  reporter.progress();
  if (key.getClass() != keyClass) {
    throw new IOException("Type mismatch in key from map: expected "
                          + keyClass.getName() + ", received "
                          + key.getClass().getName());
  }
  if (value.getClass() != valClass) {
    throw new IOException("Type mismatch in value from map: expected "
                          + valClass.getName() + ", received "
                          + value.getClass().getName());
  }
  if (partition < 0 || partition >= partitions) {
    throw new IOException("Illegal partition for " + key + " (" +
        partition + ")");
  }
  checkSpillException();
  bufferRemaining -= METASIZE;
  if (bufferRemaining <= 0) {
    // start spill if the thread is not running and the soft limit has been
    // reached
    spillLock.lock();
    try {
      do {
        if (!spillInProgress) {
          final int kvbidx = 4 * kvindex;
          final int kvbend = 4 * kvend;
          // serialized, unspilled bytes always lie between kvindex and
          // bufindex, crossing the equator. Note that any void space
          // created by a reset must be included in "used" bytes
          final int bUsed = distanceTo(kvbidx, bufindex);
          final boolean bufsoftlimit = bUsed >= softLimit;
          if ((kvbend + METASIZE) % kvbuffer.length !=
              equator - (equator % METASIZE)) {
            // spill finished, reclaim space
            resetSpill();
            bufferRemaining = Math.min(
                distanceTo(bufindex, kvbidx) - 2 * METASIZE,
                softLimit - bUsed) - METASIZE;
            continue;
          } else if (bufsoftlimit && kvindex != kvend) {
            // spill records, if any collected; check latter, as it may
            // be possible for metadata alignment to hit spill pcnt
            startSpill();
            final int avgRec = (int)
              (mapOutputByteCounter.getCounter() /
               mapOutputRecordCounter.getCounter());
            // leave at least half the split buffer for serialization data
            // ensure that kvindex >= bufindex
            final int distkvi = distanceTo(bufindex, kvbidx);
            final int newPos = (bufindex +
              Math.max(2 * METASIZE - 1,
                       Math.min(distkvi / 2,
                                distkvi / (METASIZE + avgRec) * METASIZE)))
              % kvbuffer.length;
            setEquator(newPos);
            bufmark = bufindex = newPos;
            final int serBound = 4 * kvend;
            // bytes remaining before the lock must be held and limits
            // checked is the minimum of three arcs: the metadata space, the
            // serialization space, and the soft limit
            bufferRemaining = Math.min(
                // metadata max
                distanceTo(bufend, newPos),
                Math.min(
                  // serialization max
                  distanceTo(newPos, serBound),
                  // soft limit
                  softLimit)) - 2 * METASIZE;
          }
        }
      } while (false);
    } finally {
      spillLock.unlock();
    }
  }

  try {
    // serialize key bytes into buffer
    int keystart = bufindex;
    keySerializer.serialize(key);
    if (bufindex < keystart) {
      // wrapped the key; must make contiguous
      bb.shiftBufferedKey();
      keystart = 0;
    }
    // serialize value bytes into buffer
    final int valstart = bufindex;
    valSerializer.serialize(value);

    // It's possible for records to have zero length, i.e. the serializer
    // will perform no writes. To ensure that the boundary conditions are
    // checked and that the kvindex invariant is maintained, perform a
    // zero-length write into the buffer. The logic monitoring this could be
    // moved into collect, but this is cleaner and inexpensive. For now, it
    // is acceptable.
    bb.write(b0, 0, 0);

    // the record must be marked after the preceding write, as the metadata
    // for this record are not yet written
    int valend = bb.markRecord();

    mapOutputRecordCounter.increment(1);
    mapOutputByteCounter.increment(
        distanceTo(keystart, valend, bufvoid));

    // write accounting info
    kvmeta.put(kvindex + PARTITION, partition);
    kvmeta.put(kvindex + KEYSTART, keystart);
    kvmeta.put(kvindex + VALSTART, valstart);
    kvmeta.put(kvindex + VALLEN, distanceTo(valstart, valend));
    // advance kvindex
    kvindex = (kvindex - NMETA + kvmeta.capacity()) % kvmeta.capacity();
  } catch (MapBufferTooSmallException e) {
    LOG.info("Record too large for in-memory buffer: " + e.getMessage());
    spillSingleRecord(key, value, partition);
    mapOutputRecordCounter.increment(1);
    return;
  }
}
```
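The softLimit checked above is derived from the size of the sort buffer and the configured spill percentage. As an illustrative sketch (the values below are examples, not recommendations), the relevant knobs are mapreduce.task.io.sort.mb and mapreduce.map.sort.spill.percent:

```java
import org.apache.hadoop.conf.Configuration;

public class SortBufferTuningExample {
  public static void main(String[] args) {
    Configuration conf = new Configuration();
    // size of the in-memory ring buffer (kvbuffer), in MB; default is 100
    conf.setInt("mapreduce.task.io.sort.mb", 200);
    // fraction of the buffer that triggers a spill (the soft limit); default is 0.80
    conf.setFloat("mapreduce.map.sort.spill.percent", 0.80f);
    System.out.println("sort.mb = " + conf.get("mapreduce.task.io.sort.mb")
        + ", spill.percent = " + conf.get("mapreduce.map.sort.spill.percent"));
  }
}
```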
When the static nested class MapTask.MapOutputBuffer is initialized, it creates and starts a SpillThread; the run() method of SpillThread in turn calls sortAndSpill().
```java
protected class SpillThread extends Thread {

  @Override
  public void run() {
    spillLock.lock();
    spillThreadRunning = true;
    try {
      while (true) {
        spillDone.signal();
        while (!spillInProgress) {
          spillReady.await();
        }
        try {
          spillLock.unlock();
          sortAndSpill();   // the Spill phase happens here
        } catch (Throwable t) {
          sortSpillException = t;
        } finally {
          spillLock.lock();
          if (bufend < bufstart) {
            bufvoid = kvbuffer.length;
          }
          kvstart = kvend;
          bufstart = bufend;
          spillInProgress = false;
        }
      }
    } catch (InterruptedException e) {
      Thread.currentThread().interrupt();
    } finally {
      spillLock.unlock();
      spillThreadRunning = false;
    }
  }
}
```
Next, the sortAndSpill() method:
```java
private void sortAndSpill() throws IOException, ClassNotFoundException,
                                   InterruptedException {
  // approximate the length of the output file to be the length of the
  // buffer + header lengths for the partitions
  final long size = distanceTo(bufstart, bufend, bufvoid) +
              partitions * APPROX_HEADER_LENGTH;
  FSDataOutputStream out = null;
  try {
    // create spill file
    final SpillRecord spillRec = new SpillRecord(partitions);
    final Path filename =
        mapOutputFile.getSpillFileForWrite(numSpills, size);
    out = rfs.create(filename);

    final int mstart = kvend / NMETA;
    final int mend = 1 + // kvend is a valid record
      (kvstart >= kvend
      ? kvstart
      : kvmeta.capacity() + kvstart) / NMETA;
    sorter.sort(MapOutputBuffer.this, mstart, mend, reporter);
    int spindex = mstart;
    final IndexRecord rec = new IndexRecord();
    final InMemValBytes value = new InMemValBytes();
    for (int i = 0; i < partitions; ++i) {
      IFile.Writer<K, V> writer = null;
      try {
        long segmentStart = out.getPos();
        FSDataOutputStream partitionOut = CryptoUtils.wrapIfNecessary(job, out);
        writer = new Writer<K, V>(job, partitionOut, keyClass, valClass, codec,
                                  spilledRecordsCounter);
        // combinerRunner was created by CombinerRunner.create(), which checks
        // whether a combiner was configured (via JobContext.getCombinerClass())
        if (combinerRunner == null) {
          // spill directly
          DataInputBuffer key = new DataInputBuffer();
          while (spindex < mend &&
              kvmeta.get(offsetFor(spindex % maxRec) + PARTITION) == i) {
            final int kvoff = offsetFor(spindex % maxRec);
            int keystart = kvmeta.get(kvoff + KEYSTART);
            int valstart = kvmeta.get(kvoff + VALSTART);
            key.reset(kvbuffer, keystart, valstart - keystart);
            getVBytesForOffset(kvoff, value);
            writer.append(key, value);
            ++spindex;
          }
        } else {
          int spstart = spindex;
          while (spindex < mend &&
              kvmeta.get(offsetFor(spindex % maxRec)
                         + PARTITION) == i) {
            ++spindex;
          }
          // Note: we would like to avoid the combiner if we've fewer
          // than some threshold of records for a partition
          if (spstart != spindex) {
            combineCollector.setWriter(writer);   // output goes through IFile.Writer
            RawKeyValueIterator kvIter =
              new MRResultIterator(spstart, spindex);
            // run the combine operation (NewCombinerRunner.combine for the new API)
            combinerRunner.combine(kvIter, combineCollector);
          }
        }

        // close the writer
        writer.close();

        // record offsets
        rec.startOffset = segmentStart;
        rec.rawLength = writer.getRawLength() + CryptoUtils.cryptoPadding(job);
        rec.partLength = writer.getCompressedLength() + CryptoUtils.cryptoPadding(job);
        spillRec.putIndex(rec, i);

        writer = null;
      } finally {
        if (null != writer) writer.close();
      }
    }

    if (totalIndexCacheMemory >= indexCacheMemoryLimit) {
      // create spill index file
      Path indexFilename =
          mapOutputFile.getSpillIndexFileForWrite(numSpills, partitions
              * MAP_OUTPUT_INDEX_RECORD_LENGTH);
      spillRec.writeToFile(indexFilename, job);
    } else {
      indexCacheList.add(spillRec);
      totalIndexCacheMemory +=
        spillRec.size() * MAP_OUTPUT_INDEX_RECORD_LENGTH;
    }
    LOG.info("Finished spill " + numSpills);
    ++numSpills;
  } finally {
    if (out != null) out.close();
  }
}
```
Next, NewCombinerRunner and its combine() method:
```java
protected static class NewCombinerRunner<K, V> extends CombinerRunner<K, V> {
  private final Class<? extends org.apache.hadoop.mapreduce.Reducer<K,V,K,V>>
      reducerClass;
  private final org.apache.hadoop.mapreduce.TaskAttemptID taskId;
  private final RawComparator<K> comparator;
  private final Class<K> keyClass;
  private final Class<V> valueClass;
  private final org.apache.hadoop.mapreduce.OutputCommitter committer;

  @SuppressWarnings("unchecked")
  NewCombinerRunner(Class reducerClass,
                    JobConf job,
                    org.apache.hadoop.mapreduce.TaskAttemptID taskId,
                    org.apache.hadoop.mapreduce.TaskAttemptContext context,
                    Counters.Counter inputCounter,
                    TaskReporter reporter,
                    org.apache.hadoop.mapreduce.OutputCommitter committer) {
    super(inputCounter, job, reporter);
    this.reducerClass = reducerClass;
    this.taskId = taskId;
    keyClass = (Class<K>) context.getMapOutputKeyClass();
    valueClass = (Class<V>) context.getMapOutputValueClass();
    comparator = (RawComparator<K>) context.getCombinerKeyGroupingComparator();
    this.committer = committer;
  }

  private static class OutputConverter<K,V>
          extends org.apache.hadoop.mapreduce.RecordWriter<K,V> {
    OutputCollector<K,V> output;

    OutputConverter(OutputCollector<K,V> output) {
      this.output = output;
    }

    @Override
    public void close(org.apache.hadoop.mapreduce.TaskAttemptContext context) {
    }

    @Override
    public void write(K key, V value
                      ) throws IOException, InterruptedException {
      output.collect(key, value);
    }
  }

  @SuppressWarnings("unchecked")
  @Override
  public void combine(RawKeyValueIterator iterator,
                      OutputCollector<K,V> collector
                      ) throws IOException, InterruptedException,
                               ClassNotFoundException {
    // make a reducer
    org.apache.hadoop.mapreduce.Reducer<K,V,K,V> reducer =
        (org.apache.hadoop.mapreduce.Reducer<K,V,K,V>)
          ReflectionUtils.newInstance(reducerClass, job);
    org.apache.hadoop.mapreduce.Reducer.Context reducerContext =
        createReduceContext(reducer, job, taskId,
                            iterator, null, inputCounter,
                            new OutputConverter(collector),
                            committer,
                            reporter, comparator, keyClass,
                            valueClass);
    reducer.run(reducerContext);
  }
}
```
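From the user's side, the combiner that NewCombinerRunner wraps is just a Reducer registered on the job. A minimal sketch (the class name is ours, continuing the word-count example above):

```java
import java.io.IOException;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

// Hypothetical combiner: sums the partial counts emitted by the mapper.
public class SumCombiner extends Reducer<Text, IntWritable, Text, IntWritable> {
  private final IntWritable result = new IntWritable();

  @Override
  protected void reduce(Text key, Iterable<IntWritable> values, Context context)
      throws IOException, InterruptedException {
    int sum = 0;
    for (IntWritable v : values) {
      sum += v.get();
    }
    result.set(sum);
    context.write(key, result);
  }
}

// Registration in the job driver; this is what JobContext.getCombinerClass() returns:
//   job.setCombinerClass(SumCombiner.class);
```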
Finally, the MapOutputBuffer.mergeParts() method:
```java
private void mergeParts() throws IOException, InterruptedException,
                                 ClassNotFoundException {
  // get the approximate size of the final output/index files
  long finalOutFileSize = 0;
  long finalIndexFileSize = 0;
  final Path[] filename = new Path[numSpills];
  final TaskAttemptID mapId = getTaskID();

  for(int i = 0; i < numSpills; i++) {
    filename[i] = mapOutputFile.getSpillFile(i);
    finalOutFileSize += rfs.getFileStatus(filename[i]).getLen();
  }
  if (numSpills == 1) { // the spill is the final output
    sameVolRename(filename[0],
        mapOutputFile.getOutputFileForWriteInVolume(filename[0]));
    if (indexCacheList.size() == 0) {
      sameVolRename(mapOutputFile.getSpillIndexFile(0),
        mapOutputFile.getOutputIndexFileForWriteInVolume(filename[0]));
    } else {
      indexCacheList.get(0).writeToFile(
        mapOutputFile.getOutputIndexFileForWriteInVolume(filename[0]), job);
    }
    sortPhase.complete();
    return;
  }

  // read in paged indices
  for (int i = indexCacheList.size(); i < numSpills; ++i) {
    Path indexFileName = mapOutputFile.getSpillIndexFile(i);
    indexCacheList.add(new SpillRecord(indexFileName, job));
  }

  // make correction in the length to include the sequence file header
  // lengths for each partition
  finalOutFileSize += partitions * APPROX_HEADER_LENGTH;
  finalIndexFileSize = partitions * MAP_OUTPUT_INDEX_RECORD_LENGTH;
  Path finalOutputFile =
      mapOutputFile.getOutputFileForWrite(finalOutFileSize);
  Path finalIndexFile =
      mapOutputFile.getOutputIndexFileForWrite(finalIndexFileSize);

  // The output stream for the final single output file
  FSDataOutputStream finalOut = rfs.create(finalOutputFile, true, 4096);

  if (numSpills == 0) {
    // create dummy files
    IndexRecord rec = new IndexRecord();
    SpillRecord sr = new SpillRecord(partitions);
    try {
      for (int i = 0; i < partitions; i++) {
        long segmentStart = finalOut.getPos();
        FSDataOutputStream finalPartitionOut = CryptoUtils.wrapIfNecessary(job, finalOut);
        Writer<K, V> writer =
          new Writer<K, V>(job, finalPartitionOut, keyClass, valClass, codec, null);
        writer.close();
        rec.startOffset = segmentStart;
        rec.rawLength = writer.getRawLength() + CryptoUtils.cryptoPadding(job);
        rec.partLength = writer.getCompressedLength() + CryptoUtils.cryptoPadding(job);
        sr.putIndex(rec, i);
      }
      sr.writeToFile(finalIndexFile, job);
    } finally {
      finalOut.close();
    }
    sortPhase.complete();
    return;
  }
  {
    sortPhase.addPhases(partitions); // Divide sort phase into sub-phases

    IndexRecord rec = new IndexRecord();
    final SpillRecord spillRec = new SpillRecord(partitions);
    for (int parts = 0; parts < partitions; parts++) {
      // create the segments to be merged
      List<Segment<K,V>> segmentList =
        new ArrayList<Segment<K, V>>(numSpills);
      for(int i = 0; i < numSpills; i++) {
        IndexRecord indexRecord = indexCacheList.get(i).getIndex(parts);

        Segment<K,V> s =
          new Segment<K,V>(job, rfs, filename[i], indexRecord.startOffset,
                           indexRecord.partLength, codec, true);
        segmentList.add(i, s);

        if (LOG.isDebugEnabled()) {
          LOG.debug("MapId=" + mapId + " Reducer=" + parts +
              "Spill =" + i + "(" + indexRecord.startOffset + "," +
              indexRecord.rawLength + ", " + indexRecord.partLength + ")");
        }
      }

      // merge factor, controlled by the parameter mapreduce.task.io.sort.factor
      int mergeFactor = job.getInt(JobContext.IO_SORT_FACTOR, 100);
      // sort the segments only if there are intermediate merges
      boolean sortSegments = segmentList.size() > mergeFactor;
      // merge
      @SuppressWarnings("unchecked")
      RawKeyValueIterator kvIter = Merger.merge(job, rfs,
                     keyClass, valClass, codec,
                     segmentList, mergeFactor,
                     new Path(mapId.toString()),
                     job.getOutputKeyComparator(), reporter, sortSegments,
                     null, spilledRecordsCounter, sortPhase.phase(),
                     TaskType.MAP);

      // write merged output to disk
      long segmentStart = finalOut.getPos();
      FSDataOutputStream finalPartitionOut = CryptoUtils.wrapIfNecessary(job, finalOut);
      Writer<K, V> writer =
          new Writer<K, V>(job, finalPartitionOut, keyClass, valClass, codec,
                           spilledRecordsCounter);
      if (combinerRunner == null || numSpills < minSpillsForCombine) {
        Merger.writeFile(kvIter, writer, reporter, job);
      } else {
        combineCollector.setWriter(writer);
        combinerRunner.combine(kvIter, combineCollector);
      }

      // close
      writer.close();

      sortPhase.startNextPhase();

      // record offsets
      rec.startOffset = segmentStart;
      rec.rawLength = writer.getRawLength() + CryptoUtils.cryptoPadding(job);
      rec.partLength = writer.getCompressedLength() + CryptoUtils.cryptoPadding(job);
      spillRec.putIndex(rec, parts);
    }
    spillRec.writeToFile(finalIndexFile, job);
    finalOut.close();
    for(int i = 0; i < numSpills; i++) {
      rfs.delete(filename[i], true);
    }
  }
}
```
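Two configuration knobs show up in this final merge: mapreduce.task.io.sort.factor (the mergeFactor read above; the job defaults typically set it to 10 even though the code's fallback literal is 100) and mapreduce.map.combine.minspills (minSpillsForCombine, typically 3), below which the combiner is skipped during the merge. A hedged sketch of setting them (values are examples only):

```java
import org.apache.hadoop.conf.Configuration;

public class MergeTuningExample {
  public static void main(String[] args) {
    Configuration conf = new Configuration();
    // how many spill segments are merged at once (mergeFactor in mergeParts)
    conf.setInt("mapreduce.task.io.sort.factor", 20);
    // run the combiner during the final merge only if at least this many spills exist
    conf.setInt("mapreduce.map.combine.minspills", 3);
    System.out.println("io.sort.factor = " + conf.get("mapreduce.task.io.sort.factor")
        + ", combine.minspills = " + conf.get("mapreduce.map.combine.minspills"));
  }
}
```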