ReduceTask主要包含Shuffle、Merge、Sort、Reduce、Write五个阶段。
Shuffle阶段:也称为Copy阶段。ReduceTask从各个MapTask上远程拷贝一片数据,并针对某一片数据,如果其大小超过一定阈值,则写到磁盘,否则直接放到内存中。
Merge阶段:在远程拷贝数据的同时,ReduceTask启动了两个后台线程对内存和磁盘上的文件进行合并,以防止内存使用过多或磁盘上文件过多。
Sort阶段:按照MapReduce语义,用户编写的Reduce函数的输入数据是按key进行聚集的一组数据。为了将key相同的数据聚在一起,Hadoop采用了基于排序的策略。由于各个MapTask已经对自己的处理结果进行了局部排序,因此,ReduceTask只需对所有数据进行一次归并排序即可。
Reduce阶段:该阶段,ReduceTask将每组数据依次交给用户编写的reduce()函数处理。
Write阶段:reduce()函数将计算结果写到HDFS上。
org.apache.hadoop.mapred.YarnChild的main方法
final Task taskFinal = task; childUGI.doAs(new PrivilegedExceptionAction<Object>() { @Override public Object run() throws Exception { // use job-specified working directory FileSystem.get(job).setWorkingDirectory(job.getWorkingDirectory()); taskFinal.run(job, umbilical); // run the task//这里调用的是org.apache.hadoop.mapred.Task接口的run方法 return null; } } |
org.apache.hadoop.mapred.Task接口的实现类包括MapTask和ReduceTask。
ReduceTask的run方法分析
public void run(JobConf job, final TaskUmbilicalProtocol umbilical) throws IOException, InterruptedException, ClassNotFoundException { job.setBoolean(JobContext.SKIP_RECORDS, isSkipping()); if (isMapOrReduce()) { copyPhase = getProgress().addPhase("copy"); sortPhase = getProgress().addPhase("sort"); reducePhase = getProgress().addPhase("reduce"); } // start thread that will handle communication with parent TaskReporter reporter = startReporter(umbilical); boolean useNewApi = job.getUseNewReducer(); initialize(job, getJobID(), reporter, useNewApi); // check if it is a cleanupJobTask if (jobCleanup) { runJobCleanupTask(umbilical, reporter); return; } if (jobSetup) { runJobSetupTask(umbilical, reporter); return; } if (taskCleanup) { runTaskCleanupTask(umbilical, reporter); return; } // Initialize the codec codec = initCodec(); RawKeyValueIterator rIter = null; ShuffleConsumerPlugin shuffleConsumerPlugin = null; Class combinerClass = conf.getCombinerClass(); CombineOutputCollector combineCollector = (null != combinerClass) ? new CombineOutputCollector(reduceCombineOutputCounter, reporter, conf) : null; Class<? 
extends ShuffleConsumerPlugin> clazz = job.getClass(MRConfig.SHUFFLE_CONSUMER_PLUGIN, Shuffle.class, ShuffleConsumerPlugin.class); //接口ShuffleConsumerPlugin的实现类Shuffle,可自定义实现接口来实现shuffle功能 shuffleConsumerPlugin = ReflectionUtils.newInstance(clazz, job); LOG.info("Using ShuffleConsumerPlugin: " + shuffleConsumerPlugin); ShuffleConsumerPlugin.Context shuffleContext = new ShuffleConsumerPlugin.Context(getTaskID(), job, FileSystem.getLocal(job), umbilical, super.lDirAlloc, reporter, codec, combinerClass, combineCollector, spilledRecordsCounter, reduceCombineInputCounter, shuffledMapsCounter, reduceShuffleBytes, failedShuffleCounter, mergedMapOutputsCounter, taskStatus, copyPhase, sortPhase, this, mapOutputFile, localMapFiles); shuffleConsumerPlugin.init(shuffleContext); rIter = shuffleConsumerPlugin.run();//Shuffle.run()里多个线程Fetcher.run()里copyFromHost获取数据 // free up the data structures mapOutputFilesOnDisk.clear(); sortPhase.complete(); // sort is complete setPhase(TaskStatus.Phase.REDUCE); statusUpdate(umbilical); Class keyClass = job.getMapOutputKeyClass(); Class valueClass = job.getMapOutputValueClass(); RawComparator comparator = job.getOutputValueGroupingComparator(); if (useNewApi) { runNewReducer(job, umbilical, reporter, rIter, comparator, keyClass, valueClass); } else { runOldReducer(job, umbilical, reporter, rIter, comparator, keyClass, valueClass); } shuffleConsumerPlugin.close(); done(umbilical, reporter); } |
runNewReducer方法分析
private <INKEY,INVALUE,OUTKEY,OUTVALUE> void runNewReducer(JobConf job, final TaskUmbilicalProtocol umbilical, final TaskReporter reporter, RawKeyValueIterator rIter, RawComparator<INKEY> comparator, Class<INKEY> keyClass, Class<INVALUE> valueClass ) throws IOException,InterruptedException, ClassNotFoundException { // wrap value iterator to report progress. final RawKeyValueIterator rawIter = rIter; rIter = new RawKeyValueIterator() { public void close() throws IOException { rawIter.close(); } public DataInputBuffer getKey() throws IOException { return rawIter.getKey(); } public Progress getProgress() { return rawIter.getProgress(); } public DataInputBuffer getValue() throws IOException { return rawIter.getValue(); } public boolean next() throws IOException { boolean ret = rawIter.next(); reporter.setProgress(rawIter.getProgress().getProgress()); return ret; } }; // make a task context so we can get the classes org.apache.hadoop.mapreduce.TaskAttemptContext taskContext = new org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl(job, getTaskID(), reporter);//TaskAttemptContextImpl主要包含的是这个MapReduce任务的状态和过程信息 // make a reducer org.apache.hadoop.mapreduce.Reducer<INKEY,INVALUE,OUTKEY,OUTVALUE> reducer = (org.apache.hadoop.mapreduce.Reducer<INKEY,INVALUE,OUTKEY,OUTVALUE>) ReflectionUtils.newInstance(taskContext.getReducerClass(), job);//反射获取用户自定义的Reducer类 org.apache.hadoop.mapreduce.RecordWriter<OUTKEY,OUTVALUE> trackedRW = new NewTrackingRecordWriter<OUTKEY, OUTVALUE>(this, taskContext);//WrappedReducer定义RecordWriter,最后会调用RecordWriter接口的write方法写数据 job.setBoolean("mapred.skip.on", isSkipping()); job.setBoolean(JobContext.SKIP_RECORDS, isSkipping()); org.apache.hadoop.mapreduce.Reducer.Context reducerContext = createReduceContext(reducer, job, getTaskID(), rIter, reduceInputKeyCounter, reduceInputValueCounter, trackedRW, committer, reporter, comparator, keyClass, valueClass);//返回的是Task.createReduceContext方法。并调用了WrappedReducer.getReducerContext方法返回值 
//WrappedReducer.getReducerContext方法,WrappedReducer内部类Context是Reducer内部抽象类Context的子类,WrappedReducer.Context.wirte调用接口ReduceContext的实现类ReduceContextImpl的父类TaskInputOutputContextImpl的方法write,也就到了抽象类RecordWriter的write方法。关于抽象类的各种输出子类我们再变成模型OutputFormat部分在详述 try { reducer.run(reducerContext);//执行reduce任务,在Reduce执行时一般如果进行输出都是使用context.write进行输出,也就是上面注释的过程了。 } finally { trackedRW.close(reducerContext); } } |
Shuffle.run()方法
public RawKeyValueIterator run() throws IOException, InterruptedException { // Scale the maximum events we fetch per RPC call to mitigate OOM issues // on the ApplicationMaster when a thundering herd of reducers fetch events // TODO: This should not be necessary after HADOOP-8942 int eventsPerReducer = Math.max(MIN_EVENTS_TO_FETCH, MAX_RPC_OUTSTANDING_EVENTS / jobConf.getNumReduceTasks()); int maxEventsToFetch = Math.min(MAX_EVENTS_TO_FETCH, eventsPerReducer); // Start the map-completion events fetcher thread final EventFetcher<K,V> eventFetcher = new EventFetcher<K,V>(reduceId, umbilical, scheduler, this, maxEventsToFetch); eventFetcher.start(); // Start the map-output fetcher threads boolean isLocal = localMapFiles != null; final int numFetchers = isLocal ? 1 : jobConf.getInt(MRJobConfig.SHUFFLE_PARALLEL_COPIES, 5); Fetcher<K,V>[] fetchers = new Fetcher[numFetchers]; if (isLocal) { fetchers[0] = new LocalFetcher<K, V>(jobConf, reduceId, scheduler, merger, reporter, metrics, this, reduceTask.getShuffleSecret(), localMapFiles); fetchers[0].start(); } else { for (int i=0; i < numFetchers; ++i) { fetchers[i] = new Fetcher<K,V>(jobConf, reduceId, scheduler, merger, reporter, metrics, this, reduceTask.getShuffleSecret()); fetchers[i].start();//Fetcher线程,从mapoutput获取数据 } } // Wait for shuffle to complete successfully while (!scheduler.waitUntilDone(PROGRESS_FREQUENCY)) { reporter.progress(); synchronized (this) { if (throwable != null) { throw new ShuffleError("error in shuffle in " + throwingThreadName, throwable); } } } // Stop the event-fetcher thread eventFetcher.shutDown(); // Stop the map-output fetcher threads for (Fetcher<K,V> fetcher : fetchers) { fetcher.shutDown(); } // stop the scheduler scheduler.close(); copyPhase.complete(); // copy is already complete taskStatus.setPhase(TaskStatus.Phase.SORT); reduceTask.statusUpdate(umbilical); // Finish the on-going merges... 
RawKeyValueIterator kvIter = null; try { kvIter = merger.close();//调用了MergeManagerImpl.close() } catch (Throwable e) { throw new ShuffleError("Error while doing final merge " , e); } // Sanity check synchronized (this) { if (throwable != null) { throw new ShuffleError("error in shuffle in " + throwingThreadName, throwable); } } return kvIter; } |
线程Fetcher.run()
public void run() { try { while (!stopped && !Thread.currentThread().isInterrupted()) { MapHost host = null; try { // If merge is on, block merger.waitForResource();//这里控制有一个线程进行merge操作的时候,阻塞。调用了MergeThread.waitForMerge()方法 // Get a host to shuffle from host = scheduler.getHost(); metrics.threadBusy(); // Shuffle copyFromHost(host);//调用copyFromHost } finally { if (host != null) { scheduler.freeHost(host); metrics.threadFree(); } } } } catch (InterruptedException ie) { return; } catch (Throwable t) { exceptionReporter.reportException(t); } } |
线程Fetcher.copyFromHost()
protected void copyFromHost(MapHost host) throws IOException { // reset retryStartTime for a new host retryStartTime = 0; // Get completed maps on 'host' List<TaskAttemptID> maps = scheduler.getMapsForHost(host); // Sanity check to catch hosts with only 'OBSOLETE' maps, // especially at the tail of large jobs if (maps.size() == 0) { return; } if(LOG.isDebugEnabled()) { LOG.debug("Fetcher " + id + " going to fetch from " + host + " for: " + maps); } // List of maps to be fetched yet Set<TaskAttemptID> remaining = new HashSet<TaskAttemptID>(maps); // Construct the url and connect URL url = getMapOutputURL(host, maps); DataInputStream input = openShuffleUrl(host, remaining, url); if (input == null) { return; } try { // Loop through available map-outputs and fetch them // On any error, faildTasks is not null and we exit // after putting back the remaining maps to the // yet_to_be_fetched list and marking the failed tasks. TaskAttemptID[] failedTasks = null; while (!remaining.isEmpty() && failedTasks == null) { try { failedTasks = copyMapOutput(host, input, remaining, fetchRetryEnabled);//调用copyMapOutput方法 } catch (IOException e) { // // Setup connection again if disconnected by NM connection.disconnect(); // Get map output from remaining tasks only. 
url = getMapOutputURL(host, remaining); input = openShuffleUrl(host, remaining, url); if (input == null) { return; } } } if(failedTasks != null && failedTasks.length > 0) { LOG.warn("copyMapOutput failed for tasks "+Arrays.toString(failedTasks)); scheduler.hostFailed(host.getHostName()); for(TaskAttemptID left: failedTasks) { scheduler.copyFailed(left, host, true, false); } } // Sanity check if (failedTasks == null && !remaining.isEmpty()) { throw new IOException("server didn't return all expected map outputs: " + remaining.size() + " left."); } input.close(); input = null; } finally { if (input != null) { IOUtils.cleanup(LOG, input); input = null; } for (TaskAttemptID left : remaining) { scheduler.putBackKnownMapOutput(host, left); } } } |
线程Fetcher.copyMapOutput()
/**
 * Reads one map output from the shuffle stream and hands it to the merge
 * manager.
 *
 * <p>Return value, as used by the visible code paths:
 * <ul>
 *   <li>{@code null} - the output was copied successfully and its id was
 *       removed from {@code remaining};</li>
 *   <li>{@code EMPTY_ATTEMPT_ID_ARRAY} - nothing was copied but no map is
 *       blamed (the merge manager asked to wait, or reserving space failed
 *       locally);</li>
 *   <li>a single-element array - the identified map failed verification or
 *       transfer;</li>
 *   <li>all of {@code remaining} - the header could not be read/parsed, so
 *       the failing map is unknown.</li>
 * </ul>
 *
 * @param host      host the stream is connected to
 * @param input     shuffle stream positioned at the next header
 * @param remaining maps still expected from this host; updated on success
 * @param canRetry  when true, {@code checkTimeoutOrRetry} is consulted on an
 *                  {@code IOException}
 * @return see list above
 * @throws IOException NOTE(review): presumably only when
 *         {@code checkTimeoutOrRetry} rethrows - confirm against that helper
 */
private TaskAttemptID[] copyMapOutput(MapHost host,
                              DataInputStream input,
                              Set<TaskAttemptID> remaining,
                              boolean canRetry) throws IOException {
  MapOutput<K,V> mapOutput = null;
  TaskAttemptID mapId = null;
  long decompressedLength = -1;
  long compressedLength = -1;

  try {
    long startTime = Time.monotonicNow();
    int forReduce = -1;
    //Read the shuffle header
    try {
      ShuffleHeader header = new ShuffleHeader();
      header.readFields(input);
      mapId = TaskAttemptID.forName(header.mapId);
      compressedLength = header.compressedLength;
      decompressedLength = header.uncompressedLength;
      forReduce = header.forReduce;
    } catch (IllegalArgumentException e) {
      badIdErrs.increment(1);
      LOG.warn("Invalid map id ", e);
      //Don't know which one was bad, so consider all of them as bad
      return remaining.toArray(new TaskAttemptID[remaining.size()]);
    }

    // Wrap the stream via CryptoUtils and subtract the crypto padding from
    // both lengths.
    InputStream is = input;
    is = CryptoUtils.wrapIfNecessary(jobConf, is, compressedLength);
    compressedLength -= CryptoUtils.cryptoPadding(jobConf);
    decompressedLength -= CryptoUtils.cryptoPadding(jobConf);

    // Do some basic sanity verification
    if (!verifySanity(compressedLength, decompressedLength, forReduce,
        remaining, mapId)) {
      return new TaskAttemptID[] {mapId};
    }

    if(LOG.isDebugEnabled()) {
      LOG.debug("header: " + mapId + ", len: " + compressedLength +
          ", decomp len: " + decompressedLength);
    }

    // Get the location for the map output - either in-memory or on-disk.
    // Per the original author's note, this is MergeManagerImpl.reserve(),
    // which yields either an OnDiskMapOutput or an InMemoryMapOutput.
    try {
      mapOutput = merger.reserve(mapId, decompressedLength, id);
    } catch (IOException ioe) {
      // kill this reduce attempt
      ioErrs.increment(1);
      scheduler.reportLocalError(ioe);
      return EMPTY_ATTEMPT_ID_ARRAY;
    }

    // Check if we can shuffle *now* ...
    if (mapOutput == null) {
      LOG.info("fetcher#" + id + " - MergeManager returned status WAIT ...");
      //Not an error but wait to process data.
      return EMPTY_ATTEMPT_ID_ARRAY;
    }

    // The codec for lz0,lz4,snappy,bz2,etc. throw java.lang.InternalError
    // on decompression failures. Catching and re-throwing as IOException
    // to allow fetch failure logic to be processed
    try {
      // Go!
      LOG.info("fetcher#" + id + " about to shuffle output of map "
          + mapOutput.getMapId() + " decomp: " + decompressedLength
          + " len: " + compressedLength + " to " + mapOutput.getDescription());
      // The concrete MapOutput implementation performs the actual transfer.
      mapOutput.shuffle(host, is, compressedLength, decompressedLength,
          metrics, reporter);
    } catch (java.lang.InternalError e) {
      LOG.warn("Failed to shuffle for fetcher#"+id, e);
      throw new IOException(e);
    }

    // Inform the shuffle scheduler
    long endTime = Time.monotonicNow();
    // Reset retryStartTime as map task make progress if retried before.
    retryStartTime = 0;

    scheduler.copySucceeded(mapId, host, compressedLength,
                            startTime, endTime, mapOutput);
    // Note successful shuffle
    remaining.remove(mapId);
    metrics.successFetch();
    return null;
  } catch (IOException ioe) {
    // Discard any partially written output before deciding whom to blame.
    if (mapOutput != null) {
      mapOutput.abort();
    }

    if (canRetry) {
      checkTimeoutOrRetry(host, ioe);
    }

    ioErrs.increment(1);
    // Failure happened before an output location was obtained: either the
    // header was unreadable (mapId unknown) or reserve/verification had not
    // completed.
    if (mapId == null || mapOutput == null) {
      LOG.warn("fetcher#" + id + " failed to read map header" +
               mapId + " decomp: " +
               decompressedLength + ", " + compressedLength, ioe);
      if(mapId == null) {
        return remaining.toArray(new TaskAttemptID[remaining.size()]);
      } else {
        return new TaskAttemptID[] {mapId};
      }
    }

    LOG.warn("Failed to shuffle output of " + mapId +
             " from " + host.getHostName(), ioe);

    // Inform the shuffle-scheduler
    metrics.failedFetch();
    return new TaskAttemptID[] {mapId};
  }
}
MergeManagerImpl.close()
public RawKeyValueIterator close() throws Throwable { // Wait for on-going merges to complete if (memToMemMerger != null) { memToMemMerger.close(); } inMemoryMerger.close(); onDiskMerger.close(); List<InMemoryMapOutput<K, V>> memory = new ArrayList<InMemoryMapOutput<K, V>>(inMemoryMergedMapOutputs); inMemoryMergedMapOutputs.clear(); memory.addAll(inMemoryMapOutputs); inMemoryMapOutputs.clear(); List<CompressAwarePath> disk = new ArrayList<CompressAwarePath>(onDiskMapOutputs); onDiskMapOutputs.clear(); return finalMerge(jobConf, rfs, memory, disk);//这一步是sort阶段 } |
MergeManagerImpl.finalMerge()方法
/**
 * Builds the iterator that feeds the reduce from the remaining in-memory and
 * on-disk map outputs.
 *
 * <p>Visible steps:
 * <ol>
 *   <li>Move in-memory outputs into {@code memDiskSegments} via
 *       {@code createInMemorySegments(..., maxInMemReduce)}. If any were
 *       moved and there are fewer than {@code ioSortFactor} on-disk outputs,
 *       merge those segments into one file on disk and add it to
 *       {@code onDiskMapOutputs}.</li>
 *   <li>Wrap each on-disk file in a {@code Segment} and sort the segments by
 *       length, ascending.</li>
 *   <li>Collect the in-memory outputs that are still left into
 *       {@code finalSegments}.</li>
 *   <li>If there are disk bytes, merge the disk segments (plus any retained
 *       {@code memDiskSegments}); return that merge directly when no
 *       in-memory segment is left, otherwise add it as one more segment and
 *       merge everything.</li>
 * </ol>
 *
 * @param job                job configuration
 * @param fs                 file system used to create/read/delete merge files
 * @param inMemoryMapOutputs in-memory map outputs; passed to
 *                           {@code createInMemorySegments} (NOTE(review):
 *                           presumably drained by it - confirm)
 * @param onDiskMapOutputs   on-disk map outputs; may gain one merged file
 * @return iterator over the fully merged data
 * @throws IOException on merge or file-system failure
 */
private RawKeyValueIterator finalMerge(JobConf job, FileSystem fs,
                                     List<InMemoryMapOutput<K,V>> inMemoryMapOutputs,
                                     List<CompressAwarePath> onDiskMapOutputs
                                     ) throws IOException {
  LOG.info("finalMerge called with " +
      inMemoryMapOutputs.size() + " in-memory map-outputs and " +
      onDiskMapOutputs.size() + " on-disk map-outputs");
  final long maxInMemReduce = getMaxInMemReduceLimit();
  // merge config params
  Class<K> keyClass = (Class<K>)job.getMapOutputKeyClass();
  Class<V> valueClass = (Class<V>)job.getMapOutputValueClass();
  boolean keepInputs = job.getKeepFailedTaskFiles();
  final Path tmpDir = new Path(reduceId.toString());
  final RawComparator<K> comparator =
    (RawComparator<K>)job.getOutputKeyComparator();

  // segments required to vacate memory
  List<Segment<K,V>> memDiskSegments = new ArrayList<Segment<K,V>>();
  long inMemToDiskBytes = 0;
  boolean mergePhaseFinished = false;
  if (inMemoryMapOutputs.size() > 0) {
    // The first in-memory output's task id is used to name the spill file.
    TaskID mapId = inMemoryMapOutputs.get(0).getMapId().getTaskID();
    inMemToDiskBytes = createInMemorySegments(inMemoryMapOutputs,
                                              memDiskSegments,
                                              maxInMemReduce);
    final int numMemDiskSegments = memDiskSegments.size();
    if (numMemDiskSegments > 0 &&
          ioSortFactor > onDiskMapOutputs.size()) {

      // If we reach here, it implies that we have less than io.sort.factor
      // disk segments and this will be incremented by 1 (result of the
      // memory segments merge). Since this total would still be
      // <= io.sort.factor, we will not do any more intermediate merges,
      // the merge of all these disk segments would be directly fed to the
      // reduce method

      mergePhaseFinished = true;
      // must spill to disk, but can't retain in-mem for intermediate merge
      final Path outputPath =
        mapOutputFile.getInputFileForWrite(mapId,
                                           inMemToDiskBytes).suffix(
                                               Task.MERGED_OUTPUT_PREFIX);
      final RawKeyValueIterator rIter = Merger.merge(job, fs,
          keyClass, valueClass, memDiskSegments, numMemDiskSegments,
          tmpDir, comparator, reporter, spilledRecordsCounter, null,
          mergePhase);

      FSDataOutputStream out = CryptoUtils.wrapIfNecessary(job, fs.create(outputPath));
      Writer<K, V> writer = new Writer<K, V>(job, out, keyClass, valueClass,
          codec, null, true);
      try {
        Merger.writeFile(rIter, writer, reporter, job);
        writer.close();
        // The merged in-memory outputs now count as one more on-disk output.
        onDiskMapOutputs.add(new CompressAwarePath(outputPath,
            writer.getRawLength(), writer.getCompressedLength()));
        // Mark as closed so the finally block does not close it again.
        writer = null;
        // add to list of final disk outputs.
      } catch (IOException e) {
        // Best-effort removal of the partial file; the original failure is
        // what gets rethrown.
        if (null != outputPath) {
          try {
            fs.delete(outputPath, true);
          } catch (IOException ie) {
            // NOTHING
          }
        }
        throw e;
      } finally {
        if (null != writer) {
          writer.close();
        }
      }
      LOG.info("Merged " + numMemDiskSegments + " segments, " +
               inMemToDiskBytes + " bytes to disk to satisfy " +
               "reduce memory limit");
      // Everything taken from memory is on disk now; reset the bookkeeping.
      inMemToDiskBytes = 0;
      memDiskSegments.clear();
    } else if (inMemToDiskBytes != 0) {
      LOG.info("Keeping " + numMemDiskSegments + " segments, " +
               inMemToDiskBytes + " bytes in memory for " +
               "intermediate, on-disk merge");
    }
  }

  // segments on disk
  List<Segment<K,V>> diskSegments = new ArrayList<Segment<K,V>>();
  // Both totals start from the bytes still held in memDiskSegments (0 if
  // those were spilled above).
  long onDiskBytes = inMemToDiskBytes;
  long rawBytes = inMemToDiskBytes;
  CompressAwarePath[] onDisk = onDiskMapOutputs.toArray(
      new CompressAwarePath[onDiskMapOutputs.size()]);
  for (CompressAwarePath file : onDisk) {
    long fileLength = fs.getFileStatus(file).getLen();
    onDiskBytes += fileLength;
    // Fall back to the file length when no positive raw length is recorded.
    rawBytes += (file.getRawDataLength() > 0) ? file.getRawDataLength() : fileLength;

    LOG.debug("Disk file: " + file + " Length is " + fileLength);
    // Files carrying the merged-output suffix get no mergedMapOutputsCounter.
    diskSegments.add(new Segment<K, V>(job, fs, file, codec, keepInputs,
                                       (file.toString().endsWith(
                                           Task.MERGED_OUTPUT_PREFIX) ?
                                        null : mergedMapOutputsCounter), file.getRawDataLength()
                                      ));
  }
  LOG.info("Merging " + onDisk.length + " files, " +
           onDiskBytes + " bytes from disk");
  // Order disk segments by length, smallest first.
  Collections.sort(diskSegments, new Comparator<Segment<K,V>>() {
    public int compare(Segment<K, V> o1, Segment<K, V> o2) {
      if (o1.getLength() == o2.getLength()) {
        return 0;
      }
      return o1.getLength() < o2.getLength() ? -1 : 1;
    }
  });

  // build final list of segments from merged backed by disk + in-mem
  List<Segment<K,V>> finalSegments = new ArrayList<Segment<K,V>>();
  long inMemBytes = createInMemorySegments(inMemoryMapOutputs,
                                           finalSegments, 0);
  LOG.info("Merging " + finalSegments.size() + " segments, " +
           inMemBytes + " bytes from memory into reduce");
  if (0 != onDiskBytes) {
    // Retained memory segments are placed ahead of the disk segments.
    final int numInMemSegments = memDiskSegments.size();
    diskSegments.addAll(0, memDiskSegments);
    memDiskSegments.clear();
    // Pass mergePhase only if there is a going to be intermediate
    // merges. See comment where mergePhaseFinished is being set
    Progress thisPhase = (mergePhaseFinished) ? null : mergePhase;
    RawKeyValueIterator diskMerge = Merger.merge(
        job, fs, keyClass, valueClass, codec, diskSegments,
        ioSortFactor, numInMemSegments, tmpDir, comparator,
        reporter, false, spilledRecordsCounter, null, thisPhase);
    diskSegments.clear();
    // Nothing left in memory: the disk merge is the final result.
    if (0 == finalSegments.size()) {
      return diskMerge;
    }
    // Otherwise expose the disk merge as one more segment of the final merge.
    finalSegments.add(new Segment<K,V>(
          new RawKVIteratorReader(diskMerge, onDiskBytes), true, rawBytes));
  }
  // Per the original author's note, this final merge is the sort step.
  return Merger.merge(job, fs, keyClass, valueClass,
               finalSegments, finalSegments.size(), tmpDir,
               comparator, reporter, spilledRecordsCounter, null,
               null);

}