A MapTask can be broken down into five phases: Read, Map, Collect, Spill, and Merge (Combine). (A minimal Mapper sketch follows this list.)
- Read phase: the MapTask uses a RecordReader (obtained from the job's InputFormat) to parse individual key/value pairs out of the InputSplit.
- Map phase: each parsed key/value pair is handed to the user-written map() function, which produces a series of new key/value pairs.
- Collect phase: once map() has processed a record, it normally calls OutputCollector.collect() (context.write() in the new API) to emit the result. Internally, the emitted key/value pair is assigned a partition (via the Partitioner) and written into a circular in-memory buffer.
- Spill phase: when the circular buffer reaches its soft limit (by default 80% full), MapReduce writes the buffered data to local disk as a temporary spill file. Before the data is written it is sorted locally, and, if configured, it is also combined and/or compressed.
- Merge (Combine) phase: after all input has been processed, the MapTask merges all temporary spill files so that exactly one output data file is produced.
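For orientation, here is a minimal new-API Mapper (the class name and tokenization are ours, not part of the MapTask source analyzed below). It shows the same phases from the user's side: the framework's RecordReader delivers each key/value pair (Read), map() is invoked per pair (Map), and context.write() hands results to the collector (Collect).

```java
import java.io.IOException;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

// Hypothetical example mapper; not part of the MapTask source discussed below.
public class WordCountMapper
    extends Mapper<LongWritable, Text, Text, IntWritable> {

  private static final IntWritable ONE = new IntWritable(1);
  private final Text word = new Text();

  @Override
  protected void map(LongWritable offset, Text line, Context context)
      throws IOException, InterruptedException {
    // The Read phase delivered (offset, line); the Map phase runs this method.
    for (String token : line.toString().split("\\s+")) {
      if (token.isEmpty()) {
        continue;
      }
      word.set(token);
      // Collect phase: context.write() ends up in NewOutputCollector.write(),
      // which partitions the pair and copies it into the ring buffer.
      context.write(word, ONE);
    }
  }
}
```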
The main() method of org.apache.hadoop.mapred.YarnChild launches the task:
```java
final Task taskFinal = task;
childUGI.doAs(new PrivilegedExceptionAction<Object>() {
  @Override
  public Object run() throws Exception {
    // use job-specified working directory
    FileSystem.get(job).setWorkingDirectory(job.getWorkingDirectory());
    // run the task: this invokes run() on the concrete subclass of
    // org.apache.hadoop.mapred.Task (MapTask or ReduceTask)
    taskFinal.run(job, umbilical);
    return null;
  }
});
```
org.apache.hadoop.mapred.Task is an abstract class whose concrete subclasses are MapTask and ReduceTask.
The run() method of MapTask:
```java
public void run(final JobConf job, final TaskUmbilicalProtocol umbilical)
    throws IOException, ClassNotFoundException, InterruptedException {
  this.umbilical = umbilical;

  if (isMapTask()) {
    // If there are no reducers then there won't be any sort. Hence the map
    // phase will govern the entire attempt's progress.
    if (conf.getNumReduceTasks() == 0) {
      mapPhase = getProgress().addPhase("map", 1.0f);
    } else {
      // If there are reducers then the entire attempt's progress will be
      // split between the map phase (67%) and the sort phase (33%).
      mapPhase = getProgress().addPhase("map", 0.667f);
      sortPhase = getProgress().addPhase("sort", 0.333f);
    }
  }
  TaskReporter reporter = startReporter(umbilical);

  // whether to use the new Mapper API; controlled by mapred.mapper.new-api
  boolean useNewApi = job.getUseNewMapper();
  // sets up the task's temporary output directory (FileOutputFormat.setWorkOutputPath)
  // and the process-tree monitor (ResourceCalculatorProcessTree)
  initialize(job, getJobID(), reporter, useNewApi);

  // check if it is a cleanupJobTask
  if (jobCleanup) {
    runJobCleanupTask(umbilical, reporter);   // clean up the job
    return;
  }
  if (jobSetup) {
    runJobSetupTask(umbilical, reporter);     // create the task's temporary output directory
    return;
  }
  if (taskCleanup) {
    runTaskCleanupTask(umbilical, reporter);  // clean up the task
    return;
  }

  if (useNewApi) {
    runNewMapper(job, splitMetaInfo, umbilical, reporter);  // run the new-API mapper
  } else {
    runOldMapper(job, splitMetaInfo, umbilical, reporter);  // run the old-API mapper
  }
  done(umbilical, reporter);  // update counters, state, and the reporter
}
```
With the old mapred API the flag defaults to false, so runOldMapper() is used; setting mapred.mapper.new-api to true selects runNewMapper(). Jobs written against the new org.apache.hadoop.mapreduce.Job API have this flag set for them during job submission.
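A small sketch of flipping the flag by hand (the class name is ours; with the new Job API you normally do not need to do this):

```java
import org.apache.hadoop.mapred.JobConf;

public class NewApiSwitchExample {
  public static void main(String[] args) {
    JobConf conf = new JobConf();
    // switch this JobConf-based job to the new mapper API
    conf.setUseNewMapper(true);                     // sets mapred.mapper.new-api = true
    // equivalent property form:
    conf.setBoolean("mapred.mapper.new-api", true);
    System.out.println("useNewMapper = " + conf.getUseNewMapper());
  }
}
```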
Now the runNewMapper() method:
```java
private <INKEY,INVALUE,OUTKEY,OUTVALUE>
void runNewMapper(final JobConf job,
                  final TaskSplitIndex splitIndex,
                  final TaskUmbilicalProtocol umbilical,
                  TaskReporter reporter
                  ) throws IOException, ClassNotFoundException,
                           InterruptedException {
  // make a task context so we can get the classes
  org.apache.hadoop.mapreduce.TaskAttemptContext taskContext =
    new org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl(job,
                                                                getTaskID(),
                                                                reporter);
  // make a mapper
  org.apache.hadoop.mapreduce.Mapper<INKEY,INVALUE,OUTKEY,OUTVALUE> mapper =
    (org.apache.hadoop.mapreduce.Mapper<INKEY,INVALUE,OUTKEY,OUTVALUE>)
      ReflectionUtils.newInstance(taskContext.getMapperClass(), job);
  // make the input format
  org.apache.hadoop.mapreduce.InputFormat<INKEY,INVALUE> inputFormat =
    (org.apache.hadoop.mapreduce.InputFormat<INKEY,INVALUE>)
      ReflectionUtils.newInstance(taskContext.getInputFormatClass(), job);
  // rebuild the input split
  org.apache.hadoop.mapreduce.InputSplit split = null;
  split = getSplitDetails(new Path(splitIndex.getSplitLocation()),
      splitIndex.getStartOffset());
  LOG.info("Processing split: " + split);

  // the RecordReader used in the Read phase
  org.apache.hadoop.mapreduce.RecordReader<INKEY,INVALUE> input =
    new NewTrackingRecordReader<INKEY,INVALUE>
      (split, inputFormat, reporter, taskContext);

  job.setBoolean(JobContext.SKIP_RECORDS, isSkipping());
  org.apache.hadoop.mapreduce.RecordWriter output = null;

  // get an output object (RecordWriter). The important case is NewOutputCollector:
  // NewOutputCollector.write() calls MapOutputBuffer.collect() to gather the map
  // output, and the SpillThread later spills it to disk.
  if (job.getNumReduceTasks() == 0) {
    // no reducers: output is written directly to HDFS;
    // otherwise it goes to the local temporary spill directory
    output = new NewDirectOutputCollector(taskContext, job, umbilical, reporter);
  } else {
    output = new NewOutputCollector(taskContext, job, umbilical, reporter);
  }

  // build the MapContextImpl
  org.apache.hadoop.mapreduce.MapContext<INKEY, INVALUE, OUTKEY, OUTVALUE>
  mapContext =
    new MapContextImpl<INKEY, INVALUE, OUTKEY, OUTVALUE>(job, getTaskID(),
        input, output,
        committer,
        reporter, split);

  // wrap mapContext into the Context subclass declared inside Mapper
  org.apache.hadoop.mapreduce.Mapper<INKEY,INVALUE,OUTKEY,OUTVALUE>.Context
      mapperContext =
        new WrappedMapper<INKEY, INVALUE, OUTKEY, OUTVALUE>().getMapContext(
            mapContext);

  try {
    // Read phase: initialize the RecordReader
    input.initialize(split, mapperContext);
    // Map phase: Mapper.run() iterates over the input and calls the user-overridden
    // map() method; when map() emits output, NewOutputCollector.write() is called
    mapper.run(mapperContext);
    mapPhase.complete();
    setPhase(TaskStatus.Phase.SORT);
    statusUpdate(umbilical);
    input.close();
    input = null;
    // Merge phase: close() calls flush(), and flush() calls mergeParts()
    output.close(mapperContext);
    output = null;
  } finally {
    closeQuietly(input);
    closeQuietly(output, mapperContext);
  }
}
```
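The mapper.run(mapperContext) call above drives the Map phase. For reference, the loop inside org.apache.hadoop.mapreduce.Mapper.run() looks essentially like the following simplified sketch (see the Hadoop source for the authoritative version): setup(), then map() once per input record, then cleanup().

```java
// Simplified sketch of org.apache.hadoop.mapreduce.Mapper.run().
public void run(Context context) throws IOException, InterruptedException {
  setup(context);
  try {
    while (context.nextKeyValue()) {
      // each iteration is one record delivered by the RecordReader (Read phase)
      map(context.getCurrentKey(), context.getCurrentValue(), context);
    }
  } finally {
    cleanup(context);
  }
}
```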
Next, let's look at NewOutputCollector:
```java
private class NewOutputCollector<K,V>
  extends org.apache.hadoop.mapreduce.RecordWriter<K,V> {
  private final MapOutputCollector<K,V> collector;   // the MapOutputCollector
  private final org.apache.hadoop.mapreduce.Partitioner<K,V> partitioner;
  private final int partitions;

  @SuppressWarnings("unchecked")
  NewOutputCollector(org.apache.hadoop.mapreduce.JobContext jobContext,
                     JobConf job,
                     TaskUmbilicalProtocol umbilical,
                     TaskReporter reporter
                     ) throws IOException, ClassNotFoundException {
    collector = createSortingCollector(job, reporter);
    partitions = jobContext.getNumReduceTasks();
    // obtain the partitioner
    if (partitions > 1) {
      partitioner = (org.apache.hadoop.mapreduce.Partitioner<K,V>)
        ReflectionUtils.newInstance(jobContext.getPartitionerClass(), job);
    } else {
      partitioner = new org.apache.hadoop.mapreduce.Partitioner<K,V>() {
        @Override
        public int getPartition(K key, V value, int numPartitions) {
          return partitions - 1;
        }
      };
    }
  }

  @Override
  public void write(K key, V value) throws IOException, InterruptedException {
    // Collect phase: delegates to MapTask.MapOutputBuffer.collect(),
    // the MapOutputCollector implementation
    collector.collect(key, value,
                      partitioner.getPartition(key, value, partitions));
  }

  @Override
  public void close(TaskAttemptContext context
                    ) throws IOException, InterruptedException {
    try {
      collector.flush();
    } catch (ClassNotFoundException cnf) {
      throw new IOException("can't find class ", cnf);
    }
    collector.close();
  }
}
```
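When there is more than one reduce task, getPartitionerClass() returns HashPartitioner unless the job configures its own. A custom partitioner is simply another Partitioner subclass registered with job.setPartitionerClass(...); the following hypothetical example (names are ours) buckets keys by their first character:

```java
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Partitioner;

// Hypothetical custom partitioner, registered via job.setPartitionerClass(...).
public class FirstCharPartitioner extends Partitioner<Text, IntWritable> {
  @Override
  public int getPartition(Text key, IntWritable value, int numPartitions) {
    if (key.getLength() == 0) {
      return 0;
    }
    // mask off the sign bit so the result is non-negative, then bucket by modulo
    return (key.charAt(0) & Integer.MAX_VALUE) % numPartitions;
  }
}
```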
Next, the collect() method of MapTask.MapOutputBuffer:
```java
/**
 * Serialize the key, value to intermediate storage.
 * When this method returns, kvindex must refer to sufficient unused
 * storage to store one METADATA.
 */
public synchronized void collect(K key, V value, final int partition
                                 ) throws IOException {
  reporter.progress();
  if (key.getClass() != keyClass) {
    throw new IOException("Type mismatch in key from map: expected "
                          + keyClass.getName() + ", received "
                          + key.getClass().getName());
  }
  if (value.getClass() != valClass) {
    throw new IOException("Type mismatch in value from map: expected "
                          + valClass.getName() + ", received "
                          + value.getClass().getName());
  }
  if (partition < 0 || partition >= partitions) {
    throw new IOException("Illegal partition for " + key + " (" +
        partition + ")");
  }
  checkSpillException();
  bufferRemaining -= METASIZE;
  if (bufferRemaining <= 0) {
    // start spill if the thread is not running and the soft limit has been
    // reached
    spillLock.lock();
    try {
      do {
        if (!spillInProgress) {
          final int kvbidx = 4 * kvindex;
          final int kvbend = 4 * kvend;
          // serialized, unspilled bytes always lie between kvindex and
          // bufindex, crossing the equator. Note that any void space
          // created by a reset must be included in "used" bytes
          final int bUsed = distanceTo(kvbidx, bufindex);
          final boolean bufsoftlimit = bUsed >= softLimit;
          if ((kvbend + METASIZE) % kvbuffer.length !=
              equator - (equator % METASIZE)) {
            // spill finished, reclaim space
            resetSpill();
            bufferRemaining = Math.min(
                distanceTo(bufindex, kvbidx) - 2 * METASIZE,
                softLimit - bUsed) - METASIZE;
            continue;
          } else if (bufsoftlimit && kvindex != kvend) {
            // spill records, if any collected; check latter, as it may
            // be possible for metadata alignment to hit spill pcnt
            startSpill();
            final int avgRec = (int)
              (mapOutputByteCounter.getCounter() /
               mapOutputRecordCounter.getCounter());
            // leave at least half the split buffer for serialization data
            // ensure that kvindex >= bufindex
            final int distkvi = distanceTo(bufindex, kvbidx);
            final int newPos = (bufindex +
              Math.max(2 * METASIZE - 1,
                       Math.min(distkvi / 2,
                                distkvi / (METASIZE + avgRec) * METASIZE)))
              % kvbuffer.length;
            setEquator(newPos);
            bufmark = bufindex = newPos;
            final int serBound = 4 * kvend;
            // bytes remaining before the lock must be held and limits
            // checked is the minimum of three arcs: the metadata space, the
            // serialization space, and the soft limit
            bufferRemaining = Math.min(
                // metadata max
                distanceTo(bufend, newPos),
                Math.min(
                  // serialization max
                  distanceTo(newPos, serBound),
                  // soft limit
                  softLimit)) - 2 * METASIZE;
          }
        }
      } while (false);
    } finally {
      spillLock.unlock();
    }
  }

  try {
    // serialize key bytes into buffer
    int keystart = bufindex;
    keySerializer.serialize(key);
    if (bufindex < keystart) {
      // wrapped the key; must make contiguous
      bb.shiftBufferedKey();
      keystart = 0;
    }
    // serialize value bytes into buffer
    final int valstart = bufindex;
    valSerializer.serialize(value);

    // It's possible for records to have zero length, i.e. the serializer
    // will perform no writes. To ensure that the boundary conditions are
    // checked and that the kvindex invariant is maintained, perform a
    // zero-length write into the buffer. The logic monitoring this could be
    // moved into collect, but this is cleaner and inexpensive. For now, it
    // is acceptable.
    bb.write(b0, 0, 0);

    // the record must be marked after the preceding write, as the metadata
    // for this record are not yet written
    int valend = bb.markRecord();

    mapOutputRecordCounter.increment(1);
    mapOutputByteCounter.increment(
        distanceTo(keystart, valend, bufvoid));

    // write accounting info
    kvmeta.put(kvindex + PARTITION, partition);
    kvmeta.put(kvindex + KEYSTART, keystart);
    kvmeta.put(kvindex + VALSTART, valstart);
    kvmeta.put(kvindex + VALLEN, distanceTo(valstart, valend));
    // advance kvindex
    kvindex = (kvindex - NMETA + kvmeta.capacity()) % kvmeta.capacity();
  } catch (MapBufferTooSmallException e) {
    LOG.info("Record too large for in-memory buffer: " + e.getMessage());
    spillSingleRecord(key, value, partition);
    mapOutputRecordCounter.increment(1);
    return;
  }
}
```
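The softLimit checked above is derived from the size of the sort buffer and the configured spill percentage. As an illustrative sketch (the values below are examples, not recommendations), the relevant knobs are mapreduce.task.io.sort.mb and mapreduce.map.sort.spill.percent:

```java
import org.apache.hadoop.conf.Configuration;

public class SortBufferTuningExample {
  public static void main(String[] args) {
    Configuration conf = new Configuration();
    // size of the in-memory ring buffer (kvbuffer), in MB; default is 100
    conf.setInt("mapreduce.task.io.sort.mb", 200);
    // fraction of the buffer that triggers a spill (the soft limit); default is 0.80
    conf.setFloat("mapreduce.map.sort.spill.percent", 0.80f);
    System.out.println("sort.mb = " + conf.get("mapreduce.task.io.sort.mb")
        + ", spill.percent = " + conf.get("mapreduce.map.sort.spill.percent"));
  }
}
```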
When the static nested class MapTask.MapOutputBuffer is initialized, it creates and starts a SpillThread; the run() method of SpillThread in turn calls sortAndSpill().
```java
protected class SpillThread extends Thread {

  @Override
  public void run() {
    spillLock.lock();
    spillThreadRunning = true;
    try {
      while (true) {
        spillDone.signal();
        while (!spillInProgress) {
          spillReady.await();
        }
        try {
          spillLock.unlock();
          sortAndSpill();   // the Spill phase happens here
        } catch (Throwable t) {
          sortSpillException = t;
        } finally {
          spillLock.lock();
          if (bufend < bufstart) {
            bufvoid = kvbuffer.length;
          }
          kvstart = kvend;
          bufstart = bufend;
          spillInProgress = false;
        }
      }
    } catch (InterruptedException e) {
      Thread.currentThread().interrupt();
    } finally {
      spillLock.unlock();
      spillThreadRunning = false;
    }
  }
}
```
Next, the sortAndSpill() method:
```java
private void sortAndSpill() throws IOException, ClassNotFoundException,
                                   InterruptedException {
  // approximate the length of the output file to be the length of the
  // buffer + header lengths for the partitions
  final long size = distanceTo(bufstart, bufend, bufvoid) +
              partitions * APPROX_HEADER_LENGTH;
  FSDataOutputStream out = null;
  try {
    // create spill file
    final SpillRecord spillRec = new SpillRecord(partitions);
    final Path filename =
        mapOutputFile.getSpillFileForWrite(numSpills, size);
    out = rfs.create(filename);

    final int mstart = kvend / NMETA;
    final int mend = 1 + // kvend is a valid record
      (kvstart >= kvend
      ? kvstart
      : kvmeta.capacity() + kvstart) / NMETA;
    sorter.sort(MapOutputBuffer.this, mstart, mend, reporter);
    int spindex = mstart;
    final IndexRecord rec = new IndexRecord();
    final InMemValBytes value = new InMemValBytes();
    for (int i = 0; i < partitions; ++i) {
      IFile.Writer<K, V> writer = null;
      try {
        long segmentStart = out.getPos();
        FSDataOutputStream partitionOut = CryptoUtils.wrapIfNecessary(job, out);
        writer = new Writer<K, V>(job, partitionOut, keyClass, valClass, codec,
                                  spilledRecordsCounter);
        // combinerRunner was created by CombinerRunner.create(), which checks
        // whether a combiner was configured (via JobContext.getCombinerClass())
        if (combinerRunner == null) {
          // spill directly
          DataInputBuffer key = new DataInputBuffer();
          while (spindex < mend &&
              kvmeta.get(offsetFor(spindex % maxRec) + PARTITION) == i) {
            final int kvoff = offsetFor(spindex % maxRec);
            int keystart = kvmeta.get(kvoff + KEYSTART);
            int valstart = kvmeta.get(kvoff + VALSTART);
            key.reset(kvbuffer, keystart, valstart - keystart);
            getVBytesForOffset(kvoff, value);
            writer.append(key, value);
            ++spindex;
          }
        } else {
          int spstart = spindex;
          while (spindex < mend &&
              kvmeta.get(offsetFor(spindex % maxRec)
                         + PARTITION) == i) {
            ++spindex;
          }
          // Note: we would like to avoid the combiner if we've fewer
          // than some threshold of records for a partition
          if (spstart != spindex) {
            combineCollector.setWriter(writer);   // output goes through IFile.Writer
            RawKeyValueIterator kvIter =
              new MRResultIterator(spstart, spindex);
            // run the combine operation (NewCombinerRunner.combine for the new API)
            combinerRunner.combine(kvIter, combineCollector);
          }
        }

        // close the writer
        writer.close();

        // record offsets
        rec.startOffset = segmentStart;
        rec.rawLength = writer.getRawLength() + CryptoUtils.cryptoPadding(job);
        rec.partLength = writer.getCompressedLength() + CryptoUtils.cryptoPadding(job);
        spillRec.putIndex(rec, i);

        writer = null;
      } finally {
        if (null != writer) writer.close();
      }
    }

    if (totalIndexCacheMemory >= indexCacheMemoryLimit) {
      // create spill index file
      Path indexFilename =
          mapOutputFile.getSpillIndexFileForWrite(numSpills, partitions
              * MAP_OUTPUT_INDEX_RECORD_LENGTH);
      spillRec.writeToFile(indexFilename, job);
    } else {
      indexCacheList.add(spillRec);
      totalIndexCacheMemory +=
        spillRec.size() * MAP_OUTPUT_INDEX_RECORD_LENGTH;
    }
    LOG.info("Finished spill " + numSpills);
    ++numSpills;
  } finally {
    if (out != null) out.close();
  }
}
```
Next, NewCombinerRunner and its combine() method:
```java
protected static class NewCombinerRunner<K, V> extends CombinerRunner<K, V> {
  private final Class<? extends org.apache.hadoop.mapreduce.Reducer<K,V,K,V>>
      reducerClass;
  private final org.apache.hadoop.mapreduce.TaskAttemptID taskId;
  private final RawComparator<K> comparator;
  private final Class<K> keyClass;
  private final Class<V> valueClass;
  private final org.apache.hadoop.mapreduce.OutputCommitter committer;

  @SuppressWarnings("unchecked")
  NewCombinerRunner(Class reducerClass,
                    JobConf job,
                    org.apache.hadoop.mapreduce.TaskAttemptID taskId,
                    org.apache.hadoop.mapreduce.TaskAttemptContext context,
                    Counters.Counter inputCounter,
                    TaskReporter reporter,
                    org.apache.hadoop.mapreduce.OutputCommitter committer) {
    super(inputCounter, job, reporter);
    this.reducerClass = reducerClass;
    this.taskId = taskId;
    keyClass = (Class<K>) context.getMapOutputKeyClass();
    valueClass = (Class<V>) context.getMapOutputValueClass();
    comparator = (RawComparator<K>) context.getCombinerKeyGroupingComparator();
    this.committer = committer;
  }

  private static class OutputConverter<K,V>
          extends org.apache.hadoop.mapreduce.RecordWriter<K,V> {
    OutputCollector<K,V> output;

    OutputConverter(OutputCollector<K,V> output) {
      this.output = output;
    }

    @Override
    public void close(org.apache.hadoop.mapreduce.TaskAttemptContext context) {
    }

    @Override
    public void write(K key, V value
                      ) throws IOException, InterruptedException {
      output.collect(key, value);
    }
  }

  @SuppressWarnings("unchecked")
  @Override
  public void combine(RawKeyValueIterator iterator,
                      OutputCollector<K,V> collector
                      ) throws IOException, InterruptedException,
                               ClassNotFoundException {
    // make a reducer
    org.apache.hadoop.mapreduce.Reducer<K,V,K,V> reducer =
        (org.apache.hadoop.mapreduce.Reducer<K,V,K,V>)
          ReflectionUtils.newInstance(reducerClass, job);
    org.apache.hadoop.mapreduce.Reducer.Context reducerContext =
        createReduceContext(reducer, job, taskId,
                            iterator, null, inputCounter,
                            new OutputConverter(collector),
                            committer,
                            reporter, comparator, keyClass,
                            valueClass);
    reducer.run(reducerContext);
  }
}
```
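From the user's side, the combiner that NewCombinerRunner wraps is just a Reducer registered on the job. A minimal sketch (the class name is ours, continuing the word-count example above):

```java
import java.io.IOException;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

// Hypothetical combiner: sums the partial counts emitted by the mapper.
public class SumCombiner extends Reducer<Text, IntWritable, Text, IntWritable> {
  private final IntWritable result = new IntWritable();

  @Override
  protected void reduce(Text key, Iterable<IntWritable> values, Context context)
      throws IOException, InterruptedException {
    int sum = 0;
    for (IntWritable v : values) {
      sum += v.get();
    }
    result.set(sum);
    context.write(key, result);
  }
}

// Registration in the job driver; this is what JobContext.getCombinerClass() returns:
//   job.setCombinerClass(SumCombiner.class);
```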
Finally, the MapOutputBuffer.mergeParts() method:
```java
private void mergeParts() throws IOException, InterruptedException,
                                 ClassNotFoundException {
  // get the approximate size of the final output/index files
  long finalOutFileSize = 0;
  long finalIndexFileSize = 0;
  final Path[] filename = new Path[numSpills];
  final TaskAttemptID mapId = getTaskID();

  for(int i = 0; i < numSpills; i++) {
    filename[i] = mapOutputFile.getSpillFile(i);
    finalOutFileSize += rfs.getFileStatus(filename[i]).getLen();
  }
  if (numSpills == 1) { // the spill is the final output
    sameVolRename(filename[0],
        mapOutputFile.getOutputFileForWriteInVolume(filename[0]));
    if (indexCacheList.size() == 0) {
      sameVolRename(mapOutputFile.getSpillIndexFile(0),
        mapOutputFile.getOutputIndexFileForWriteInVolume(filename[0]));
    } else {
      indexCacheList.get(0).writeToFile(
        mapOutputFile.getOutputIndexFileForWriteInVolume(filename[0]), job);
    }
    sortPhase.complete();
    return;
  }

  // read in paged indices
  for (int i = indexCacheList.size(); i < numSpills; ++i) {
    Path indexFileName = mapOutputFile.getSpillIndexFile(i);
    indexCacheList.add(new SpillRecord(indexFileName, job));
  }

  // make correction in the length to include the sequence file header
  // lengths for each partition
  finalOutFileSize += partitions * APPROX_HEADER_LENGTH;
  finalIndexFileSize = partitions * MAP_OUTPUT_INDEX_RECORD_LENGTH;
  Path finalOutputFile =
      mapOutputFile.getOutputFileForWrite(finalOutFileSize);
  Path finalIndexFile =
      mapOutputFile.getOutputIndexFileForWrite(finalIndexFileSize);

  // The output stream for the final single output file
  FSDataOutputStream finalOut = rfs.create(finalOutputFile, true, 4096);

  if (numSpills == 0) {
    // create dummy files
    IndexRecord rec = new IndexRecord();
    SpillRecord sr = new SpillRecord(partitions);
    try {
      for (int i = 0; i < partitions; i++) {
        long segmentStart = finalOut.getPos();
        FSDataOutputStream finalPartitionOut = CryptoUtils.wrapIfNecessary(job, finalOut);
        Writer<K, V> writer =
          new Writer<K, V>(job, finalPartitionOut, keyClass, valClass, codec, null);
        writer.close();
        rec.startOffset = segmentStart;
        rec.rawLength = writer.getRawLength() + CryptoUtils.cryptoPadding(job);
        rec.partLength = writer.getCompressedLength() + CryptoUtils.cryptoPadding(job);
        sr.putIndex(rec, i);
      }
      sr.writeToFile(finalIndexFile, job);
    } finally {
      finalOut.close();
    }
    sortPhase.complete();
    return;
  }
  {
    sortPhase.addPhases(partitions); // Divide sort phase into sub-phases

    IndexRecord rec = new IndexRecord();
    final SpillRecord spillRec = new SpillRecord(partitions);
    for (int parts = 0; parts < partitions; parts++) {
      // create the segments to be merged
      List<Segment<K,V>> segmentList =
        new ArrayList<Segment<K, V>>(numSpills);
      for(int i = 0; i < numSpills; i++) {
        IndexRecord indexRecord = indexCacheList.get(i).getIndex(parts);

        Segment<K,V> s =
          new Segment<K,V>(job, rfs, filename[i], indexRecord.startOffset,
                           indexRecord.partLength, codec, true);
        segmentList.add(i, s);

        if (LOG.isDebugEnabled()) {
          LOG.debug("MapId=" + mapId + " Reducer=" + parts +
              "Spill =" + i + "(" + indexRecord.startOffset + "," +
              indexRecord.rawLength + ", " + indexRecord.partLength + ")");
        }
      }

      // merge factor, controlled by the parameter mapreduce.task.io.sort.factor
      int mergeFactor = job.getInt(JobContext.IO_SORT_FACTOR, 100);
      // sort the segments only if there are intermediate merges
      boolean sortSegments = segmentList.size() > mergeFactor;
      // merge
      @SuppressWarnings("unchecked")
      RawKeyValueIterator kvIter = Merger.merge(job, rfs,
                     keyClass, valClass, codec,
                     segmentList, mergeFactor,
                     new Path(mapId.toString()),
                     job.getOutputKeyComparator(), reporter, sortSegments,
                     null, spilledRecordsCounter, sortPhase.phase(),
                     TaskType.MAP);

      // write merged output to disk
      long segmentStart = finalOut.getPos();
      FSDataOutputStream finalPartitionOut = CryptoUtils.wrapIfNecessary(job, finalOut);
      Writer<K, V> writer =
          new Writer<K, V>(job, finalPartitionOut, keyClass, valClass, codec,
                           spilledRecordsCounter);
      if (combinerRunner == null || numSpills < minSpillsForCombine) {
        Merger.writeFile(kvIter, writer, reporter, job);
      } else {
        combineCollector.setWriter(writer);
        combinerRunner.combine(kvIter, combineCollector);
      }

      // close
      writer.close();

      sortPhase.startNextPhase();

      // record offsets
      rec.startOffset = segmentStart;
      rec.rawLength = writer.getRawLength() + CryptoUtils.cryptoPadding(job);
      rec.partLength = writer.getCompressedLength() + CryptoUtils.cryptoPadding(job);
      spillRec.putIndex(rec, parts);
    }
    spillRec.writeToFile(finalIndexFile, job);
    finalOut.close();
    for(int i = 0; i < numSpills; i++) {
      rfs.delete(filename[i], true);
    }
  }
}
```
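Two configuration knobs show up in this final merge: mapreduce.task.io.sort.factor (the mergeFactor read above; the job defaults typically set it to 10 even though the code's fallback literal is 100) and mapreduce.map.combine.minspills (minSpillsForCombine, typically 3), below which the combiner is skipped during the merge. A hedged sketch of setting them (values are examples only):

```java
import org.apache.hadoop.conf.Configuration;

public class MergeTuningExample {
  public static void main(String[] args) {
    Configuration conf = new Configuration();
    // how many spill segments are merged at once (mergeFactor in mergeParts)
    conf.setInt("mapreduce.task.io.sort.factor", 20);
    // run the combiner during the final merge only if at least this many spills exist
    conf.setInt("mapreduce.map.combine.minspills", 3);
    System.out.println("io.sort.factor = " + conf.get("mapreduce.task.io.sort.factor")
        + ", combine.minspills = " + conf.get("mapreduce.map.combine.minspills"));
  }
}
```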