Map的源码分析（Source-code analysis of the Map task）
// Map side: build a new-API TaskAttemptContext for this attempt, then
// reflectively instantiate the user's Mapper class recorded in the job config.
org. apache. hadoop. mapreduce. TaskAttemptContext taskContext =
new org. apache. hadoop. mapreduce. task. TaskAttemptContextImpl ( job, getTaskID ( ) , reporter) ;
org. apache. hadoop. mapreduce. Mapper< INKEY, INVALUE, OUTKEY, OUTVALUE> mapper =
( org. apache. hadoop. mapreduce. Mapper< INKEY, INVALUE, OUTKEY, OUTVALUE> )
ReflectionUtils. newInstance ( taskContext. getMapperClass ( ) , job) ;
// Reflectively create the job's InputFormat, then read this task's InputSplit
// back from the split file at the offset recorded in splitIndex.
org. apache. hadoop. mapreduce. InputFormat< INKEY, INVALUE> inputFormat =
( org. apache. hadoop. mapreduce. InputFormat< INKEY, INVALUE> )
ReflectionUtils. newInstance ( taskContext. getInputFormatClass ( ) , job) ;
org. apache. hadoop. mapreduce. InputSplit split = null;
split = getSplitDetails ( new Path ( splitIndex. getSplitLocation ( ) ) , splitIndex. getStartOffset ( ) ) ;
// Wrap the RecordReader in NewTrackingRecordReader so reads are tracked and
// reported through the task reporter.
org. apache. hadoop. mapreduce. RecordReader< INKEY, INVALUE> input =
new NewTrackingRecordReader < INKEY, INVALUE> ( split, inputFormat, reporter, taskContext) ;
// Inside NewTrackingRecordReader's constructor: the real reader is obtained
// from the InputFormat...
this . real = inputFormat. createRecordReader ( split, taskContext) ;
// ...and for text input, createRecordReader() returns a LineRecordReader.
return new LineRecordReader ( recordDelimiterBytes) ;
// Map output path when the job has reducers: a NewOutputCollector, which owns
// the in-memory sorting collector and the partitioner.
org. apache. hadoop. mapreduce. RecordWriter output = null;
output = new NewOutputCollector ( taskContext, job, umbilical, reporter) ;
// NewOutputCollector constructor:
{
// Build the sorting collector that buffers and sorts map output in memory.
collector = createSortingCollector ( job, reporter) ;
// createSortingCollector(): pick the MapOutputCollector implementation from
// the job config (default MapOutputBuffer) and instantiate it reflectively.
{
Class< ? > [ ] collectorClasses =
job. getClasses ( JobContext. MAP_OUTPUT_COLLECTOR_CLASS_ATTR, MapOutputBuffer. class ) ;
MapOutputCollector< KEY, VALUE> collector =
ReflectionUtils. newInstance ( subclazz, job) ;
collector. init ( context) ;
// MapOutputBuffer.init(): set up the circular sort buffer and spill thread.
{
// Spill threshold as a fraction of the buffer (default 0.8, i.e. 80% full).
final float spillper = job. getFloat ( JobContext. MAP_SORT_SPILL_PERCENT, ( float ) 0.8 ) ;
// Sort buffer size in megabytes (default 100 MB).
final int sortmb = job. getInt ( JobContext. IO_SORT_MB, 100 ) ;
// In-memory sorter; QuickSort unless "map.sort.class" overrides it.
sorter = ReflectionUtils. newInstance ( job. getClass ( "map.sort.class" ,
QuickSort. class , IndexedSorter. class ) , job) ;
// MB -> bytes (<< 20), rounded down to a multiple of METASIZE so metadata
// records do not straddle the buffer boundary.
int maxMemUsage = sortmb << 20 ;
maxMemUsage -= maxMemUsage % METASIZE;
kvbuffer = new byte [ maxMemUsage] ;
// Comparator used to sort serialized keys before each spill.
comparator = job. getOutputKeyComparator ( ) ;
// Start the background spill thread, then block until it signals readiness
// via spillDone (spillDone is declared outside this excerpt).
spillThread. setDaemon ( true ) ;
spillThread. setName ( "SpillThread" ) ;
spillThread. start ( ) ;
spillDone. await ( ) ;
}
}
// Number of output partitions equals the job's number of reduce tasks.
partitions = jobContext. getNumReduceTasks ( ) ;
if ( partitions > 1 ) {
// Multiple reducers: instantiate the job's configured Partitioner class.
partitioner = ( org. apache. hadoop. mapreduce. Partitioner< K, V> )
ReflectionUtils. newInstance ( jobContext. getPartitionerClass ( ) , job) ;
// NOTE(review): the getPartition below appears pasted in from the default
// hash partitioner — hashCode masked non-negative, modulo reducer count.
public int getPartition ( K key, V value, int numReduceTasks) {
return ( key. hashCode ( ) & Integer. MAX_VALUE) % numReduceTasks;
}
} else {
// Zero or one reducer: inline partitioner always returns partitions - 1
// (partition 0 when there is exactly one reducer).
partitioner = new org. apache. hadoop. mapreduce. Partitioner < K, V> ( ) {
@Override
public int getPartition ( K key, V value, int numPartitions) {
return partitions - 1 ;
}
} ;
}
}
// Assemble the MapContext (input reader, output collector, committer, split),
// then wrap it via WrappedMapper to obtain the Mapper.Context the user's
// map() method sees.
org. apache. hadoop. mapreduce. MapContext< INKEY, INVALUE, OUTKEY, OUTVALUE>
mapContext = new MapContextImpl < INKEY, INVALUE, OUTKEY, OUTVALUE> ( job, getTaskID ( ) ,
input, output,
committer,
reporter, split) ;
org. apache. hadoop. mapreduce. Mapper< INKEY, INVALUE, OUTKEY, OUTVALUE> . Context
mapperContext = new WrappedMapper < INKEY, INVALUE, OUTKEY, OUTVALUE> ( ) . getMapContext (
mapContext) ;
// A user map() emits through the context (word-count-style example here)...
context. write ( new Text ( val) , one) ;
// ...which forwards to the collector, tagging each record with the partition
// computed from its key.
collector. collect ( key, value, partitioner. getPartition ( key, value, partitions) ) ;
// Inside collect(): once remaining buffer space is exhausted, start a spill.
{
if ( bufferRemaining <= 0 ) {
startSpill ( ) ;
}
}
// When the mapper finishes, closing the output flushes the collector.
output. close ( mapperContext) ;
{
collector. flush ( ) ;
// flush(): spill whatever remains in the buffer, then merge all spills.
sortAndSpill ( ) ;
mergeParts ( ) ;
// mergeParts(): gather each spill file's path and sum their lengths to size
// the final merged output file.
{
final Path[ ] filename = new Path [ numSpills] ;
for ( int i = 0 ; i < numSpills; i++ ) {
filename[ i] = mapOutputFile. getSpillFile ( i) ;
finalOutFileSize += rfs. getFileStatus ( filename[ i] ) . getLen ( ) ;
}
}
collector. close ( ) ;
}
Reduce的源码分析（Source-code analysis of the Reduce task）
// Reduce-side: task progress is split into three phases — copy, sort, reduce.
if ( isMapOrReduce ( ) ) {
copyPhase = getProgress ( ) . addPhase ( "copy" ) ;
sortPhase = getProgress ( ) . addPhase ( "sort" ) ;
reducePhase = getProgress ( ) . addPhase ( "reduce" ) ;
}
// The shuffle is pluggable: resolve the configured ShuffleConsumerPlugin
// (default Shuffle.class) and instantiate it reflectively...
Class< ? extends ShuffleConsumerPlugin > clazz =
job. getClass ( MRConfig. SHUFFLE_CONSUMER_PLUGIN, Shuffle. class , ShuffleConsumerPlugin. class ) ;
ShuffleConsumerPlugin shuffleConsumerPlugin = null;
shuffleConsumerPlugin = ReflectionUtils. newInstance ( clazz, job) ;
// ...build its Context (task id, local FS, umbilical, combiner, counters,
// phases, map output files)...
ShuffleConsumerPlugin. Context shuffleContext =
new ShuffleConsumerPlugin. Context ( getTaskID ( ) , job, FileSystem. getLocal ( job) , umbilical,
super . lDirAlloc, reporter, codec,
combinerClass, combineCollector,
spilledRecordsCounter, reduceCombineInputCounter,
shuffledMapsCounter,
reduceShuffleBytes, failedShuffleCounter,
mergedMapOutputsCounter,
taskStatus, copyPhase, sortPhase, this ,
mapOutputFile, localMapFiles) ;
shuffleConsumerPlugin. init ( shuffleContext) ;
// ...then run it; run() returns an iterator over the merged map outputs.
RawKeyValueIterator rIter = null;
rIter = shuffleConsumerPlugin. run ( ) ;
// Shuffle.run(): fetch map outputs in parallel, then merge them.
{
// Local (uber) mode uses a single fetcher; otherwise use the configured
// number of parallel copiers (default 5).
boolean isLocal = localMapFiles != null;
final int numFetchers = isLocal ? 1 : jobConf. getInt ( MRJobConfig. SHUFFLE_PARALLEL_COPIES, 5 ) ;
Fetcher< K, V> [ ] fetchers = new Fetcher [ numFetchers] ;
for ( int i= 0 ; i < numFetchers; ++ i) {
fetchers[ i] =
new Fetcher < K, V> ( jobConf, reduceId, scheduler, merger, reporter, metrics, this ,
reduceTask. getShuffleSecret ( ) ) ;
fetchers[ i] . start ( ) ;
// Each fetcher thread copies completed map outputs host by host.
{
copyFromHost ( host) ;
}
}
// After copying, closing the merge manager performs the final merge and
// yields the sorted RawKeyValueIterator the reducer will consume.
RawKeyValueIterator kvIter = null;
kvIter = merger. close ( ) ;
{
return finalMerge ( jobConf, rfs, memory, disk) ;
// finalMerge() delegates to Merger.merge over the remaining segments...
{
return Merger. merge ( job, fs, keyClass, valueClass, finalSegments, finalSegments. size ( ) , tmpDir, comparator, reporter, spilledRecordsCounter, null, null) ;
// ...which is backed by a MergeQueue (priority-queue style merge).
{
return new MergeQueue < K, V> ( ) ;
}
}
}
return kvIter;
}
// Shuffle inputs consumed: clear bookkeeping for on-disk map outputs.
mapOutputFilesOnDisk. clear ( ) ;
// The reducer's input key/value types are the job's map OUTPUT types.
Class keyClass = job. getMapOutputKeyClass ( ) ;
Class valueClass = job. getMapOutputValueClass ( ) ;
// Keep the real shuffle iterator as rawIter; rIter is rebound to an anonymous
// wrapper whose delegating body is elided in this excerpt.
final RawKeyValueIterator rawIter = rIter;
rIter = new RawKeyValueIterator ( ) ;
// Build the attempt context and reflectively create the user's Reducer.
org. apache. hadoop. mapreduce. TaskAttemptContext taskContext =
new org. apache. hadoop. mapreduce. task. TaskAttemptContextImpl ( job, getTaskID ( ) , reporter) ;
org. apache. hadoop. mapreduce. Reducer< INKEY, INVALUE, OUTKEY, OUTVALUE> reducer =
( org. apache. hadoop. mapreduce. Reducer< INKEY, INVALUE, OUTKEY, OUTVALUE> )
ReflectionUtils. newInstance ( taskContext. getReducerClass ( ) , job) ;
// Output writer wrapped to maintain output record counters.
org. apache. hadoop. mapreduce. RecordWriter< OUTKEY, OUTVALUE> trackedRW =
new NewTrackingRecordWriter < OUTKEY, OUTVALUE> ( this , taskContext) ;
// Assemble the Reducer.Context from iterator, counters, writer, committer
// and comparator, then drive the reducer.
// NOTE(review): the argument list below is truncated in this excerpt — the
// original also passes valueClass and closes the call before reducer.run().
org. apache. hadoop. mapreduce. Reducer. Context
reducerContext = createReduceContext ( reducer, job, getTaskID ( ) ,
rIter, reduceInputKeyCounter,
reduceInputValueCounter,
trackedRW,
committer,
reporter, comparator, keyClass,
reducer. run ( reducerContext) ;
// Reducer.run() skeleton: reduce() is invoked once per distinct key (the
// context.write line is a word-count-style example of its output).
// NOTE(review): lines appear pasted out of order — in the real source,
// setup() runs before the nextKey() loop and cleanup() after it.
{
while ( context. nextKey ( ) ) {
context. write ( key, new IntWritable ( count) ) ;
}
setup ( context) ;
cleanup ( context) ;
}
// nextKey(): drain any values still belonging to the current key before
// advancing to the next distinct key.
while ( hasMore && nextKeyIsSame) {
nextKeyValue ( ) ;
}
if ( hasMore) {
// A new distinct key exists: count it, then position on its first record.
if ( inputKeyCounter != null) {
inputKeyCounter. increment ( 1 ) ;
}
return nextKeyValue ( ) ;
// nextKeyValue(): read the next raw key/value from the merged stream, then
// peek ahead to decide whether the following record has the same key.
{
if ( ! hasMore) {
key = null;
value = null;
return false ;
}
// This is the first value of a new key group iff the previous record's
// key differed.
firstValue = ! nextKeyIsSame;
DataInputBuffer nextKey = input. getKey ( ) ;
DataInputBuffer nextVal = input. getValue ( ) ;
// Copy the current raw key bytes (position..length) for later comparison.
currentRawKey. set ( nextKey. getData ( ) , nextKey. getPosition ( ) ,
nextKey. getLength ( ) - nextKey. getPosition ( ) ) ;
currentKeyLength = nextKey. getLength ( ) - nextKey. getPosition ( ) ;
currentValueLength = nextVal. getLength ( ) - nextVal. getPosition ( ) ;
// Advance the underlying iterator; if another record exists, compare its
// raw key against the current key with the (grouping) comparator.
hasMore = input. next ( ) ;
if ( hasMore) {
nextKey = input. getKey ( ) ;
nextKeyIsSame = comparator. compare ( currentRawKey. getBytes ( ) , 0 ,
currentRawKey. getLength ( ) ,
nextKey. getData ( ) ,
nextKey. getPosition ( ) ,
nextKey. getLength ( ) - nextKey. getPosition ( ) ) == 0 ;
}
}
} else {
// No records remain at all: there is no next key.
return false ;
}