Hadoop Source Walkthrough: The MapReduce Execution Flow
1. Driver (the driver class that launches the MapReduce job)
public static void main(String[] args) throws Exception {
    args = new String[] { "D:\\git\\study\\BigDataPro\\hadoop\\src\\main\\resources\\input",
            "D:\\git\\study\\BigDataPro\\hadoop\\src\\main\\resources\\udoutput" };
    Configuration conf = new Configuration();
    Job job = Job.getInstance(conf);
    job.setJarByClass(WholeDriver.class);
    job.setMapperClass(WholeMapper.class);
    job.setReducerClass(WholeReducer.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(BytesWritable.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(BytesWritable.class);
    FileInputFormat.setInputPaths(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));
    job.setInputFormatClass(WholeFileInputformat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    boolean result = job.waitForCompletion(true);
    System.exit(result ? 0 : 1);
}
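The Driver above references a user-defined WholeMapper (and a WholeFileInputformat) whose code is not shown. Below is a minimal sketch of what such a mapper could look like, assuming the input format delivers one whole file per record and the file name is used as the output key; the class body is an assumption, only the name and the key/value types come from the Driver.
import java.io.IOException;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;

public class WholeMapper extends Mapper<Text, BytesWritable, Text, BytesWritable> {

    private final Text filenameKey = new Text();

    @Override
    protected void setup(Context context) throws IOException, InterruptedException {
        // Key every output record by the name of the file this split belongs to.
        FileSplit split = (FileSplit) context.getInputSplit();
        filenameKey.set(split.getPath().getName());
    }

    @Override
    protected void map(Text key, BytesWritable value, Context context)
            throws IOException, InterruptedException {
        // Forward the whole file content unchanged; the types match the Driver's
        // setMapOutputKeyClass(Text) / setMapOutputValueClass(BytesWritable).
        context.write(filenameKey, value);
    }
}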
2. waitForCompletion
public boolean waitForCompletion(boolean verbose) {
    if (state == JobState.DEFINE) {
        submit();
    }
    if (verbose) {
        monitorAndPrintJob();
    } else {
        int completionPollIntervalMillis =
                Job.getCompletionPollInterval(cluster.getConf());
        while (!isComplete()) {
            try {
                Thread.sleep(completionPollIntervalMillis);
            } catch (InterruptedException ie) {
            }
        }
    }
    return isSuccessful();
}
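When verbose is false, the client simply sleeps between isComplete() checks. The poll interval comes from the client configuration; a small hedged sketch of tuning it (the 10-second value is an arbitrary example):
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.Job;

public class PollIntervalExample {
    public static Job newJob() throws Exception {
        Configuration conf = new Configuration();
        // How often waitForCompletion(false) polls for completion, in milliseconds.
        conf.setInt("mapreduce.client.completion.pollinterval", 10000);
        return Job.getInstance(conf);
    }
}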
3. submit
public void submit() {
    ensureState(JobState.DEFINE);
    setUseNewAPI();
    connect();
    final JobSubmitter submitter =
            getJobSubmitter(cluster.getFileSystem(), cluster.getClient());
    status = ugi.doAs(new PrivilegedExceptionAction<JobStatus>() {
        public JobStatus run() throws IOException, InterruptedException,
                ClassNotFoundException {
            return submitter.submitJobInternal(Job.this, cluster);
        }
    });
    state = JobState.RUNNING;
    LOG.info("The url to track the job: " + getTrackingURL());
}
4. submitJobInternal
JobStatus submitJobInternal(Job job, Cluster cluster) {
    // Abridged: copy the job resources, compute the input splits, write job.xml,
    // then hand the job off to the submit client (LocalJobRunner here).
    copyAndConfigureFiles(job, submitJobDir);
    int maps = writeSplits(job, submitJobDir);
    writeConf(conf, submitJobFile);
    status = submitClient.submitJob(jobId, submitJobDir.toString(), job.getCredentials());
    return status;
}
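writeSplits() is where the number of map tasks is fixed: one task per input split. The split size, and therefore the map count, can be steered from the Driver; a hedged sketch (the 16 MB cap is an arbitrary example value):
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;

public class SplitSizeExample {
    public static void tuneSplits(Job job) {
        // Cap each split at 16 MB so large inputs fan out into more map tasks.
        FileInputFormat.setMaxInputSplitSize(job, 16 * 1024 * 1024L);
        FileInputFormat.setMinInputSplitSize(job, 1L);
    }
}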
5. submitJob
public org.apache.hadoop.mapreduce.JobStatus submitJob(
        org.apache.hadoop.mapreduce.JobID jobid, String jobSubmitDir,
        Credentials credentials) throws IOException {
    Job job = new Job(JobID.downgrade(jobid), jobSubmitDir);
    job.job.setCredentials(credentials);
    return job.status;
}
6. Job
public Job(JobID jobid, String jobSubmitDir) throws IOException {
    OutputStream out = localFs.create(localJobFile);
    try {
        conf.writeXml(out);
    } finally {
        out.close();
    }
    // LocalJobRunner.Job is a Thread; starting it kicks off the run() method below.
    this.start();
}
public void run() {
    org.apache.hadoop.mapreduce.OutputCommitter outputCommitter = null;
    try {
        outputCommitter = createOutputCommitter(conf.getUseNewMapper(), jobId, conf);
    } catch (Exception e) {
        LOG.info("Failed to createOutputCommitter", e);
        return;
    }
    try {
        // Read the split meta info that was written during job submission.
        TaskSplitMetaInfo[] taskSplitMetaInfos =
                SplitMetaInfoReader.readSplitMetaInfo(jobId, localFs, conf, systemJobDir);
        int numReduceTasks = job.getNumReduceTasks();
        outputCommitter.setupJob(jContext);
        status.setSetupProgress(1.0f);
        Map<TaskAttemptID, MapOutputFile> mapOutputFiles =
                Collections.synchronizedMap(new HashMap<TaskAttemptID, MapOutputFile>());
        // One runnable per input split.
        List<RunnableWithThrowable> mapRunnables = getMapTaskRunnables(
                taskSplitMetaInfos, jobId, mapOutputFiles);
        initCounters(mapRunnables.size(), numReduceTasks);
        ExecutorService mapService = createMapExecutor();
        runTasks(mapRunnables, mapService, "map");
        try {
            if (numReduceTasks > 0) {
                // Reduce tasks only start after all map tasks have finished.
                List<RunnableWithThrowable> reduceRunnables = getReduceTaskRunnables(
                        jobId, mapOutputFiles);
                ExecutorService reduceService = createReduceExecutor();
                runTasks(reduceRunnables, reduceService, "reduce");
            }
        } finally {
            for (MapOutputFile output : mapOutputFiles.values()) {
                output.removeAll();
            }
        }
    } catch (Throwable t) {
        // Abridged: the original also marks the job failed and performs cleanup here.
    }
}
7. runTasks
private void runTasks(List<RunnableWithThrowable> runnables,
        ExecutorService service, String taskType) throws Exception {
    // Submit every task runnable to the executor...
    for (Runnable r : runnables) {
        service.submit(r);
    }
    // ...then drain the queue and block until all of them have finished.
    service.shutdown();
    service.awaitTermination(Long.MAX_VALUE, TimeUnit.NANOSECONDS);
}
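In local mode the executor created by createMapExecutor() is sized from the LocalJobRunner's max-maps setting. A hedged sketch of raising it (property name per LocalJobRunner, the value of 4 is arbitrary):
import org.apache.hadoop.conf.Configuration;

public class LocalParallelismExample {
    public static Configuration localConf() {
        Configuration conf = new Configuration();
        // Number of map tasks the LocalJobRunner may run concurrently.
        conf.setInt("mapreduce.local.map.tasks.maximum", 4);
        return conf;
    }
}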
7.1 MapTaskRunnable
public void run() {
    try {
        TaskAttemptID mapId = new TaskAttemptID(new TaskID(
                jobId, TaskType.MAP, taskId), 0);
        LOG.info("Starting task: " + mapId);
        mapIds.add(mapId);
        MapTask map = new MapTask(systemJobFile.toString(), mapId, taskId,
                info.getSplitIndex(), 1);
        map.setUser(UserGroupInformation.getCurrentUser().getShortUserName());
        setupChildMapredLocalDirs(map, localConf);
        MapOutputFile mapOutput = new MROutputFiles();
        mapOutput.setConf(localConf);
        mapOutputFiles.put(mapId, mapOutput);
        map.setJobFile(localJobFile.toString());
        localConf.setUser(map.getUser());
        map.localizeConfiguration(localConf);
        map.setConf(localConf);
        try {
            map_tasks.getAndIncrement();
            myMetrics.launchMap(mapId);
            map.run(localConf, Job.this);
            myMetrics.completeMap(mapId);
        } finally {
            map_tasks.getAndDecrement();
        }
        LOG.info("Finishing task: " + mapId);
    } catch (Throwable e) {
        this.storedException = e;
    }
}
7.1.2 run
@Override
public void run(final JobConf job, final TaskUmbilicalProtocol umbilical) {
    this.umbilical = umbilical;
    if (useNewApi) {
        runNewMapper(job, splitMetaInfo, umbilical, reporter);
    } else {
        runOldMapper(job, splitMetaInfo, umbilical, reporter);
    }
    done(umbilical, reporter);
}
7.1.3 runNewMapper
private <INKEY, INVALUE, OUTKEY, OUTVALUE> void runNewMapper(final JobConf job,
        final TaskSplitIndex splitIndex,
        final TaskUmbilicalProtocol umbilical,
        TaskReporter reporter)
        throws IOException, ClassNotFoundException, InterruptedException {
    // The task context, the user Mapper and the InputFormat are created by
    // reflection from the classes configured in the Driver.
    org.apache.hadoop.mapreduce.TaskAttemptContext taskContext =
            new org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl(job,
                    getTaskID(), reporter);
    org.apache.hadoop.mapreduce.Mapper<INKEY, INVALUE, OUTKEY, OUTVALUE> mapper =
            (org.apache.hadoop.mapreduce.Mapper<INKEY, INVALUE, OUTKEY, OUTVALUE>)
                    ReflectionUtils.newInstance(taskContext.getMapperClass(), job);
    org.apache.hadoop.mapreduce.InputFormat<INKEY, INVALUE> inputFormat =
            (org.apache.hadoop.mapreduce.InputFormat<INKEY, INVALUE>)
                    ReflectionUtils.newInstance(taskContext.getInputFormatClass(), job);
    // Rebuild the InputSplit this task is responsible for.
    org.apache.hadoop.mapreduce.InputSplit split = null;
    split = getSplitDetails(new Path(splitIndex.getSplitLocation()),
            splitIndex.getStartOffset());
    org.apache.hadoop.mapreduce.RecordReader<INKEY, INVALUE> input =
            new NewTrackingRecordReader<INKEY, INVALUE>(split, inputFormat, reporter, taskContext);
    job.setBoolean(JobContext.SKIP_RECORDS, isSkipping());
    // With no reducers, map output goes straight to the OutputFormat;
    // otherwise it goes through the sorting/partitioning collector.
    org.apache.hadoop.mapreduce.RecordWriter output = null;
    if (job.getNumReduceTasks() == 0) {
        output = new NewDirectOutputCollector(taskContext, job, umbilical, reporter);
    } else {
        output = new NewOutputCollector(taskContext, job, umbilical, reporter);
    }
    org.apache.hadoop.mapreduce.MapContext<INKEY, INVALUE, OUTKEY, OUTVALUE> mapContext =
            new MapContextImpl<INKEY, INVALUE, OUTKEY, OUTVALUE>(job, getTaskID(),
                    input, output, committer, reporter, split);
    org.apache.hadoop.mapreduce.Mapper<INKEY, INVALUE, OUTKEY, OUTVALUE>.Context mapperContext =
            new WrappedMapper<INKEY, INVALUE, OUTKEY, OUTVALUE>().getMapContext(mapContext);
    try {
        input.initialize(split, mapperContext);
        mapper.run(mapperContext);
        mapPhase.complete();
        setPhase(TaskStatus.Phase.SORT);
        statusUpdate(umbilical);
        input.close();
        input = null;
        output.close(mapperContext);
        output = null;
    } finally {
        closeQuietly(input);
        closeQuietly(output, mapperContext);
    }
}
7.1.3.1 NewOutputCollector
NewOutputCollector(JobContext jobContext, JobConf job,
        TaskUmbilicalProtocol umbilical, TaskReporter reporter)
        throws IOException, ClassNotFoundException {
    collector = createSortingCollector(job, reporter);
    partitions = jobContext.getNumReduceTasks();
    if (partitions > 1) {
        partitioner = (org.apache.hadoop.mapreduce.Partitioner<K, V>)
                ReflectionUtils.newInstance(jobContext.getPartitionerClass(), job);
    } else {
        // With a single reducer, every record goes to partition 0.
        partitioner = new org.apache.hadoop.mapreduce.Partitioner<K, V>() {
            @Override
            public int getPartition(K key, V value, int numPartitions) {
                return partitions - 1;
            }
        };
    }
}
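Because the single-reducer branch always returns partition 0, a custom partitioner only takes effect when job.setNumReduceTasks() is greater than 1. A hedged sketch of a user partitioner (the class name and hashing rule are illustrative, not from the source):
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Partitioner;

public class PrefixPartitioner extends Partitioner<Text, Text> {
    @Override
    public int getPartition(Text key, Text value, int numPartitions) {
        // Route records by a stable hash of the first character of the key.
        String k = key.toString();
        int h = k.isEmpty() ? 0 : k.charAt(0);
        return (h & Integer.MAX_VALUE) % numPartitions;
    }
}
// In the Driver: job.setPartitionerClass(PrefixPartitioner.class);
//                job.setNumReduceTasks(3);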
7.1.3.2 run
public void run(Context context) throws IOException, InterruptedException {
    setup(context);
    try {
        while (context.nextKeyValue()) {
            map(context.getCurrentKey(), context.getCurrentValue(), context);
        }
    } finally {
        cleanup(context);
    }
}
7.1.3.2.1 context
public class Context extends Mapper<KEYIN, VALUEIN, KEYOUT, VALUEOUT>.Context {

    protected MapContext<KEYIN, VALUEIN, KEYOUT, VALUEOUT> mapContext;

    public Context(MapContext<KEYIN, VALUEIN, KEYOUT, VALUEOUT> mapContext) {
        this.mapContext = mapContext;
    }

    public InputSplit getInputSplit() {
        return mapContext.getInputSplit();
    }

    @Override
    public KEYIN getCurrentKey() throws IOException, InterruptedException {
        return mapContext.getCurrentKey();
    }

    @Override
    public VALUEIN getCurrentValue() throws IOException, InterruptedException {
        return mapContext.getCurrentValue();
    }

    @Override
    public boolean nextKeyValue() throws IOException, InterruptedException {
        return mapContext.nextKeyValue();
    }

    @Override
    public void write(KEYOUT key, VALUEOUT value) throws IOException, InterruptedException {
        mapContext.write(key, value);
    }
}
7.1.3.2.2 write
@Override
public void write(K key, V value) throws IOException, InterruptedException {
    collector.collect(key, value, partitioner.getPartition(key, value, partitions));
}

@Override
public void close(TaskAttemptContext context) throws IOException, InterruptedException {
    try {
        collector.flush();
    } catch (ClassNotFoundException cnf) {
        throw new IOException("can't find class ", cnf);
    }
    collector.close();
}
7.1.3.2.3 collect
public synchronized void collect(K key, V value, final int partition) throws IOException {
    if (bufferRemaining <= 0) {
        // The soft limit of the ring buffer has been reached: decide whether to
        // start spilling to disk before serializing this record.
        spillLock.lock();
        try {
            do {
                if (!spillInProgress) {
                    final int kvbidx = 4 * kvindex;
                    final int kvbend = 4 * kvend;
                    final int bUsed = distanceTo(kvbidx, bufindex);
                    final boolean bufsoftlimit = bUsed >= softLimit;
                    if ((kvbend + METASIZE) % kvbuffer.length !=
                            equator - (equator % METASIZE)) {
                        // A spill has finished; reclaim the spilled space.
                        resetSpill();
                        bufferRemaining = Math.min(
                                distanceTo(bufindex, kvbidx) - 2 * METASIZE,
                                softLimit - bUsed) - METASIZE;
                        continue;
                    } else if (bufsoftlimit && kvindex != kvend) {
                        // Soft limit exceeded and there are records to spill: start a spill
                        // and move the equator so collection can continue in parallel.
                        startSpill();
                        final int avgRec = (int)
                                (mapOutputByteCounter.getCounter() /
                                 mapOutputRecordCounter.getCounter());
                        final int distkvi = distanceTo(bufindex, kvbidx);
                        final int newPos = (bufindex +
                                Math.max(2 * METASIZE - 1,
                                        Math.min(distkvi / 2,
                                                distkvi / (METASIZE + avgRec) * METASIZE)))
                                % kvbuffer.length;
                        setEquator(newPos);
                        bufmark = bufindex = newPos;
                        final int serBound = 4 * kvend;
                        bufferRemaining = Math.min(
                                distanceTo(bufend, newPos),
                                Math.min(
                                        distanceTo(newPos, serBound),
                                        softLimit)) - 2 * METASIZE;
                    }
                }
            } while (false);
        } finally {
            spillLock.unlock();
        }
    }
}
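The kvbuffer, softLimit and spill decisions above are all driven by two ordinary job settings; a hedged sketch of tuning them (the values are examples; the defaults are 100 MB and 0.80):
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.Job;

public class SortBufferExample {
    public static Job newJob() throws Exception {
        Configuration conf = new Configuration();
        conf.setInt("mapreduce.task.io.sort.mb", 200);            // size of kvbuffer in MB
        conf.setFloat("mapreduce.map.sort.spill.percent", 0.80f); // softLimit as a fraction of the buffer
        return Job.getInstance(conf);
    }
}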
7.1.3.2.4 flush
public void flush() throws IOException, ClassNotFoundException, InterruptedException {
    sortAndSpill();
    mergeParts();
}
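sortAndSpill() and mergeParts() are also where an optional Combiner runs on map output before it is shuffled. A hedged sketch of enabling one from the Driver (reusing a reducer as the combiner, as below with the MR2_Reducer shown later, is only valid when its input and output types both match the map output types):
import org.apache.hadoop.mapreduce.Job;

public class CombinerExample {
    public static void enableCombiner(Job job) {
        // Pre-aggregate map output locally before it is written and shuffled.
        job.setCombinerClass(MR2_Reducer.class);
        // Only run the combiner during the final merge if at least this many spill files exist.
        job.getConfiguration().setInt("mapreduce.map.combine.minspills", 3);
    }
}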
7.2 ReduceTaskRunnable
public void run() {
    try {
        // Abridged: setup mirrors MapTaskRunnable; the key step is running the ReduceTask.
        reduce.run(localConf, Job.this);
    } catch (Throwable t) {
        this.storedException = t;
    }
}
7.2.1 run
public void run(JobConf job, final TaskUmbilicalProtocol umbilical) {
    job.setBoolean(JobContext.SKIP_RECORDS, isSkipping());
    if (isMapOrReduce()) {
        // A reduce task moves through three phases: copy (shuffle), sort, reduce.
        copyPhase = getProgress().addPhase("copy");
        sortPhase = getProgress().addPhase("sort");
        reducePhase = getProgress().addPhase("reduce");
    }
    TaskReporter reporter = startReporter(umbilical);
    boolean useNewApi = job.getUseNewReducer();
    initialize(job, getJobID(), reporter, useNewApi);
    if (jobCleanup) {
        runJobCleanupTask(umbilical, reporter);
        return;
    }
    if (jobSetup) {
        runJobSetupTask(umbilical, reporter);
        return;
    }
    if (taskCleanup) {
        runTaskCleanupTask(umbilical, reporter);
        return;
    }
    codec = initCodec();
    RawKeyValueIterator rIter = null;
    ShuffleConsumerPlugin shuffleConsumerPlugin = null;
    Class combinerClass = conf.getCombinerClass();
    CombineOutputCollector combineCollector =
            (null != combinerClass) ?
                    new CombineOutputCollector(reduceCombineOutputCounter, reporter, conf) : null;
    // The copy phase: fetch and merge map outputs via the configured ShuffleConsumerPlugin.
    Class<? extends ShuffleConsumerPlugin> clazz =
            job.getClass(MRConfig.SHUFFLE_CONSUMER_PLUGIN, Shuffle.class, ShuffleConsumerPlugin.class);
    shuffleConsumerPlugin = ReflectionUtils.newInstance(clazz, job);
    LOG.info("Using ShuffleConsumerPlugin: " + shuffleConsumerPlugin);
    ShuffleConsumerPlugin.Context shuffleContext =
            new ShuffleConsumerPlugin.Context(getTaskID(), job, FileSystem.getLocal(job), umbilical,
                    super.lDirAlloc, reporter, codec,
                    combinerClass, combineCollector,
                    spilledRecordsCounter, reduceCombineInputCounter,
                    shuffledMapsCounter,
                    reduceShuffleBytes, failedShuffleCounter,
                    mergedMapOutputsCounter,
                    taskStatus, copyPhase, sortPhase, this,
                    mapOutputFile, localMapFiles);
    shuffleConsumerPlugin.init(shuffleContext);
    rIter = shuffleConsumerPlugin.run();
    mapOutputFilesOnDisk.clear();
    sortPhase.complete();
    setPhase(TaskStatus.Phase.REDUCE);
    statusUpdate(umbilical);
    Class keyClass = job.getMapOutputKeyClass();
    Class valueClass = job.getMapOutputValueClass();
    // The grouping comparator decides which keys are fed to a single reduce() call.
    RawComparator comparator = job.getOutputValueGroupingComparator();
    if (useNewApi) {
        runNewReducer(job, umbilical, reporter, rIter, comparator, keyClass, valueClass);
    } else {
        runOldReducer(job, umbilical, reporter, rIter, comparator, keyClass, valueClass);
    }
    shuffleConsumerPlugin.close();
    done(umbilical, reporter);
}
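The comparator obtained from job.getOutputValueGroupingComparator() controls which consecutive sorted keys are grouped into one reduce() call. A hedged sketch of plugging in a custom grouping comparator (the class name and grouping rule are illustrative):
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;

public class FirstFieldGroupingComparator extends WritableComparator {

    public FirstFieldGroupingComparator() {
        super(Text.class, true); // create key instances so the object-based compare() is used
    }

    @Override
    public int compare(WritableComparable a, WritableComparable b) {
        // Group keys that share the text before the first tab into one reduce() call.
        String ka = a.toString().split("\t")[0];
        String kb = b.toString().split("\t")[0];
        return ka.compareTo(kb);
    }
}
// In the Driver: job.setGroupingComparatorClass(FirstFieldGroupingComparator.class);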
7.2.2 runNewReducer
private <INKEY, INVALUE, OUTKEY, OUTVALUE> void runNewReducer(JobConf job,
        final TaskUmbilicalProtocol umbilical,
        final TaskReporter reporter,
        RawKeyValueIterator rIter,
        RawComparator<INKEY> comparator,
        Class<INKEY> keyClass,
        Class<INVALUE> valueClass
        ) throws IOException, InterruptedException, ClassNotFoundException {
    org.apache.hadoop.mapreduce.TaskAttemptContext taskContext =
            new org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl(job,
                    getTaskID(), reporter);
    org.apache.hadoop.mapreduce.Reducer<INKEY, INVALUE, OUTKEY, OUTVALUE> reducer =
            (org.apache.hadoop.mapreduce.Reducer<INKEY, INVALUE, OUTKEY, OUTVALUE>)
                    ReflectionUtils.newInstance(taskContext.getReducerClass(), job);
    org.apache.hadoop.mapreduce.RecordWriter<OUTKEY, OUTVALUE> trackedRW =
            new NewTrackingRecordWriter<OUTKEY, OUTVALUE>(this, taskContext);
    job.setBoolean("mapred.skip.on", isSkipping());
    job.setBoolean(JobContext.SKIP_RECORDS, isSkipping());
    org.apache.hadoop.mapreduce.Reducer.Context reducerContext =
            createReduceContext(reducer, job, getTaskID(),
                    rIter, reduceInputKeyCounter,
                    reduceInputValueCounter,
                    trackedRW,
                    committer,
                    reporter, comparator, keyClass,
                    valueClass);
    try {
        reducer.run(reducerContext);
    } finally {
        trackedRW.close(reducerContext);
    }
}
7.2.2.1 run
public void run(Context context) throws IOException, InterruptedException {
    setup(context);
    try {
        while (context.nextKey()) {
            reduce(context.getCurrentKey(), context.getValues(), context);
            // If a backup store was used for the values, reset it.
            Iterator<VALUEIN> iter = context.getValues().iterator();
            if (iter instanceof ReduceContext.ValueIterator) {
                ((ReduceContext.ValueIterator<VALUEIN>) iter).resetBackupStore();
            }
        }
    } finally {
        cleanup(context);
    }
}
public class MR2_Reducer extends Reducer<Text, MR2_Writable, Text, MR2_Writable> {
    @Override
    protected void reduce(Text key, Iterable<MR2_Writable> values, Context context)
            throws IOException, InterruptedException {
        long sum_upFlow = 0;
        long sum_downFlow = 0;
        for (MR2_Writable flowBean : values) {
            sum_upFlow += flowBean.getUpFlow();
            sum_downFlow += flowBean.getDownFlow();
        }
        MR2_Writable resultBean = new MR2_Writable(sum_upFlow, sum_downFlow);
        context.write(key, resultBean);
    }
}
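The MR2_Reducer above depends on an MR2_Writable bean that is not shown. A hedged sketch of what it would need to provide (the getters and constructor are inferred from the reducer; everything else is an assumption):
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import org.apache.hadoop.io.Writable;

public class MR2_Writable implements Writable {

    private long upFlow;
    private long downFlow;

    public MR2_Writable() { }                    // no-arg constructor required for reflection

    public MR2_Writable(long upFlow, long downFlow) {
        this.upFlow = upFlow;
        this.downFlow = downFlow;
    }

    public long getUpFlow()   { return upFlow; }
    public long getDownFlow() { return downFlow; }

    @Override
    public void write(DataOutput out) throws IOException {
        out.writeLong(upFlow);
        out.writeLong(downFlow);
    }

    @Override
    public void readFields(DataInput in) throws IOException {
        // Fields must be read in the same order they were written.
        upFlow = in.readLong();
        downFlow = in.readLong();
    }

    @Override
    public String toString() {
        return upFlow + "\t" + downFlow;
    }
}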
7.2.2.2 write
private RecordWriter<KEYOUT, VALUEOUT> output;

public TaskInputOutputContextImpl(Configuration conf, TaskAttemptID taskid,
        RecordWriter<KEYOUT, VALUEOUT> output,
        OutputCommitter committer,
        StatusReporter reporter) {
    super(conf, taskid, reporter);
    this.output = output;
    this.committer = committer;
}

public void write(KEYOUT key, VALUEOUT value) throws IOException, InterruptedException {
    output.write(key, value);
}
7.2.2.2.1 NewTrackingRecordWriter
NewTrackingRecordWriter(ReduceTask reduce, TaskAttemptContext taskContext) {
    this.outputRecordCounter = reduce.reduceOutputCounter;
    this.fileOutputByteCounter = reduce.fileOutputByteCounter;
    List<Statistics> matchedStats = null;
    if (reduce.outputFormat instanceof org.apache.hadoop.mapreduce.lib.output.FileOutputFormat) {
        matchedStats = getFsStatistics(org.apache.hadoop.mapreduce.lib.output.FileOutputFormat
                .getOutputPath(taskContext), taskContext.getConfiguration());
    }
    fsStats = matchedStats;
    long bytesOutPrev = getOutputBytes(fsStats);
    // The wrapped writer comes from the OutputFormat configured in the Driver.
    this.real = (org.apache.hadoop.mapreduce.RecordWriter<K, V>) reduce.outputFormat
            .getRecordWriter(taskContext);
    long bytesOutCurr = getOutputBytes(fsStats);
    fileOutputByteCounter.increment(bytesOutCurr - bytesOutPrev);
}
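reduce.outputFormat.getRecordWriter(taskContext) ultimately calls the OutputFormat configured in the Driver (SequenceFileOutputFormat in section 1). For comparison, a hedged sketch of a minimal custom FileOutputFormat that would be picked up the same way (the class name and output layout are illustrative):
import java.io.IOException;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class SimpleTextOutputFormat extends FileOutputFormat<Text, Text> {

    @Override
    public RecordWriter<Text, Text> getRecordWriter(TaskAttemptContext context)
            throws IOException, InterruptedException {
        // One output file per reduce task, placed in the task's work directory.
        Path file = getDefaultWorkFile(context, ".txt");
        final FSDataOutputStream out =
                file.getFileSystem(context.getConfiguration()).create(file);
        return new RecordWriter<Text, Text>() {
            @Override
            public void write(Text key, Text value) throws IOException {
                out.writeBytes(key.toString() + "\t" + value.toString() + "\n");
            }

            @Override
            public void close(TaskAttemptContext c) throws IOException {
                out.close();
            }
        };
    }
}
// In the Driver: job.setOutputFormatClass(SimpleTextOutputFormat.class);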