8. MapReduce Source Code Walkthrough (In Depth)

1. Split

job.waitForCompletion(true);

org.apache.hadoop.mapreduce.Job#waitForCompletion

// Submit the job to the cluster and wait for it to finish.
// Check the current job state
if (state == JobState.DEFINE) {
    //---------------------------------------------------------------- Key code
    submit();
}
// Monitor the running status of the job
if (verbose) {
    // Monitor a job and print status in real-time as progress is made and tasks fail.
    monitorAndPrintJob();
}
// Return whether the job finished successfully
return isSuccessful();
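For context, here is a minimal driver sketch of how waitForCompletion(true) is usually reached. WordCountDriver, WordCountMapper and WordCountReducer are hypothetical class names used only for illustration, not taken from the article's project:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class WordCountDriver {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf, "word count");   // job starts in JobState.DEFINE
        job.setJarByClass(WordCountDriver.class);
        job.setMapperClass(WordCountMapper.class);        // hypothetical Mapper (sketched later)
        job.setReducerClass(WordCountReducer.class);      // hypothetical Reducer (sketched later)
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        // The call traced above: submit() + monitorAndPrintJob()
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}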

org.apache.hadoop.mapreduce.Job#submit

// Make sure the job is still in the DEFINE state
ensureState(JobState.DEFINE);
// Choose between the MapReduce 1.x and 2.x APIs (2.x reworked many of the 1.x methods)
setUseNewAPI();
// Connect to the cluster the job will run on
connect();
// Provides a way to access information about the map/reduce cluster.
cluster = new Cluster(getConfiguration());
// Create the job submitter
final JobSubmitter submitter = getJobSubmitter(cluster.getFileSystem(), cluster.getClient());
// Submit the job to the system for execution
//---------------------------------------------------------------- Key code
// Internal method for submitting jobs to the system
status = submitter.submitJobInternal(Job.this, cluster);
// Mark the job state as RUNNING
state = JobState.RUNNING;

org.apache.hadoop.mapreduce.JobSubmitter#submitJobInternal

// Validate the job's output specification
checkSpecs(job);
// Generate and set a new JobID
JobID jobId = submitClient.getNewJobID();
job.setJobID(jobId);
// Get the job's submit (staging) directory
Path submitJobDir = new Path(jobStagingArea, jobId.toString());
//---------------------------------------------------------------- Key code
// Create the splits for the job (line 197)
int maps = writeSplits(job, submitJobDir);
// Set the number of map tasks, which equals the number of splits
conf.setInt(MRJobConfig.NUM_MAPS, maps);

org.apache.hadoop.mapreduce.JobSubmitter#writeSplits

//---------------------------------------------------------------- Key code
// Use the new API
maps = writeNewSplits(job, jobSubmitDir);

org.apache.hadoop.mapreduce.JobSubmitter#writeNewSplits

// Get the job configuration
Configuration conf = job.getConfiguration();
//---------------------------------------------------------------- Key code: InputFormat
// Get the input format (org.apache.hadoop.mapreduce.lib.input.TextInputFormat by default)
InputFormat<?, ?> input =
    ReflectionUtils.newInstance(job.getInputFormatClass(), conf);
//---------------------------------------------------------------- Key code: getSplits
List<InputSplit> splits = input.getSplits(job);
// Convert the List to an array
T[] array = (T[]) splits.toArray(new InputSplit[splits.size()]);
// sort the splits into order based on size, so that the biggest go first
Arrays.sort(array, new SplitComparator());
// Write the split files for the job
JobSplitWriter.createSplitFiles(jobSubmitDir, conf, jobSubmitDir.getFileSystem(conf), array);
// Return the number of splits
return array.length;

org.apache.hadoop.mapreduce.task.JobContextImpl#getInputFormatClass

// Return the configured InputFormat class, defaulting to TextInputFormat
return (Class<? extends InputFormat<?,?>>)
    conf.getClass(INPUT_FORMAT_CLASS_ATTR, TextInputFormat.class);
// getClass returns the configured value if one is set, otherwise the default
getClass(String name, Class<?> defaultValue)
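If no InputFormat is configured, TextInputFormat is the default. A short sketch of how a driver overrides this default (assuming a Job instance named job; KeyValueTextInputFormat is just one built-in alternative):

// Sets INPUT_FORMAT_CLASS_ATTR, so getInputFormatClass() no longer falls back
// to TextInputFormat.class; any InputFormat subclass can be used here.
job.setInputFormatClass(
    org.apache.hadoop.mapreduce.lib.input.KeyValueTextInputFormat.class);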

org.apache.hadoop.mapreduce.lib.input.FileInputFormat#getSplits
public class TextInputFormat extends FileInputFormat<LongWritable, Text>

// Generate the list of files and make them into FileSplits.
// Math.max(1, 1)
// getFormatMinSplitSize(): a split must contain at least 1 byte
// getMinSplitSize(job): the user-configured minimum split size, defaulting to 1 if unset
long minSize = Math.max(getFormatMinSplitSize(), getMinSplitSize(job));
// The user-configured maximum split size, defaulting to Long.MAX_VALUE if unset
long maxSize = getMaxSplitSize(job);
// Create a List to hold the splits
List<InputSplit> splits = new ArrayList<InputSplit>();
// Get the list of input files to process
List<FileStatus> files = listStatus(job);
// Iterate over the input files
for (FileStatus file : files) {
    // File path
    Path path = file.getPath();
    // File length in bytes
    long length = file.getLen();
    // If the file is not empty
    if (length != 0) {
        // Get the block locations of the file
        BlockLocation[] blkLocations;
        if (file instanceof LocatedFileStatus) {
            blkLocations = ((LocatedFileStatus) file).getBlockLocations();
        } else {
            FileSystem fs = path.getFileSystem(job.getConfiguration());
            blkLocations = fs.getFileBlockLocations(file, 0, length);
        }
        // Check whether the file can be split
        if (isSplitable(job, path)) {
            // Block size
            long blockSize = file.getBlockSize();
            // The default split size equals the block size (128M)
            // blockSize = 128M, minSize = 1 byte, maxSize = Long.MAX_VALUE bytes
            // return Math.max(minSize, Math.min(maxSize, blockSize));
            // minSize 64M  ----> split size 128M
            // minSize 256M ----> split size 256M
            // maxSize 64M  ----> split size 64M
            // maxSize 256M ----> split size 128M
            long splitSize = computeSplitSize(blockSize, minSize, maxSize);
            // Track the bytes still to be assigned to splits (e.g. 256M)
            long bytesRemaining = length;
            // Keep splitting while the remainder exceeds the SPLIT_SLOP threshold
            // private static final double SPLIT_SLOP = 1.1
            while (((double) bytesRemaining) / splitSize > SPLIT_SLOP) {
                int blkIndex = getBlockIndex(blkLocations, length - bytesRemaining);
                // makeSplit is the factory method that creates a split;
                // each split is added to the List
                // org.apache.hadoop.mapreduce.lib.input.FileInputFormat#makeSplit
                splits.add(makeSplit(path, length - bytesRemaining, splitSize,
                        blkLocations[blkIndex].getHosts(),
                        blkLocations[blkIndex].getCachedHosts()));
                // After each split, subtract the bytes just consumed
                bytesRemaining -= splitSize;
            }
            // Check whether any bytes remain
            // The last split's size falls in the range (0, 1.1] * splitSize
            if (bytesRemaining != 0) {
                int blkIndex = getBlockIndex(blkLocations, length - bytesRemaining);
                splits.add(makeSplit(path, length - bytesRemaining, bytesRemaining,
                        blkLocations[blkIndex].getHosts(),
                        blkLocations[blkIndex].getCachedHosts()));
            }
        } else { // not splitable
            // If the file cannot be split, make the whole file a single split
            splits.add(makeSplit(path, 0, length,
                    blkLocations[0].getHosts(),
                    blkLocations[0].getCachedHosts()));
        }
    } else {
        // Create empty hosts array for zero length files
        splits.add(makeSplit(path, 0, length, new String[0]));
    }
}
// Save the number of input files for metrics/loadgen
job.getConfiguration().setLong(NUM_INPUT_FILES, files.size());
// Return the list of splits
return splits;
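A small standalone sketch of the split-size arithmetic above, mirroring computeSplitSize and the example values in the comments (the driver-side setters at the end are the usual way to change minSize/maxSize):

public class SplitSizeDemo {
    // Mirrors FileInputFormat#computeSplitSize:
    static long computeSplitSize(long blockSize, long minSize, long maxSize) {
        return Math.max(minSize, Math.min(maxSize, blockSize));
    }

    public static void main(String[] args) {
        long blockSize = 128L << 20;                                                       // 128M block
        System.out.println(computeSplitSize(blockSize, 64L << 20, Long.MAX_VALUE) >> 20);  // 128
        System.out.println(computeSplitSize(blockSize, 256L << 20, Long.MAX_VALUE) >> 20); // 256
        System.out.println(computeSplitSize(blockSize, 1L, 64L << 20) >> 20);              // 64
        System.out.println(computeSplitSize(blockSize, 1L, 256L << 20) >> 20);             // 128
    }
}

// In a driver the two bounds are normally set through FileInputFormat, e.g.:
// FileInputFormat.setMinInputSplitSize(job, 256L << 20);
// FileInputFormat.setMaxInputSplitSize(job, 64L << 20);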

2. MapTask

org.apache.hadoop.mapred.MapTask#run

// Use the new API
boolean useNewApi = job.getUseNewMapper();
//---------------------------------------------------------------- Key code: initialize
// Initialize the MapTask
initialize(job, getJobID(), reporter, useNewApi);
//---------------------------------------------------------------- Key code: runNewMapper
// Start running the task
runNewMapper(job, splitMetaInfo, umbilical, reporter);

org.apache.hadoop.mapred.Task#initialize

// Job context
jobContext = new JobContextImpl(job, id, reporter);
// Task attempt context
taskContext = new TaskAttemptContextImpl(job, taskId, reporter);
// Create the OutputFormat used to write the results
outputFormat =
    ReflectionUtils.newInstance(taskContext.getOutputFormatClass(), job);
// The actual OutputFormat class comes from
// org.apache.hadoop.mapreduce.task.JobContextImpl#getOutputFormatClass
return (Class<? extends OutputFormat<?,?>>)
    conf.getClass(OUTPUT_FORMAT_CLASS_ATTR, TextOutputFormat.class);
// Create the output committer for the task
committer = outputFormat.getOutputCommitter(taskContext);
// The actual committer object, from
// org.apache.hadoop.mapreduce.lib.output.FileOutputFormat#getOutputCommitter
committer = new FileOutputCommitter(output, context);
// Get the output path
Path outputPath = FileOutputFormat.getOutputPath(conf);

org.apache.hadoop.mapred.MapTask#runNewMapper

// make a task context so we can get the classes
org.apache.hadoop.mapreduce.TaskAttemptContext taskContext =
    new org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl(job, getTaskID(), reporter);
// make a mapper -- com.yjx.wordcount.WordCountMapper
org.apache.hadoop.mapreduce.Mapper<INKEY,INVALUE,OUTKEY,OUTVALUE> mapper =
    (org.apache.hadoop.mapreduce.Mapper<INKEY,INVALUE,OUTKEY,OUTVALUE>)
        ReflectionUtils.newInstance(taskContext.getMapperClass(), job);
// make the input format -- org.apache.hadoop.mapreduce.lib.input.TextInputFormat
org.apache.hadoop.mapreduce.InputFormat<INKEY,INVALUE> inputFormat =
    (org.apache.hadoop.mapreduce.InputFormat<INKEY,INVALUE>)
        ReflectionUtils.newInstance(taskContext.getInputFormatClass(), job);
// rebuild the input split
org.apache.hadoop.mapreduce.InputSplit split = null;
split = getSplitDetails(new Path(splitIndex.getSplitLocation()), splitIndex.getStartOffset());
// Create the record reader
org.apache.hadoop.mapreduce.RecordReader<INKEY,INVALUE> input =
    new NewTrackingRecordReader<INKEY,INVALUE>(split, inputFormat, reporter, taskContext);
// The underlying (real) reader is created in
// org.apache.hadoop.mapred.MapTask.NewTrackingRecordReader#NewTrackingRecordReader
this.real = inputFormat.createRecordReader(split, taskContext);
// The InputFormat creates the reader in
// org.apache.hadoop.mapreduce.lib.input.TextInputFormat#createRecordReader
return new LineRecordReader(recordDelimiterBytes);
// Create the record writer
org.apache.hadoop.mapreduce.RecordWriter output = null;
output = new NewOutputCollector(taskContext, job, umbilical, reporter);
// Create the map context
org.apache.hadoop.mapreduce.MapContext<INKEY, INVALUE, OUTKEY, OUTVALUE> mapContext =
    new MapContextImpl<INKEY, INVALUE, OUTKEY, OUTVALUE>(job, getTaskID(),
        input, output, committer, reporter, split);
// Wrap mapContext
org.apache.hadoop.mapreduce.Mapper<INKEY,INVALUE,OUTKEY,OUTVALUE>.Context mapperContext =
    new WrappedMapper<INKEY, INVALUE, OUTKEY, OUTVALUE>().getMapContext(mapContext);
// Initialize the reader with the split info
input.initialize(split, mapperContext);
// Run the Mapper, i.e. the user's own Mapper implementation
mapper.run(mapperContext);
mapPhase.complete();
setPhase(TaskStatus.Phase.SORT);
statusUpdate(umbilical);
// Close the input
input.close();
input = null;
// Close the output (flush the remaining buffer contents and merge the spill files)
output.close(mapperContext);
output = null;

org.apache.hadoop.mapred.MapTask.NewTrackingRecordReader#initialize

// Delegate initialization to LineRecordReader
real.initialize(split, context);

org.apache.hadoop.mapreduce.lib.input.LineRecordReader#initialize

// Get the split
FileSplit split = (FileSplit) genericSplit;
// Configuration
Configuration job = context.getConfiguration();
// Maximum number of bytes allowed for a single line
this.maxLineLength = job.getInt(MAX_LINE_LENGTH, Integer.MAX_VALUE);
// Start and end offsets of the split
start = split.getStart();
end = start + split.getLength();
// File path
final Path file = split.getPath();
// open the file and seek to the start of the split
final FileSystem fs = file.getFileSystem(job);
fileIn = fs.open(file);
// Seek to the start of the split
fileIn.seek(start);
// Create the line reader
in = new UncompressedSplitLineReader(fileIn, job,
    this.recordDelimiterBytes, split.getLength());
filePosition = fileIn;
// If this is not the first split, we always throw away first record
// because we always (except the last split) read one extra line in
// next() method.
if (start != 0) {
start += in.readLine(new Text(), 0, maxBytesToConsume(start));
}
this.pos = start;

org.apache.hadoop.mapreduce.Mapper#run

// Initialization hook
setup(context);
try {
    // nextKeyValue(): (1) checks whether another record exists, (2) sets the key, (3) sets the value
    while (context.nextKeyValue()) {
        // The arguments are the current key, the current value, and the context
        map(context.getCurrentKey(), context.getCurrentValue(), context);
    }
} finally {
    // Cleanup hook
    cleanup(context);
}
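The map() invoked inside this loop is the user's own implementation. As an illustration only (not the article's exact class), a WordCount-style Mapper matching the LongWritable/Text input produced by LineRecordReader might look like this:

import java.io.IOException;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

// Illustrative sketch: key = line offset from LineRecordReader, value = one line of text;
// one (word, 1) pair is emitted per token via context.write(), which ends up in
// NewOutputCollector#write shown later.
public class WordCountMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
    private static final IntWritable ONE = new IntWritable(1);
    private final Text word = new Text();

    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        for (String token : value.toString().split("\\s+")) {
            if (!token.isEmpty()) {
                word.set(token);
                context.write(word, ONE);
            }
        }
    }
}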

org.apache.hadoop.mapreduce.lib.input.LineRecordReader#nextKeyValue

// Key: the byte offset of the line
key = new LongWritable();
// Set the key to the position where this read starts
key.set(pos);
// Value: one line of data
value = new Text();
// We always read one extra line; read one line of data
if (pos == 0) {
    newSize = skipUtfByteOrderMark();
} else {
    newSize = in.readLine(value, maxLineLength, maxBytesToConsume(pos));
    // Position of the next read
    pos += newSize;
}

org.apache.hadoop.mapreduce.lib.input.LineRecordReader#skipUtfByteOrderMark

// Read the first line, skipping the UTF byte order mark if present
int newSize = in.readLine(value, newMaxLineLength, maxBytesToConsume(pos));
pos += newSize;

3. KvBuffer

org.apache.hadoop.mapred.MapTask.NewOutputCollector#NewOutputCollector

// Create the collector
collector = createSortingCollector(job, reporter);
// The number of partitions equals the number of reduce tasks
partitions = jobContext.getNumReduceTasks();
if (partitions > 1) {
    partitioner = (org.apache.hadoop.mapreduce.Partitioner<K, V>)
        ReflectionUtils.newInstance(jobContext.getPartitionerClass(), job);
} else {
    partitioner = new org.apache.hadoop.mapreduce.Partitioner<K, V>() {
        @Override
        public int getPartition(K key, V value, int numPartitions) {
            return partitions - 1;
        }
    };
}

org.apache.hadoop.mapred.MapTask#createSortingCollector

// Create the context
MapOutputCollector.Context context =
    new MapOutputCollector.Context(this, job, reporter);
// Get the collector class
Class<?>[] collectorClasses = job.getClasses(
    JobContext.MAP_OUTPUT_COLLECTOR_CLASS_ATTR, MapOutputBuffer.class);
// Get the MapOutputCollector subclass
Class<? extends MapOutputCollector> subclazz =
    clazz.asSubclass(MapOutputCollector.class);
// Create a collector via reflection -- org.apache.hadoop.mapred.MapTask.MapOutputBuffer
MapOutputCollector<KEY, VALUE> collector =
    ReflectionUtils.newInstance(subclazz, job);
// Initialize the collector
collector.init(context);
// Finally return the created collector
return collector;

org.apache.hadoop.mapred.MapTask.MapOutputBuffer#init

// Spill threshold (default 0.8, i.e. 80%)
final float spillper = job.getFloat(JobContext.MAP_SORT_SPILL_PERCENT, (float)0.8);
// Size of the in-memory sort buffer (default 100M)
final int sortmb = job.getInt(JobContext.IO_SORT_MB, 100);
// Index cache memory limit (default 1024*1024 bytes)
indexCacheMemoryLimit = job.getInt(JobContext.INDEX_CACHE_MEMORY_LIMIT,
    INDEX_CACHE_MEMORY_LIMIT_DEFAULT);
// Get the sorter -- QuickSort by default
sorter = ReflectionUtils.newInstance(job.getClass("map.sort.class",
    QuickSort.class, IndexedSorter.class), job);
// Buffer capacity in bytes (100M)
int maxMemUsage = sortmb << 20;
// Round down to a multiple of METASIZE (16 bytes)
maxMemUsage -= maxMemUsage % METASIZE;
// The circular buffer
kvbuffer = new byte[maxMemUsage];
// Initialize the kvbuffer bookkeeping
bufvoid = kvbuffer.length;
kvmeta = ByteBuffer.wrap(kvbuffer).order(ByteOrder.nativeOrder()).asIntBuffer();
setEquator(0);
bufstart = bufend = bufindex = equator;
kvstart = kvend = kvindex;
maxRec = kvmeta.capacity() / NMETA;
softLimit = (int)(kvbuffer.length * spillper);
bufferRemaining = softLimit;
// Get the key comparator
comparator = job.getOutputKeyComparator();
// Get the map output key and value classes
keyClass = (Class<K>)job.getMapOutputKeyClass();
valClass = (Class<V>)job.getMapOutputValueClass();
// Serializers for the key and the value
keySerializer = serializationFactory.getSerializer(keyClass);
keySerializer.open(bb);
valSerializer = serializationFactory.getSerializer(valClass);
valSerializer.open(bb);
// Create the spill thread and let it wait; spilling starts once the threshold is reached
spillInProgress = false;
minSpillsForCombine = job.getInt(JobContext.MAP_COMBINE_MIN_SPILLS, 3);
spillThread.setDaemon(true);
spillThread.setName("SpillThread");
spillLock.lock();
try {
    spillThread.start();
    while (!spillThreadRunning) {
        spillDone.await();
    }
}
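The buffer size and spill threshold read above come from the job configuration; a short sketch of the corresponding driver-side settings (assuming a Job instance named job; the property names are the Hadoop 2.x keys behind IO_SORT_MB and MAP_SORT_SPILL_PERCENT, and the values are only examples):

Configuration conf = job.getConfiguration();
// JobContext.IO_SORT_MB ("mapreduce.task.io.sort.mb"), default 100 (MB)
conf.setInt("mapreduce.task.io.sort.mb", 200);
// JobContext.MAP_SORT_SPILL_PERCENT ("mapreduce.map.sort.spill.percent"), default 0.8
conf.setFloat("mapreduce.map.sort.spill.percent", 0.9f);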

org.apache.hadoop.mapred.JobConf#getOutputKeyComparator

// Get the configured comparator class
Class<? extends RawComparator> ts = getClass(JobContext.KEY_COMPARATOR,
    null, RawComparator.class);
// If a custom comparator was configured, instantiate it
if (ts != null)
    return ReflectionUtils.newInstance(ts, this);
// Otherwise fall back to the WritableComparator registered for the map output key class
return WritableComparator.get(
    getMapOutputKeyClass().asSubclass(WritableComparable.class), this);
// The default comparator object -- org.apache.hadoop.io.WritableComparator
comparator = new WritableComparator(c, conf, true);
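JobContext.KEY_COMPARATOR is what job.setSortComparatorClass(...) sets. As a hedged sketch (the class name and the descending order are purely illustrative), a custom sort comparator could look like this:

import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;

// Illustrative: sorts Text map output keys in descending instead of ascending order.
public class DescendingTextComparator extends WritableComparator {
    public DescendingTextComparator() {
        super(Text.class, true);   // true => keys are instantiated for compare()
    }

    @Override
    public int compare(WritableComparable a, WritableComparable b) {
        return -super.compare(a, b);
    }
}

// In the driver: job.setSortComparatorClass(DescendingTextComparator.class);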

org.apache.hadoop.mapreduce.task.JobContextImpl#getPartitionerClass

// Return the configured partitioner class, defaulting to HashPartitioner
return (Class<? extends Partitioner<?,?>>)
    conf.getClass(PARTITIONER_CLASS_ATTR, HashPartitioner.class);
// The partitioning logic itself:
// org.apache.hadoop.mapreduce.lib.partition.HashPartitioner#getPartition
return (key.hashCode() & Integer.MAX_VALUE) % numReduceTasks;
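A custom partitioner registered with job.setPartitionerClass(...) replaces HashPartitioner. A sketch only, assuming Text keys and an illustrative routing rule:

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Partitioner;

// Illustrative: words starting with a-m go to partition 0; the rest are spread over the
// remaining reducers using the same hash trick HashPartitioner uses.
public class FirstLetterPartitioner extends Partitioner<Text, IntWritable> {
    @Override
    public int getPartition(Text key, IntWritable value, int numReduceTasks) {
        if (numReduceTasks <= 1) {
            return 0;
        }
        String s = key.toString();
        char c = s.isEmpty() ? 'z' : Character.toLowerCase(s.charAt(0));
        if (c >= 'a' && c <= 'm') {
            return 0;
        }
        return 1 + (key.hashCode() & Integer.MAX_VALUE) % (numReduceTasks - 1);
    }
}

// In the driver: job.setPartitionerClass(FirstLetterPartitioner.class);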

4. Spill

org.apache.hadoop.mapred.MapTask.NewOutputCollector#write

// Collect the key/value pair together with its partition number
collector.collect(key, value, partitioner.getPartition(key, value, partitions));

org.apache.hadoop.mapred.MapTask.MapOutputBuffer#collect

// Reserve space for the metadata record (METASIZE bytes)
bufferRemaining -= METASIZE;
// Check whether a spill is needed and, if so, prepare for it
// If a spill is needed, wake up the SpillThread, whose run() starts sortAndSpill:
// org.apache.hadoop.mapred.MapTask.MapOutputBuffer#sortAndSpill
// Otherwise the serialized key/value and its metadata are written into the kvbuffer
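If a combiner is configured via job.setCombinerClass(...), sortAndSpill applies it per partition before each spill file is written (and the merge step may apply it again when there are at least MAP_COMBINE_MIN_SPILLS spill files). A sketch, reusing the hypothetical WordCountReducer from this walkthrough:

// Summing word counts is associative and commutative, so the reducer class can
// double as the combiner (WordCountReducer is the hypothetical class sketched later).
job.setCombinerClass(WordCountReducer.class);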

5. Merge

org.apache.hadoop.mapred.MapTask.MapOutputBuffer#flush

// Spill whatever is left in the buffer (even below the 80% threshold) to disk
sortAndSpill();
// Merge the previously spilled files
mergeParts();
// The map task is ready to move on to the next phase
sortPhase.startNextPhase();

6. ReduceTask

org.apache.hadoop.mapred.ReduceTask#run

// Initialize the task
initialize(job, getJobID(), reporter, useNewApi);
// Iterator over the shuffled key/value pairs
RawKeyValueIterator rIter = null;
// Get the combiner class, if any, and build a collector for it
Class combinerClass = conf.getCombinerClass();
CombineOutputCollector combineCollector =
    (null != combinerClass) ?
        new CombineOutputCollector(reduceCombineOutputCounter, reporter, conf) : null;
// Create a Shuffle plugin
Class<? extends ShuffleConsumerPlugin> clazz =
    job.getClass(MRConfig.SHUFFLE_CONSUMER_PLUGIN, Shuffle.class, ShuffleConsumerPlugin.class);
shuffleConsumerPlugin = ReflectionUtils.newInstance(clazz, job);
// Create a context object and initialize the Shuffle plugin
ShuffleConsumerPlugin.Context shuffleContext =
    new ShuffleConsumerPlugin.Context(getTaskID(), job,
        FileSystem.getLocal(job), umbilical,
        super.lDirAlloc, reporter, codec,
        combinerClass, combineCollector,
        spilledRecordsCounter,
        reduceCombineInputCounter,
        shuffledMapsCounter,
        reduceShuffleBytes, failedShuffleCounter,
        mergedMapOutputsCounter,
        taskStatus, copyPhase, sortPhase, this,
        mapOutputFile, localMapFiles);
// The merger is initialized here as well
shuffleConsumerPlugin.init(shuffleContext);
// Run the shuffle and return a key/value iterator --- MergeQueue
rIter = shuffleConsumerPlugin.run();
// Map output key and value classes
Class keyClass = job.getMapOutputKeyClass();
Class valueClass = job.getMapOutputValueClass();
// Get the grouping comparator (the reduce phase prefers the grouping comparator;
// if none is set, the sort comparator is used)
RawComparator comparator = job.getOutputValueGroupingComparator();
// Run the reduce task
runNewReducer(job, umbilical, reporter, rIter, comparator, keyClass, valueClass);
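getOutputValueGroupingComparator() returns whatever job.setGroupingComparatorClass(...) registered, falling back to the sort comparator. A hedged sketch of a grouping comparator (the "field1#field2" key layout is purely an assumption for illustration):

import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;

// Illustrative: treat keys shaped like "field1#field2" as equal when field1 matches,
// so one reduce() call sees all values whose keys share the same prefix.
public class PrefixGroupingComparator extends WritableComparator {
    public PrefixGroupingComparator() {
        super(Text.class, true);
    }

    @Override
    public int compare(WritableComparable a, WritableComparable b) {
        String left = ((Text) a).toString().split("#", 2)[0];
        String right = ((Text) b).toString().split("#", 2)[0];
        return left.compareTo(right);
    }
}

// In the driver: job.setGroupingComparatorClass(PrefixGroupingComparator.class);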

org.apache.hadoop.mapred.Task#initialize

// Job and task attempt contexts
jobContext = new JobContextImpl(job, id, reporter);
taskContext = new TaskAttemptContextImpl(job, taskId, reporter);
// The output format ---- org.apache.hadoop.mapreduce.lib.output.TextOutputFormat
outputFormat =
    ReflectionUtils.newInstance(taskContext.getOutputFormatClass(), job);
// Create the committer ---- org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter
committer = outputFormat.getOutputCommitter(taskContext);
// Get the output path
Path outputPath = FileOutputFormat.getOutputPath(conf);

org.apache.hadoop.mapreduce.task.reduce.Shuffle#run

// Start the map-completion events fetcher thread
final EventFetcher<K, V> eventFetcher =
    new EventFetcher<K, V>(reduceId, umbilical, scheduler, this, maxEventsToFetch);
eventFetcher.start();
// Check whether the map output is local to this reduce
boolean isLocal = localMapFiles != null;
// Number of fetcher threads: 1 if local, otherwise 5 by default
final int numFetchers = isLocal ? 1
    : jobConf.getInt(MRJobConfig.SHUFFLE_PARALLEL_COPIES, 5);
Fetcher<K, V>[] fetchers = new Fetcher[numFetchers];
// Start fetching map output
fetchers[0].start();
// Stop the event fetcher
eventFetcher.shutDown();
// Stop the map-output fetcher threads
for (Fetcher<K, V> fetcher : fetchers) {
    fetcher.shutDown();
}
// Obtain the key/value iterator from the merger
RawKeyValueIterator kvIter = merger.close();
return kvIter;

org.apache.hadoop.mapreduce.task.reduce.MergeManagerImpl#close

// Return the iterator over the final merge
return finalMerge(jobConf, rfs, memory, disk);

org.apache.hadoop.mapreduce.task.reduce.MergeManagerImpl#finalMerge

// Map output key/value classes
Class<K> keyClass = (Class<K>)job.getMapOutputKeyClass();
Class<V> valueClass = (Class<V>)job.getMapOutputValueClass();
// Get a comparator ---- org.apache.hadoop.io.WritableComparator
final RawComparator<K> comparator =
    (RawComparator<K>)job.getOutputKeyComparator();
// Return the key/value iterator ---- org.apache.hadoop.mapred.Merger.MergeQueue
final RawKeyValueIterator rIter = Merger.merge(job, fs,
    keyClass, valueClass, memDiskSegments, numMemDiskSegments,
    tmpDir, comparator, reporter, spilledRecordsCounter, null,
    mergePhase);

org.apache.hadoop.mapred.ReduceTask#runNewReducer

// Keep a reference to the raw iterator
final RawKeyValueIterator rawIter = rIter;
// Wrap it in an anonymous class that also reports progress
rIter = new RawKeyValueIterator() {
    public void close() throws IOException {
        rawIter.close();
    }
    public DataInputBuffer getKey() throws IOException {
        return rawIter.getKey();
    }
    public Progress getProgress() {
        return rawIter.getProgress();
    }
    public DataInputBuffer getValue() throws IOException {
        return rawIter.getValue();
    }
    public boolean next() throws IOException {
        boolean ret = rawIter.next();
        reporter.setProgress(rawIter.getProgress().getProgress());
        return ret;
    }
};
// Task attempt context for this task
org.apache.hadoop.mapreduce.TaskAttemptContext taskContext =
    new org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl(job, getTaskID(), reporter);
// The Reducer to run -- com.yjx.WordCountReducer
org.apache.hadoop.mapreduce.Reducer<INKEY, INVALUE, OUTKEY, OUTVALUE> reducer =
    (org.apache.hadoop.mapreduce.Reducer<INKEY, INVALUE, OUTKEY, OUTVALUE>)
        ReflectionUtils.newInstance(taskContext.getReducerClass(), job);
// The record writer ----
// org.apache.hadoop.mapreduce.lib.output.TextOutputFormat.LineRecordWriter
org.apache.hadoop.mapreduce.RecordWriter<OUTKEY, OUTVALUE> trackedRW =
    new NewTrackingRecordWriter<OUTKEY, OUTVALUE>(this, taskContext);
// Create the reduce context
org.apache.hadoop.mapreduce.Reducer.Context reducerContext =
    createReduceContext(reducer, job, getTaskID(),
        rIter, reduceInputKeyCounter, reduceInputValueCounter,
        trackedRW, committer, reporter, comparator, keyClass, valueClass);
// Run the reduce task
reducer.run(reducerContext);

org.apache.hadoop.mapreduce.Reducer#run

// Check whether there is another key to read; each distinct key is processed once
// (hello hello hello hi hi hi -> 2 calls to reduce())
while (context.nextKey()) {
    // context.getValues() --> private ValueIterable iterable = new ValueIterable();
    // values.iterator     --> private ValueIterator iterator = new ValueIterator();
    // iterator.hasNext    --> return firstValue || nextKeyIsSame;
    // iterator.next       --> (firstValue ? value : nextKeyValue())
    reduce(context.getCurrentKey(), context.getValues(), context);
}
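The reduce() called here is again the user's class. Purely as an illustration (not the article's exact com.yjx.WordCountReducer), a matching WordCount reducer could be:

import java.io.IOException;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

// Illustrative sketch: the Iterable walks the grouped values (driven by
// nextKeyValue()/nextKeyIsSame below) and the summed count is written out,
// ending up in TextOutputFormat.LineRecordWriter#write.
public class WordCountReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
    private final IntWritable result = new IntWritable();

    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Context context)
            throws IOException, InterruptedException {
        int sum = 0;
        for (IntWritable v : values) {
            sum += v.get();
        }
        result.set(sum);
        context.write(key, result);
    }
}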

org.apache.hadoop.mapreduce.task.ReduceContextImpl#nextKey

// hasMore indicates whether there is more data to read
if (hasMore) {
    // Read the next key/value
    return nextKeyValue();
} else {
    // All data has been processed; the reduce is done
    return false;
}

org.apache.hadoop.mapreduce.task.ReduceContextImpl#nextKeyValue

// Is this the first value for the current key?
firstValue = !nextKeyIsSame;
// Deserialize the key and the value
key = keyDeserializer.deserialize(key);
value = valueDeserializer.deserialize(value);
// Lengths of the serialized key and value
currentKeyLength = nextKey.getLength() - nextKey.getPosition();
currentValueLength = nextVal.getLength() - nextVal.getPosition();
// Write the pair to the backup store if the iterator has been marked
if (isMarked) {
    backupStore.write(nextKey, nextVal);
}
// Check whether another pair can be read next time
hasMore = input.next();
// If there is more data, determine nextKeyIsSame
if (hasMore) {
    // Peek at the next key
    nextKey = input.getKey();
    // The grouping comparator is used first; otherwise the default comparator
    nextKeyIsSame = comparator.compare(currentRawKey.getBytes(), 0,
        currentRawKey.getLength(),
        nextKey.getData(),
        nextKey.getPosition(),
        nextKey.getLength() - nextKey.getPosition()) == 0;
} else {
    // No more data, so there is no next key
    nextKeyIsSame = false;
}

org.apache.hadoop.mapreduce.lib.output.TextOutputFormat.LineRecordWriter#write

// Write the record out line by line
out.write(newline);
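The writer above comes from whichever OutputFormat the job configured. A short driver-side sketch (assuming a Job instance named job; the output path is just an example and must not already exist):

// TextOutputFormat is the default OutputFormat; these calls make the choice explicit
// and set the directory LineRecordWriter ultimately writes into.
job.setOutputFormatClass(org.apache.hadoop.mapreduce.lib.output.TextOutputFormat.class);
org.apache.hadoop.mapreduce.lib.output.FileOutputFormat.setOutputPath(
    job, new org.apache.hadoop.fs.Path("/tmp/wordcount/output"));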