Personal Notes: MapReduce
1. Definition
First, a quick summary of what MapReduce is for:
MapReduce is a computing model for processing very large volumes of data.
Its defining trait is that it splits a large dataset into pieces, computes on those pieces separately, and then aggregates the partial results. That lets machines with modest memory process datasets far larger than that memory, because the data has been cut into many small blocks; and if one machine cannot do the work, ten or a hundred can compute at the same time, which cuts the run time dramatically and makes jobs feasible that a single machine could never finish in a short time. Map stands for mapping, Reduce for reduction (aggregation).
Core idea: divide and conquer, and move the computation to where the data lives.
Why use MapReduce at all? How does it differ from other tools, say MySQL's query engine?
In earlier computing models, processing a dataset meant reading all of it into memory, computing on it there, and returning the result (merge sort being the exception). The upside is that everything happens in memory, so it is fast; the downside is just as obvious: memory is finite, and once the data outgrows it, many algorithms simply cannot be used.
The simplest remedy is to split. The input file is divided into many small blocks, and each block is handed to a Map task, which converts the raw records into (K, V) pairs so they are easy to aggregate later. No real computation happens in this phase; the data is only being reshaped into the format the computation needs, so producing a "result" here would be meaningless. The map output then passes through a circular in-memory buffer and is spilled to disk as a number of small files, and the two key concepts at this point are partitioning and sorting: each record is assigned a partition by hashing its key and taking the hash modulo the number of reducers, and during the spill the records are partitioned and quick-sorted, yielding several small files that are sorted internally but not across files. A merge sort then combines these small files. Finally, in the Reduce phase, each reduce task pulls the partition data it is responsible for, performs the actual computation on it, and writes the result to the specified path on the HDFS cluster.
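As a rough illustration of the partition step, here is a minimal sketch in plain Java with a made-up key and reducer count; Hadoop's own HashPartitioner, quoted later in the source walkthrough, uses exactly this formula.

public class PartitionSketch {
    public static void main(String[] args) {
        int numReduceTasks = 2;        // assumed number of reduce tasks
        String key = "hello";          // assumed map output key
        // hash the key and take it modulo the number of reducers
        int partition = (key.hashCode() & Integer.MAX_VALUE) % numReduceTasks;
        System.out.println(partition); // the same key always lands in the same partition
    }
}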
2. MapReduce Source Code Analysis
(This part is copied from elsewhere and kept here for reading and reference; the goal is just to grasp the flow quickly. Before reading it I worked through the small demo below, which makes the source code noticeably easier to follow.)
First, the small demo, to compare against while reading:
The demo counts how many times each word appears in an English news file named news.txt and writes the result back to HDFS. news.txt has to be uploaded to HDFS by hand, and a working MapReduce environment must be set up beforehand.
First, the driver class that sets up the counting job (the main class):
import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class WordCountJob {
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        // Load the configuration
        Configuration configuration = new Configuration(true);
        // Run in local mode
        configuration.set("mapreduce.framework.name", "local");
        // Create the job
        Job job = Job.getInstance(configuration);
        // Set the job's main class
        job.setJarByClass(WordCountJob.class);
        // Set the job name
        job.setJobName("cl-wordcount-" + System.currentTimeMillis());
        // Set the number of reduce tasks
        job.setNumReduceTasks(2);
        // Set the input path
        FileInputFormat.setInputPaths(job, new Path("/cl/news.txt"));
        // Set the output path
        FileOutputFormat.setOutputPath(job, new Path("/cl/result/wordcount_" + System.currentTimeMillis()));
        // Set the map output key and value types
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);
        // Set the Mapper and Reducer classes
        job.setMapperClass(WordCountMapper.class);
        job.setReducerClass(WordCountReducer.class);
        // Submit the job and wait for it to finish
        job.waitForCompletion(true);
    }
}
Next is the Mapper class, which processes one split of the file and converts the format (each data line arrives as a KV pair: K is the byte offset, V is the data line):
import java.io.IOException;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

public class WordCountMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
    // Reusable value object: every word is emitted with a count of 1
    private IntWritable one = new IntWritable(1);

    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        // Strip special characters
        String valueString = value.toString();
        valueString = valueString.replaceAll("[^a-zA-Z0-9'\\s]", "");
        // Split the line into words
        String[] values = valueString.split(" ");
        // Emit a (word, 1) pair for each word
        for (String val : values) {
            context.write(new Text(val), one);
        }
    }
}
Finally, the Reducer class, which does the actual counting:
import java.io.IOException;
import java.util.Iterator;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

public class WordCountReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
        // Get the iterator over this key's values
        Iterator<IntWritable> iterator = values.iterator();
        // A counter for this word
        int count = 0;
        while (iterator.hasNext()) {
            count += iterator.next().get();
        }
        // Emit the (word, total count) pair
        context.write(key, new IntWritable(count));
    }
}
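One optional tweak worth noting (not part of the original demo, just an aside): since word counting is associative, the same Reducer class could also be registered as a combiner, so partial sums are computed on the map side before the shuffle. A hypothetical extra line in the driver:

// Reuse the reducer as a map-side combiner (hypothetical addition to WordCountJob)
job.setCombinerClass(WordCountReducer.class);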
Studying with questions in mind is more efficient, which is one reason previewing matters, so reading the demo first and then the source code should help (just my opinion).
(What follows is someone else's work, borrowed here for study purposes.)
Split (cutting the input file into splits; their size is flexible)
The source code analysis starts from the job submission:
job.waitForCompletion(true);
org.apache.hadoop.mapreduce.Job#waitForCompletion

// Submit the job to the cluster and wait for it to finish.
// Check the current state of the job
if (state == JobState.DEFINE) {
    // ------------------------------------------- key code
    submit();
}
// Monitor the running job
if (verbose) {
    // Monitor a job and print status in real-time as progress is made and tasks fail.
    monitorAndPrintJob();
}
// Return the job status
return isSuccessful();
org.apache.hadoop.mapreduce.Job#submit

// Confirm the current job state
ensureState(JobState.DEFINE);
// Choose between the MapReduce 1.x and 2.x APIs (2.x reworked many of the 1.x methods)
setUseNewAPI();
// Connect to the cluster the job will run on
connect();
// Provides a way to access information about the map/reduce cluster.
cluster = new Cluster(getConfiguration());
// Create the job submitter
final JobSubmitter submitter = getJobSubmitter(cluster.getFileSystem(), cluster.getClient());
// Submit the job to the system for execution
// ------------------------------------------- key code
// Internal method for submitting jobs to the system
status = submitter.submitJobInternal(Job.this, cluster);
// Mark the job state as RUNNING
state = JobState.RUNNING;
org.apache.hadoop.mapreduce.JobSubmitter#submitJobInternal

// Validate the job output specification
checkSpecs(job);
// Generate and set a new JobId
JobID jobId = submitClient.getNewJobID();
job.setJobID(jobId);
// Get the job submission directory
Path submitJobDir = new Path(jobStagingArea, jobId.toString());
// ------------------------------------------- key code
// Create the splits for the job (line 197)
int maps = writeSplits(job, submitJobDir);
// Set the number of map tasks: it equals the number of splits
conf.setInt(MRJobConfig.NUM_MAPS, maps);
org.apache.hadoop.mapreduce.JobSubmitter#writeSplits

// ------------------------------------------- key code
// Use the new API
maps = writeNewSplits(job, jobSubmitDir);
org.apache.hadoop.mapreduce.JobSubmitter#writeNewSplits

// Get the configuration
Configuration conf = job.getConfiguration();
// ------------------------------------------- key code: InputFormat
// Get the input reader: org.apache.hadoop.mapreduce.lib.input.TextInputFormat
InputFormat<?, ?> input = ReflectionUtils.newInstance(job.getInputFormatClass(), conf);
// ------------------------------------------- key code: getSplits
List<InputSplit> splits = input.getSplits(job);
// Convert the List into an array
T[] array = (T[]) splits.toArray(new InputSplit[splits.size()]);
// sort the splits into order based on size, so that the biggest go first
Arrays.sort(array, new SplitComparator());
// Write the split files for the job
JobSplitWriter.createSplitFiles(jobSubmitDir, conf, jobSubmitDir.getFileSystem(conf), array);
// Return the number of splits
return array.length;
org.apache.hadoop.mapreduce.task.JobContextImpl#getInputFormatClass

// Return the configured InputFormat class; TextInputFormat is the default
return (Class<? extends InputFormat<?,?>>) conf.getClass(INPUT_FORMAT_CLASS_ATTR, TextInputFormat.class);
// getClass returns the configured value if one is set, otherwise the default:
// getClass(String name, Class<?> defaultValue)
org.apache.hadoop.mapreduce.lib.input.FileInputFormat#getSplits

// TextInputFormat inherits this method from its parent class:
// public class TextInputFormat extends FileInputFormat<LongWritable, Text>

// Generate the list of files and make them into FileSplits.
// Math.max(1, 1)
// getFormatMinSplitSize(): a split must hold at least 1 byte
// getMinSplitSize(job): the minimum split size set by the programmer, default 1 if unset
long minSize = Math.max(getFormatMinSplitSize(), getMinSplitSize(job));
// The maximum split size set by the programmer, default Long.MAX_VALUE if unset
long maxSize = getMaxSplitSize(job);
// Create a List to hold the splits
List<InputSplit> splits = new ArrayList<InputSplit>();
// Get the list of files to analyse
List<FileStatus> files = listStatus(job);
// Iterate over the files
for (FileStatus file : files) {
    // Get the file path
    Path path = file.getPath();
    // Get the file length in bytes
    long length = file.getLen();
    // If the file is not empty
    if (length != 0) {
        // Get the block locations for the file
        BlockLocation[] blkLocations;
        if (file instanceof LocatedFileStatus) {
            blkLocations = ((LocatedFileStatus) file).getBlockLocations();
        } else {
            FileSystem fs = path.getFileSystem(job.getConfiguration());
            blkLocations = fs.getFileBlockLocations(file, 0, length);
        }
        // Check whether the file can be split
        if (isSplitable(job, path)) {
            // Get the block size
            long blockSize = file.getBlockSize();
            // The default split size is 128M
            // blockSize 128M, minSize 1 byte, maxSize Long.MAX_VALUE bytes
            // return Math.max(minSize, Math.min(maxSize, blockSize));
            // minSize 64M  ----> 128M
            // minSize 256M ----> 256M
            // maxSize 64M  ----> 64M
            // maxSize 256M ----> 128M
            long splitSize = computeSplitSize(blockSize, minSize, maxSize);
            // Variable holding the bytes still to be assigned (e.g. 256M)
            long bytesRemaining = length;
            // Check whether the remaining bytes exceed the SPLIT_SLOP threshold
            // private static final double SPLIT_SLOP = 1.1
            while (((double) bytesRemaining) / splitSize > SPLIT_SLOP) {
                int blkIndex = getBlockIndex(blkLocations, length - bytesRemaining);
                // This factory method creates a split and the split is added to the List
                // org.apache.hadoop.mapreduce.lib.input.FileInputFormat#makeSplit
                splits.add(makeSplit(path, length - bytesRemaining, splitSize, blkLocations[blkIndex].getHosts(), blkLocations[blkIndex].getCachedHosts()));
                // After each split, subtract the bytes that were used
                bytesRemaining -= splitSize;
            }
            // Check whether any bytes are left
            // The last split covers between (0, 1.1] of a split size
            if (bytesRemaining != 0) {
                int blkIndex = getBlockIndex(blkLocations, length - bytesRemaining);
                splits.add(makeSplit(path, length - bytesRemaining, bytesRemaining, blkLocations[blkIndex].getHosts(), blkLocations[blkIndex].getCachedHosts()));
            }
        } else { // not splitable
            // If the file cannot be split, the whole file becomes a single split
            splits.add(makeSplit(path, 0, length, blkLocations[0].getHosts(), blkLocations[0].getCachedHosts()));
        }
    } else {
        // Create empty hosts array for zero length files
        splits.add(makeSplit(path, 0, length, new String[0]));
    }
}
// Save the number of input files for metrics/loadgen
job.getConfiguration().setLong(NUM_INPUT_FILES, files.size());
// Return the list of splits
return splits;
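To make the sizing logic concrete, here is a small stand-alone sketch (plain Java, assuming a 300 MB input file, the default 128 MB block size and no min/max overrides) that applies the same computeSplitSize formula and SPLIT_SLOP rule shown above:

public class SplitSizeSketch {
    public static void main(String[] args) {
        long minSize = 1L;                       // getFormatMinSplitSize(), nothing configured
        long maxSize = Long.MAX_VALUE;           // no maximum configured
        long blockSize = 128L << 20;             // 128 MB HDFS block
        long splitSize = Math.max(minSize, Math.min(maxSize, blockSize)); // 128 MB
        double SPLIT_SLOP = 1.1;                 // same constant as FileInputFormat
        long length = 300L << 20;                // assumed 300 MB input file
        long bytesRemaining = length;
        int splits = 0;
        while (((double) bytesRemaining) / splitSize > SPLIT_SLOP) {
            bytesRemaining -= splitSize;
            splits++;
        }
        if (bytesRemaining != 0) splits++;       // the tail split may be up to 1.1 x splitSize
        System.out.println(splits);              // 3 splits: 128 MB + 128 MB + 44 MB
    }
}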
MapTask
org.apache.hadoop.mapred.MapTask#run

// Use the new API
boolean useNewApi = job.getUseNewMapper();
// ------------------------------------------- key code: initialize
// Initialize the MapTask
initialize(job, getJobID(), reporter, useNewApi);
// ------------------------------------------- key code: runNewMapper
// Start running the task
runNewMapper(job, splitMetaInfo, umbilical, reporter);
org.apache.hadoop.mapred.Task#initialize

// Job context
jobContext = new JobContextImpl(job, id, reporter);
// Task (map) context
taskContext = new TaskAttemptContextImpl(job, taskId, reporter);
// Create the output format used to write the map's data
outputFormat = ReflectionUtils.newInstance(taskContext.getOutputFormatClass(), job);
// The actual output format comes from org.apache.hadoop.mapreduce.task.JobContextImpl#getOutputFormatClass:
//   return (Class<? extends OutputFormat<?,?>>) conf.getClass(OUTPUT_FORMAT_CLASS_ATTR, TextOutputFormat.class);
// Create the output committer for the map task
committer = outputFormat.getOutputCommitter(taskContext);
// The actual committer comes from org.apache.hadoop.mapreduce.lib.output.FileOutputFormat#getOutputCommitter:
//   committer = new FileOutputCommitter(output, context);
// Get the output path
Path outputPath = FileOutputFormat.getOutputPath(conf);
org.apache.hadoop.mapred.MapTask#runNewMapper

// make a task context so we can get the classes
org.apache.hadoop.mapreduce.TaskAttemptContext taskContext = new org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl(job, getTaskID(), reporter);
// make a mapper -- com.lzj.wordcount.WordCountMapper
org.apache.hadoop.mapreduce.Mapper<INKEY,INVALUE,OUTKEY,OUTVALUE> mapper = (org.apache.hadoop.mapreduce.Mapper<INKEY,INVALUE,OUTKEY,OUTVALUE>) ReflectionUtils.newInstance(taskContext.getMapperClass(), job);
// make the input format -- org.apache.hadoop.mapreduce.lib.input.TextInputFormat
org.apache.hadoop.mapreduce.InputFormat<INKEY,INVALUE> inputFormat = (org.apache.hadoop.mapreduce.InputFormat<INKEY,INVALUE>) ReflectionUtils.newInstance(taskContext.getInputFormatClass(), job);
// rebuild the input split
org.apache.hadoop.mapreduce.InputSplit split = null;
split = getSplitDetails(new Path(splitIndex.getSplitLocation()), splitIndex.getStartOffset());
// Create the record reader
org.apache.hadoop.mapreduce.RecordReader<INKEY,INVALUE> input = new NewTrackingRecordReader<INKEY,INVALUE>(split, inputFormat, reporter, taskContext);
// The real reader is created inside the tracking reader
// org.apache.hadoop.mapred.MapTask.NewTrackingRecordReader#NewTrackingRecordReader
this.real = inputFormat.createRecordReader(split, taskContext);
// The inputFormat builds the reader
// org.apache.hadoop.mapreduce.lib.input.TextInputFormat#createRecordReader
return new LineRecordReader(recordDelimiterBytes);
// Create the record writer
org.apache.hadoop.mapreduce.RecordWriter output = null;
output = new NewOutputCollector(taskContext, job, umbilical, reporter);
// Create the map context
org.apache.hadoop.mapreduce.MapContext<INKEY, INVALUE, OUTKEY, OUTVALUE> mapContext = new MapContextImpl<INKEY, INVALUE, OUTKEY, OUTVALUE>(job, getTaskID(), input, output, committer, reporter, split);
// Create the wrapper around mapContext
org.apache.hadoop.mapreduce.Mapper<INKEY,INVALUE,OUTKEY,OUTVALUE>.Context mapperContext = new WrappedMapper<INKEY, INVALUE, OUTKEY, OUTVALUE>().getMapContext(mapContext);
// Initialize with the split information
input.initialize(split, mapperContext);
// Run the Mapper, i.e. your own Mapper implementation
mapper.run(mapperContext);
mapPhase.complete();
setPhase(TaskStatus.Phase.SORT);
statusUpdate(umbilical);
// Close the input
input.close();
input = null;
// Close the output (spill the last data in the buffer and merge the spill files)
output.close(mapperContext);
output = null;
org.apache.hadoop.mapred.MapTask.NewTrackingRecordReader#initialize

// The LineRecordReader performs the actual initialization
real.initialize(split, context);
org.apache.hadoop.mapreduce.lib.input.LineRecordReader#initialize

// Get the split
FileSplit split = (FileSplit) genericSplit;
// Configuration
Configuration job = context.getConfiguration();
// Maximum number of bytes to read for a single line
this.maxLineLength = job.getInt(MAX_LINE_LENGTH, Integer.MAX_VALUE);
// Get the start and end offsets of the split
start = split.getStart();
end = start + split.getLength();
// Get the file path
final Path file = split.getPath();
// open the file and seek to the start of the split
final FileSystem fs = file.getFileSystem(job);
fileIn = fs.open(file);
// Position the reader at the start of the split
fileIn.seek(start);
// Create the input stream
in = new UncompressedSplitLineReader(fileIn, job, this.recordDelimiterBytes, split.getLength());
filePosition = fileIn;
// If this is not the first split, we always throw away first record
// because we always (except the last split) read one extra line in
// next() method.
if (start != 0) {
    start += in.readLine(new Text(), 0, maxBytesToConsume(start));
}
this.pos = start;
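The `if (start != 0)` branch is the subtle part: a split boundary can fall in the middle of a line, so every split except the first throws away the bytes up to its first newline, while the previous split's reader compensates by reading one extra line past its own end. A tiny plain-Java sketch of the idea (the file content and split offset are made up):

public class SkipFirstLineSketch {
    public static void main(String[] args) {
        String file = "alpha beta\ngamma delta\nepsilon\n"; // assumed file content
        int splitStart = 5;                                  // assumed split boundary, mid-line
        // A reader for the second split skips up to and including the next '\n';
        // the first split's reader already consumed that whole line.
        int firstFullLine = file.indexOf('\n', splitStart) + 1;
        System.out.println(file.substring(firstFullLine));   // "gamma delta\nepsilon\n"
    }
}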
org.apache.hadoop.mapreduce.Mapper#run

// Initialization
setup(context);
try {
    // nextKeyValue(): 1) checks whether there is another record, 2) sets the key, 3) sets the value
    while (context.nextKeyValue()) {
        // The three arguments are the key, the value and the context
        map(context.getCurrentKey(), context.getCurrentValue(), context);
    }
} finally {
    // Cleanup
    cleanup(context);
}
org.apache.hadoop.mapreduce.lib.input.LineRecordReader#nextKeyValue

// The key is the offset
key = new LongWritable();
// Set it to the position where this read starts
key.set(pos);
// The value is one line of data
value = new Text();
// We always read one extra line -- read one line of data
newSize = in.readLine(value, maxLineLength, maxBytesToConsume(pos));
// The position for the next read
pos += newSize;
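So with TextInputFormat the map input key is simply the byte offset at which each line starts. A quick sketch (with an assumed two-line file) of the (key, value) pairs the reader hands to map():

public class OffsetKeySketch {
    public static void main(String[] args) {
        String data = "hello world\nhello hadoop\n";  // assumed file content
        long pos = 0;
        for (String line : data.split("\n")) {
            System.out.println("(" + pos + ", \"" + line + "\")"); // (0, "hello world"), (12, "hello hadoop")
            pos += line.length() + 1;                  // +1 for the '\n' consumed by readLine
        }
    }
}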
KvBuffer
org.apache.hadoop.mapred.MapTask.NewOutputCollector#NewOutputCollector

// Create the collector
collector = createSortingCollector(job, reporter);
// Get the number of reduce tasks
partitions = jobContext.getNumReduceTasks();
if (partitions > 1) {
    partitioner = (org.apache.hadoop.mapreduce.Partitioner<K, V>) ReflectionUtils.newInstance(jobContext.getPartitionerClass(), job);
} else {
    partitioner = new org.apache.hadoop.mapreduce.Partitioner<K, V>() {
        @Override
        public int getPartition(K key, V value, int numPartitions) {
            return partitions - 1;
        }
    };
}
org.apache.hadoop.mapred.MapTask#createSortingCollector

// Get the context object
MapOutputCollector.Context context = new MapOutputCollector.Context(this, job, reporter);
// Get the collector classes
Class<?>[] collectorClasses = job.getClasses(JobContext.MAP_OUTPUT_COLLECTOR_CLASS_ATTR, MapOutputBuffer.class);
// Get the MapOutputCollector subclass
Class<? extends MapOutputCollector> subclazz = clazz.asSubclass(MapOutputCollector.class);
// Create the collector via reflection -- org.apache.hadoop.mapred.MapTask.MapOutputBuffer
MapOutputCollector<KEY, VALUE> collector = ReflectionUtils.newInstance(subclazz, job);
// Initialize it
collector.init(context);
// Return the collector
return collector;
org.apache.hadoop.mapred.MapTask.MapOutputBuffer#init

// Spill threshold (80 %)
final float spillper = job.getFloat(JobContext.MAP_SORT_SPILL_PERCENT, (float)0.8);
// Buffer size, 100M by default
final int sortmb = job.getInt(JobContext.IO_SORT_MB, 100);
// Index cache memory limit, default 1024*1024 bytes
indexCacheMemoryLimit = job.getInt(JobContext.INDEX_CACHE_MEMORY_LIMIT, INDEX_CACHE_MEMORY_LIMIT_DEFAULT);
// Get the sorter: quick sort by default
sorter = ReflectionUtils.newInstance(job.getClass("map.sort.class", QuickSort.class, IndexedSorter.class), job);
// Compute the capacity
int maxMemUsage = sortmb << 20;
// Make it a multiple of METASIZE (16 bytes)
maxMemUsage -= maxMemUsage % METASIZE;
// The buffer itself
kvbuffer = new byte[maxMemUsage];
// Initialize the kvbuffer bookkeeping
bufvoid = kvbuffer.length;
kvmeta = ByteBuffer.wrap(kvbuffer).order(ByteOrder.nativeOrder()).asIntBuffer();
setEquator(0);
bufstart = bufend = bufindex = equator;
kvstart = kvend = kvindex;
maxRec = kvmeta.capacity() / NMETA;
softLimit = (int)(kvbuffer.length * spillper);
bufferRemaining = softLimit;
// Get the key comparator
comparator = job.getOutputKeyComparator();
// Get the map output key and value types
keyClass = (Class<K>)job.getMapOutputKeyClass();
valClass = (Class<V>)job.getMapOutputValueClass();
// Serializers for the key and value
keySerializer = serializationFactory.getSerializer(keyClass);
keySerializer.open(bb);
valSerializer = serializationFactory.getSerializer(valClass);
valSerializer.open(bb);
// Create the spill thread and leave it waiting; spilling starts once the threshold is reached
spillInProgress = false;
minSpillsForCombine = job.getInt(JobContext.MAP_COMBINE_MIN_SPILLS, 3);
spillThread.setDaemon(true);
spillThread.setName("SpillThread");
spillLock.lock();
try {
    spillThread.start();
    while (!spillThreadRunning) {
        spillDone.await();
    }
}
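Plugging in the defaults makes these numbers concrete (a worked sketch, assuming nothing is overridden): 100 MB of buffer with an 80 % soft limit means spilling starts once roughly 83.8 MB are occupied.

public class BufferSizeSketch {
    public static void main(String[] args) {
        int sortmb = 100;                           // mapreduce.task.io.sort.mb default
        int METASIZE = 16;                          // 4 ints of metadata per record
        int maxMemUsage = sortmb << 20;             // 104,857,600 bytes (100 MB)
        maxMemUsage -= maxMemUsage % METASIZE;      // already a multiple of 16 here
        int softLimit = (int) (maxMemUsage * 0.8f); // 83,886,080 bytes: the spill threshold
        System.out.println(maxMemUsage + " / " + softLimit);
    }
}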
org.apache.hadoop.mapred.JobConf#getOutputKeyComparator

// Get the configured comparator class
Class<? extends RawComparator> ts = getClass(JobContext.KEY_COMPARATOR, null, RawComparator.class);
// If a custom comparator was configured, instantiate it
if (ts != null)
    return ReflectionUtils.newInstance(ts, this);
// Otherwise fall back to the comparator of the map output key type
return WritableComparator.get(getMapOutputKeyClass().asSubclass(WritableComparable.class), this);
// The default comparator object -- org.apache.hadoop.io.WritableComparator
// comparator = new WritableComparator(c, conf, true);
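For context, the "custom comparator" branch above is taken when the driver registers one, e.g. via job.setSortComparatorClass(...). A minimal, hypothetical example of such a comparator (reversing the natural Text order) might look like this:

// Hypothetical sort comparator; registered from the driver with
// job.setSortComparatorClass(ReverseTextComparator.class), which is what the
// JobContext.KEY_COMPARATOR lookup above would then pick up.
public class ReverseTextComparator extends org.apache.hadoop.io.WritableComparator {
    public ReverseTextComparator() {
        super(org.apache.hadoop.io.Text.class, true); // create instances so object compare() works
    }
    @Override
    public int compare(org.apache.hadoop.io.WritableComparable a, org.apache.hadoop.io.WritableComparable b) {
        return -super.compare(a, b); // invert the natural Text ordering
    }
}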
org.apache.hadoop.mapreduce.task.JobContextImpl#getPartitionerClass

// Create the partitioner (HashPartitioner by default)
return (Class<? extends Partitioner<?,?>>) conf.getClass(PARTITIONER_CLASS_ATTR, HashPartitioner.class);
// The code the partitioner actually runs:
// org.apache.hadoop.mapreduce.lib.partition.HashPartitioner#getPartition
return (key.hashCode() & Integer.MAX_VALUE) % numReduceTasks;
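The & Integer.MAX_VALUE mask is there because hashCode() can be negative, and in Java a negative value modulo numReduceTasks stays negative, which would be an invalid partition index. A tiny sketch with an assumed negative hash:

public class MaskSketch {
    public static void main(String[] args) {
        int numReduceTasks = 2;
        int hash = -3;                                                     // stand-in for a negative hashCode
        System.out.println(hash % numReduceTasks);                        // -1: not a valid partition index
        System.out.println((hash & Integer.MAX_VALUE) % numReduceTasks);  // 1: the mask clears the sign bit first
    }
}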
Spill
org.apache.hadoop.mapred.MapTask.NewOutputCollector#write

// Collect the record together with its partition number
collector.collect(key, value, partitioner.getPartition(key, value, partitions));
org.apache.hadoop.mapred.MapTask.MapOutputBuffer#collect

// Reserve room for this record's metadata
bufferRemaining -= METASIZE;
// Check whether a spill is needed and, if so, do the preparation work.
// If a spill is needed, wake up the SpillThread, whose run method calls sortAndSpill:
// org.apache.hadoop.mapred.MapTask.MapOutputBuffer#sortAndSpill
// Otherwise, store the record into the KvBuffer.
Merge
org.apache.hadoop.mapred.MapTask.MapOutputBuffer#flush

// Spill the remaining data in the buffer (even if it is below the 80 % threshold) to disk
sortAndSpill();
// Merge the spill files written so far
mergeParts();
// This map task is ready to move to the next phase
sortPhase.startNextPhase();
ReduceTask
org.apache.hadoop.mapred.ReduceTask#run

// Initialization
initialize(job, getJobID(), reporter, useNewApi);
// The key/value iterator
RawKeyValueIterator rIter = null;
// The combiner, if one was configured
Class combinerClass = conf.getCombinerClass();
CombineOutputCollector combineCollector = (null != combinerClass) ? new CombineOutputCollector(reduceCombineOutputCounter, reporter, conf) : null;
// Create the shuffle plugin
Class<? extends ShuffleConsumerPlugin> clazz = job.getClass(MRConfig.SHUFFLE_CONSUMER_PLUGIN, Shuffle.class, ShuffleConsumerPlugin.class);
shuffleConsumerPlugin = ReflectionUtils.newInstance(clazz, job);
// Create a context object and initialize the shuffle
ShuffleConsumerPlugin.Context shuffleContext = new ShuffleConsumerPlugin.Context(getTaskID(), job, FileSystem.getLocal(job), umbilical, super.lDirAlloc, reporter, codec, combinerClass, combineCollector, spilledRecordsCounter, reduceCombineInputCounter, shuffledMapsCounter, reduceShuffleBytes, failedShuffleCounter, mergedMapOutputsCounter, taskStatus, copyPhase, sortPhase, this, mapOutputFile, localMapFiles);
// The merger has been initialized at this point as well
shuffleConsumerPlugin.init(shuffleContext);
// Run the shuffle and get back the key/value iterator -- MergeQueue
rIter = shuffleConsumerPlugin.run();
// Get the map output key and value types
Class keyClass = job.getMapOutputKeyClass();
Class valueClass = job.getMapOutputValueClass();
// Get the grouping comparator (the reduce phase prefers the grouping comparator; if none is set, the ordinary comparator is used)
RawComparator comparator = job.getOutputValueGroupingComparator();
// Start the reduce task
runNewReducer(job, umbilical, reporter, rIter, comparator, keyClass, valueClass);
org.apache.hadoop.mapred.Task#initialize

// Get the Job and Reduce contexts
jobContext = new JobContextImpl(job, id, reporter);
taskContext = new TaskAttemptContextImpl(job, taskId, reporter);
// The output format for writing the results -- org.apache.hadoop.mapreduce.lib.output.TextOutputFormat
outputFormat = ReflectionUtils.newInstance(taskContext.getOutputFormatClass(), job);
// Create the committer -- org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter
committer = outputFormat.getOutputCommitter(taskContext);
// Get the output path
Path outputPath = FileOutputFormat.getOutputPath(conf);
org.apache.hadoop.mapreduce.task.reduce.Shuffle#run

// Start the map-completion events fetcher thread
final EventFetcher<K, V> eventFetcher = new EventFetcher<K, V>(reduceId, umbilical, scheduler, this, maxEventsToFetch);
eventFetcher.start();
// Check whether the map output and the reduce run on the same node
boolean isLocal = localMapFiles != null;
// Number of fetcher threads: 1 if the data is local, otherwise 5
final int numFetchers = isLocal ? 1 : jobConf.getInt(MRJobConfig.SHUFFLE_PARALLEL_COPIES, 5);
Fetcher<K, V>[] fetchers = new Fetcher[numFetchers];
// Start fetching data
fetchers[0].start();
// Shut down the event fetcher
eventFetcher.shutDown();
// Stop the map-output fetcher threads
for (Fetcher<K, V> fetcher : fetchers) {
    fetcher.shutDown();
}
// Get the key/value iterator
RawKeyValueIterator kvIter = merger.close();
return kvIter;
org.apache.hadoop.mapreduce.task.reduce.MergeManagerImpl#close

// Return an iterator over the data produced by the final merge
return finalMerge(jobConf, rfs, memory, disk);
org.apache.hadoop.mapreduce.task.reduce.MergeManagerImpl#finalMerge

// Get the map output key/value types
Class<K> keyClass = (Class<K>)job.getMapOutputKeyClass();
Class<V> valueClass = (Class<V>)job.getMapOutputValueClass();
// Get a comparator -- org.apache.hadoop.io.WritableComparator
final RawComparator<K> comparator = (RawComparator<K>)job.getOutputKeyComparator();
// Return the key/value iterator -- org.apache.hadoop.mapred.Merger.MergeQueue
final RawKeyValueIterator rIter = Merger.merge(job, fs, keyClass, valueClass, memDiskSegments, numMemDiskSegments, tmpDir, comparator, reporter, spilledRecordsCounter, null, mergePhase);
org.apache.hadoop.mapred.ReduceTask#runNewReducer

// Keep a reference to the raw iterator
final RawKeyValueIterator rawIter = rIter;
// Wrap it in a new object created with an anonymous inner class (it also reports progress)
rIter = new RawKeyValueIterator() {
    public void close() throws IOException { rawIter.close(); }
    public DataInputBuffer getKey() throws IOException { return rawIter.getKey(); }
    public Progress getProgress() { return rawIter.getProgress(); }
    public DataInputBuffer getValue() throws IOException { return rawIter.getValue(); }
    public boolean next() throws IOException {
        boolean ret = rawIter.next();
        reporter.setProgress(rawIter.getProgress().getProgress());
        return ret;
    }
};
// The task attempt context
org.apache.hadoop.mapreduce.TaskAttemptContext taskContext = new org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl(job, getTaskID(), reporter);
// The Reducer to run -- com.lzj.WordCountReducer
org.apache.hadoop.mapreduce.Reducer<INKEY, INVALUE, OUTKEY, OUTVALUE> reducer = (org.apache.hadoop.mapreduce.Reducer<INKEY, INVALUE, OUTKEY, OUTVALUE>) ReflectionUtils.newInstance(taskContext.getReducerClass(), job);
// The record writer -- org.apache.hadoop.mapreduce.lib.output.TextOutputFormat.LineRecordWriter
org.apache.hadoop.mapreduce.RecordWriter<OUTKEY, OUTVALUE> trackedRW = new NewTrackingRecordWriter<OUTKEY, OUTVALUE>(this, taskContext);
// Create the reduce context
org.apache.hadoop.mapreduce.Reducer.Context reducerContext = createReduceContext(reducer, job, getTaskID(), rIter, reduceInputKeyCounter, reduceInputValueCounter, trackedRW, committer, reporter, comparator, keyClass, valueClass);
// Run the reduce task
reducer.run(reducerContext);
org.apache.hadoop.mapreduce.Reducer#run

// Loop while there is another key to read; each distinct key is handed to reduce() exactly once
// (for "hello hello hello hi hi hi" reduce runs 2 times)
while (context.nextKey()) {
    // context.getValues()  --> private ValueIterable iterable = new ValueIterable();
    // values.iterator      --> private ValueIterator iterator = new ValueIterator();
    // iterator.hasNext     --> return firstValue || nextKeyIsSame;
    // iterator.next        --> (firstValue ? value : nextKeyValue())
    reduce(context.getCurrentKey(), context.getValues(), context);
}
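A plain-Java sketch of the grouping behaviour described in the comment above (the sorted input is made up; this is not Hadoop code): six records with two distinct keys result in exactly two reduce calls.

import java.util.Arrays;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;

public class GroupingSketch {
    public static void main(String[] args) {
        // Sorted keys of (key, 1) pairs as they would arrive at the reducer -- assumed sample data
        List<String> sortedKeys = Arrays.asList("hello", "hello", "hello", "hi", "hi", "hi");
        Map<String, Integer> counts = new LinkedHashMap<>();
        int reduceCalls = 0;
        int i = 0;
        while (i < sortedKeys.size()) {               // plays the role of context.nextKey()
            String key = sortedKeys.get(i);
            int sum = 0;
            while (i < sortedKeys.size() && sortedKeys.get(i).equals(key)) { // nextKeyIsSame
                sum += 1;
                i++;
            }
            counts.put(key, sum);
            reduceCalls++;                            // one reduce() call per distinct key
        }
        System.out.println(counts + ", reduce called " + reduceCalls + " times"); // {hello=3, hi=3}, 2 times
    }
}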
org.apache.hadoop.mapreduce.task.ReduceContextImpl#nextKey

// hasMore: whether there is still data to read
if (hasMore) {
    // Read the next record
    return nextKeyValue();
} else {
    // All data has been processed; the reduce is finished
    return false;
}
org.apache.hadoop.mapreduce.task.ReduceContextImpl#nextKeyValue

// Whether this is the first value of the current key
firstValue = !nextKeyIsSame;
// Deserialize the key and the value
key = keyDeserializer.deserialize(key);
value = valueDeserializer.deserialize(value);
// Lengths of the serialized key and value
currentKeyLength = nextKey.getLength() - nextKey.getPosition();
currentValueLength = nextVal.getLength() - nextVal.getPosition();
// Write the record to the backup store
if (isMarked) {
    backupStore.write(nextKey, nextVal);
}
// Check whether another record can be read next time
hasMore = input.next();
// If there is more data, decide whether the next key is the same as the current one
if (hasMore) {
    // Get the next key
    nextKey = input.getKey();
    // The grouping comparator is used first; otherwise the default comparator
    nextKeyIsSame = comparator.compare(currentRawKey.getBytes(), 0, currentRawKey.getLength(), nextKey.getData(), nextKey.getPosition(), nextKey.getLength() - nextKey.getPosition()) == 0;
} else {
    // No more data, so there is no next key
    nextKeyIsSame = false;
}
org.apache.hadoop.mapreduce.lib.output.TextOutputFormat.LineRecordWriter#write

// Write each record out as one line of text
out.write(newline);
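For the word-count demo this means each reduce output record becomes one line of key, separator (a tab by default; it is configurable) and value in the resulting part file. A small sketch of the line format with assumed values:

public class OutputLineSketch {
    public static void main(String[] args) {
        String key = "hello";                // assumed word
        int count = 3;                       // assumed count
        // LineRecordWriter writes key, the separator (tab by default), value, newline
        System.out.print(key + "\t" + count + "\n");   // -> "hello\t3" as one line in the part file
    }
}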
The end.