2.4 MapReduce Source Code Analysis 02

Source Code Analysis of the Map Phase


//make a task context so we can get the classes
org.apache.hadoop.mapreduce.TaskAttemptContext taskContext =
             new org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl(job,getTaskID(),reporter);
// make a mapper  -------> com.xxxxx.gy.WordCountMapper
org.apache.hadoop.mapreduce.Mapper<INKEY,INVALUE,OUTKEY,OUTVALUE> mapper =
        (org.apache.hadoop.mapreduce.Mapper<INKEY,INVALUE,OUTKEY,OUTVALUE>)
        			ReflectionUtils.newInstance(taskContext.getMapperClass(), job);
// make the input format 
//-------> org.apache.hadoop.mapreduce.lib.input.TextInputFormat
//-------> org.apache.hadoop.mapreduce.lib.input.FileInputFormat
org.apache.hadoop.mapreduce.InputFormat<INKEY,INVALUE> inputFormat =
    	(org.apache.hadoop.mapreduce.InputFormat<INKEY,INVALUE>)
    				ReflectionUtils.newInstance(taskContext.getInputFormatClass(), job);
// rebuild the input split
// Get the current split object: the split's location and its start offset
org.apache.hadoop.mapreduce.InputSplit split = null;
split = getSplitDetails(new Path(splitIndex.getSplitLocation()),splitIndex.getStartOffset());

//Create a record reader
org.apache.hadoop.mapreduce.RecordReader<INKEY,INVALUE> input =
             new NewTrackingRecordReader<INKEY,INVALUE>(split, inputFormat, reporter, taskContext);
	//org.apache.hadoop.mapreduce.RecordReader
	//org.apache.hadoop.mapreduce.lib.input.LineRecordReader
	this.real = inputFormat.createRecordReader(split, taskContext);
		return new LineRecordReader(recordDelimiterBytes);
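// The classes resolved above via taskContext.getMapperClass() and
// taskContext.getInputFormatClass() are simply the ones registered in the job
// driver. A minimal, hedged sketch of such a driver's main() body
// (WordCountDriver, WordCountMapper and WordCountReducer are illustrative
// user classes, not part of the Hadoop source):
Job job = Job.getInstance(new Configuration(), "word count");
job.setJarByClass(WordCountDriver.class);
job.setMapperClass(WordCountMapper.class);        // returned by getMapperClass()
job.setReducerClass(WordCountReducer.class);
job.setInputFormatClass(TextInputFormat.class);   // returned by getInputFormatClass(); also the default
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(IntWritable.class);
FileInputFormat.addInputPath(job, new Path(args[0]));
FileOutputFormat.setOutputPath(job, new Path(args[1]));
System.exit(job.waitForCompletion(true) ? 0 : 1);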
//Create a record writer -- this builds a new sorting output collector (NewOutputCollector)
org.apache.hadoop.mapreduce.RecordWriter output = null;
	output = new NewOutputCollector(taskContext, job, umbilical, reporter);
	{
//Create a collector that sorts its output
        collector = createSortingCollector(job, reporter);
        {
            //Get the collector class: org.apache.hadoop.mapred.MapTask$MapOutputBuffer.class
            Class<?>[] collectorClasses = 
                job.getClasses(JobContext.MAP_OUTPUT_COLLECTOR_CLASS_ATTR, MapOutputBuffer.class);
            //Create the MapOutputBuffer instance
            MapOutputCollector<KEY, VALUE> collector =
                ReflectionUtils.newInstance(subclazz, job);
            //Initialize the collector
            collector.init(context);
            {
                //Spill threshold (0.8 of the buffer by default)
                final float spillper = job.getFloat(JobContext.MAP_SORT_SPILL_PERCENT, (float)0.8);
                //Size of the sort buffer in MB (100 by default)
                final int sortmb = job.getInt(JobContext.IO_SORT_MB, 100);
                //Create the sorter (QuickSort by default)
                sorter = ReflectionUtils.newInstance(job.getClass("map.sort.class",
            			QuickSort.class, IndexedSorter.class), job);
                //Maximum memory usage: sortmb << 20, i.e. 100 MB
                int maxMemUsage = sortmb << 20;
                //Round the buffer size down to a multiple of METASIZE (16 bytes)
                maxMemUsage -= maxMemUsage % METASIZE;
                //Allocate the byte array that backs the circular (ring) buffer
                kvbuffer = new byte[maxMemUsage];
                //Get the key comparator
                comparator = job.getOutputKeyComparator();
                //Mark the spill thread as a daemon thread
                spillThread.setDaemon(true);
                spillThread.setName("SpillThread");
                //Start the spill thread
                spillThread.start();
                //Wait until the spill thread signals that it is running
                spillDone.await();
            }
        }
        //The number of partitions equals the number of reduce tasks
        partitions = jobContext.getNumReduceTasks();
        //Get the partitioner
        if (partitions > 1) {
            //Default partitioner ---> org.apache.hadoop.mapreduce.lib.partition.HashPartitioner
            partitioner = (org.apache.hadoop.mapreduce.Partitioner<K,V>)
                ReflectionUtils.newInstance(jobContext.getPartitionerClass(), job);
            		//The partitioning logic (from HashPartitioner)
                    public int getPartition(K key, V value,int numReduceTasks) {
                        //Hash the key and take the remainder modulo the number of partitions
                        return (key.hashCode() & Integer.MAX_VALUE) % numReduceTasks;
                    }
        } else {
            partitioner = new org.apache.hadoop.mapreduce.Partitioner<K,V>() {
                @Override
                public int getPartition(K key, V value, int numPartitions) {
                    return partitions - 1;
                }
            };
        }
    }
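// The HashPartitioner logic above can be swapped out for a user-defined partitioner.
// A hedged sketch (CustomPartitioner is an illustrative name): send keys that start
// with an uppercase letter to partition 0 and everything else by hash.
public static class CustomPartitioner extends Partitioner<Text, IntWritable> {
    @Override
    public int getPartition(Text key, IntWritable value, int numReduceTasks) {
        String k = key.toString();
        if (k.isEmpty() || Character.isUpperCase(k.charAt(0))) {
            return 0;
        }
        return (key.hashCode() & Integer.MAX_VALUE) % numReduceTasks;
    }
}
// Registered in the driver with: job.setPartitionerClass(CustomPartitioner.class);
// It only takes effect when there is more than one reduce task, because of the
// partitions > 1 branch shown above.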
	
//MapContext: the context object of the current map task; it ties together all the objects created above
org.apache.hadoop.mapreduce.MapContext<INKEY, INVALUE, OUTKEY, OUTVALUE> 
    mapContext = new MapContextImpl<INKEY, INVALUE, OUTKEY, OUTVALUE>(job, getTaskID(), 
          input, output, 
          committer, 
          reporter, split);
//Wrapper class around MapContext
org.apache.hadoop.mapreduce.Mapper<INKEY,INVALUE,OUTKEY,OUTVALUE>.Context 
        mapperContext = new WrappedMapper<INKEY, INVALUE, OUTKEY, OUTVALUE>().getMapContext(
              mapContext);

//A single line written in WordCountMapper
context.write(new Text(val), one);
//ultimately calls into the collector:
collector.collect(key, value, partitioner.getPartition(key, value, partitions));
{
    //If there is not enough space left in the buffer, trigger a spill
    if (bufferRemaining <= 0) {
        startSpill();
    }
}
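// For reference, the context.write() call above originates from the user's map()
// method. A hedged sketch of such a mapper (WordCountMapper is an illustrative name
// matching the class referenced at the top of this walkthrough):
public static class WordCountMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
    private static final IntWritable one = new IntWritable(1);

    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        // key = byte offset of the line (from LineRecordReader), value = the line itself
        for (String val : value.toString().split("\\s+")) {
            context.write(new Text(val), one);   // ends up in collector.collect(...)
        }
    }
}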

//Close the output
output.close(mapperContext);
{
    //Flush the remaining data
    collector.flush();
        //Spill whatever is left in the circular buffer to disk
        sortAndSpill();
        //Merge all spill files into one output file
        mergeParts();
    	{
            //Create an array to hold the paths of the spill files
            final Path[] filename = new Path[numSpills];
            //Iterate over the spill files and record their paths and total size
            for(int i = 0; i < numSpills; i++) {
                filename[i] = mapOutputFile.getSpillFile(i);
                finalOutFileSize += rfs.getFileStatus(filename[i]).getLen();
            }
        }
    //Close the collector
    collector.close();
}
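The spill and merge behaviour traced above is driven by a handful of job settings. A minimal, hedged sketch of tuning them in a driver, assuming the standard property names behind JobContext.IO_SORT_MB and JobContext.MAP_SORT_SPILL_PERCENT (mapreduce.task.io.sort.mb and mapreduce.map.sort.spill.percent):

Configuration conf = new Configuration();
// Size of the circular buffer (kvbuffer) in MB; 100 by default
conf.setInt("mapreduce.task.io.sort.mb", 200);
// Spill threshold as a fraction of the buffer; 0.8 by default
conf.setFloat("mapreduce.map.sort.spill.percent", 0.9f);
// How many spill segments are merged at once during mergeParts(); 10 by default
conf.setInt("mapreduce.task.io.sort.factor", 20);
Job job = Job.getInstance(conf, "word count with tuned spill settings");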



Source Code Analysis of the Reduce Phase


//------------------------ReduceTask.run()
//Three phases: copy, sort, reduce
if (isMapOrReduce()) {
    copyPhase = getProgress().addPhase("copy");
    sortPhase  = getProgress().addPhase("sort");
    reducePhase = getProgress().addPhase("reduce");
}

//Get the shuffle plugin class: org.apache.hadoop.mapreduce.task.reduce.Shuffle.class
Class<? extends ShuffleConsumerPlugin> clazz =
          job.getClass(MRConfig.SHUFFLE_CONSUMER_PLUGIN, Shuffle.class, ShuffleConsumerPlugin.class);
//Instantiate it via reflection
ShuffleConsumerPlugin shuffleConsumerPlugin = null;
shuffleConsumerPlugin = ReflectionUtils.newInstance(clazz, job);
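// The shuffle implementation is pluggable: the class is looked up from the job
// configuration under MRConfig.SHUFFLE_CONSUMER_PLUGIN and falls back to
// Shuffle.class. A hedged sketch of swapping it out (CustomShuffle is a
// hypothetical class implementing ShuffleConsumerPlugin, not part of Hadoop):
Configuration conf = new Configuration();
conf.setClass(MRConfig.SHUFFLE_CONSUMER_PLUGIN, CustomShuffle.class, ShuffleConsumerPlugin.class);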

//Create the shuffle context
ShuffleConsumerPlugin.Context shuffleContext = 
      new ShuffleConsumerPlugin.Context(getTaskID(), job, FileSystem.getLocal(job), umbilical, 
                  super.lDirAlloc, reporter, codec, 
                  combinerClass, combineCollector, 
                  spilledRecordsCounter, reduceCombineInputCounter,
                  shuffledMapsCounter,
                  reduceShuffleBytes, failedShuffleCounter,
                  mergedMapOutputsCounter,
                  taskStatus, copyPhase, sortPhase, this,
                  mapOutputFile, localMapFiles);
//Initialize the shuffle plugin
shuffleConsumerPlugin.init(shuffleContext);
//Create the raw key-value iterator
RawKeyValueIterator rIter = null;
rIter = shuffleConsumerPlugin.run();
{
    //Check whether the map outputs are on the same node as this reduce task
    boolean isLocal = localMapFiles != null;
    //Get the number of fetchers (1 if local, otherwise the configured number of parallel copies, default 5)
    final int numFetchers = isLocal ? 1 : jobConf.getInt(MRJobConfig.SHUFFLE_PARALLEL_COPIES, 5);
    //Create the fetcher array
    Fetcher<K,V>[] fetchers = new Fetcher[numFetchers];
    //Create each Fetcher
    for (int i=0; i < numFetchers; ++i) {
        fetchers[i] = 
            new Fetcher<K,V>(jobConf, reduceId, scheduler, merger, reporter, metrics, this, 
                                       reduceTask.getShuffleSecret());
        	//Start the fetch thread
       		fetchers[i].start();
        	{
                //Start copying map output from the host
                copyFromHost(host);
            }
    }
    // Finish the on-going merges...
    RawKeyValueIterator kvIter = null;
    //org.apache.hadoop.mapreduce.task.reduce.MergeManagerImpl
    kvIter = merger.close();
    {
        //Final merge of in-memory and on-disk segments
        return finalMerge(jobConf, rfs, memory, disk);
        {
            return Merger.merge(job, fs, keyClass, valueClass, finalSegments,
                                finalSegments.size(), tmpDir, comparator, reporter,
                                spilledRecordsCounter, null, null);
            {
                return new MergeQueue<K, V>();
            }
        }
    }
    return kvIter;
}
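// The number of parallel fetchers created above comes from the job configuration.
// A hedged sketch of raising it for jobs with many map outputs (the property behind
// MRJobConfig.SHUFFLE_PARALLEL_COPIES is mapreduce.reduce.shuffle.parallelcopies):
Configuration conf = new Configuration();
conf.setInt(MRJobConfig.SHUFFLE_PARALLEL_COPIES, 10);   // default is 5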
//Release the map output files pulled to local disk -- the merge is complete at this point
mapOutputFilesOnDisk.clear();
//Get the key and value classes of the map output
Class keyClass = job.getMapOutputKeyClass();
Class valueClass = job.getMapOutputValueClass();
 

//------------------------ReduceTask.runNewReducer()
//rIter is in fact the MergeQueue
final RawKeyValueIterator rawIter = rIter;
//Wrap rawIter in an anonymous RawKeyValueIterator that delegates to it and reports progress
rIter = new RawKeyValueIterator() { /* getKey()/getValue()/next() delegate to rawIter */ };

//Create the task attempt context
org.apache.hadoop.mapreduce.TaskAttemptContext taskContext =
      new org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl(job,getTaskID(), reporter);
//Instantiate the Reducer ---> com.shsxt.ly.WordCountReducer
org.apache.hadoop.mapreduce.Reducer<INKEY,INVALUE,OUTKEY,OUTVALUE> reducer =
      (org.apache.hadoop.mapreduce.Reducer<INKEY,INVALUE,OUTKEY,OUTVALUE>)
        	ReflectionUtils.newInstance(taskContext.getReducerClass(), job);
//The record writer
org.apache.hadoop.mapreduce.RecordWriter<OUTKEY,OUTVALUE> trackedRW = 
      new NewTrackingRecordWriter<OUTKEY, OUTVALUE>(this, taskContext);
//Create the reducer's context object
org.apache.hadoop.mapreduce.Reducer.Context 
         reducerContext = createReduceContext(reducer, job, getTaskID(),
                                               rIter, reduceInputKeyCounter, 
                                               reduceInputValueCounter, 
                                               trackedRW,
                                               committer,
                                               reporter, comparator, keyClass,
                                               valueClass);
//Run the user's Reducer
reducer.run(reducerContext);
{
    //setup() and cleanup() are each called once, at the start and at the end
    setup(context);
    //Loop while there is another key group to process
    while (context.nextKey()) {
        reduce(context.getCurrentKey(), context.getValues(), context);
        //...inside the user's reduce(), e.g. WordCountReducer:
        //context.write(key, new IntWritable(count));
    }
    cleanup(context);
}
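// The reduce() invoked inside the loop above is the user's code. A hedged sketch of
// such a reducer (WordCountReducer is the illustrative class referenced earlier):
public static class WordCountReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Context context)
            throws IOException, InterruptedException {
        int count = 0;
        // values is backed by the raw iterator; nextKeyIsSame decides when the group ends
        for (IntWritable v : values) {
            count += v.get();
        }
        context.write(key, new IntWritable(count));
    }
}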

//-------------------------------context.nextKey()
while (hasMore && nextKeyIsSame) {
    nextKeyValue();
}
if (hasMore) {
    if (inputKeyCounter != null) {
        inputKeyCounter.increment(1);
    }
    return nextKeyValue();
    {
        //If there is no more data, return false immediately
        if (!hasMore) {
          key = null;
          value = null;
          return false;
        }
        
        firstValue = !nextKeyIsSame;
        //Get the next key and value
        DataInputBuffer nextKey = input.getKey();
        DataInputBuffer nextVal = input.getValue();
        //currentRawKey is set to the bytes of nextKey
        currentRawKey.set(nextKey.getData(), nextKey.getPosition(), 
                      nextKey.getLength() - nextKey.getPosition());
        
        //Record the lengths of the current key and value
        currentKeyLength = nextKey.getLength() - nextKey.getPosition();
    	currentValueLength = nextVal.getLength() - nextVal.getPosition();
        //Check again whether there is another record; if so, compare keys to set nextKeyIsSame
        hasMore = input.next();
		if (hasMore) {
            nextKey = input.getKey();
      		nextKeyIsSame = comparator.compare(currentRawKey.getBytes(), 0, 
                                     currentRawKey.getLength(),
                                     nextKey.getData(),
                                     nextKey.getPosition(),
                                     nextKey.getLength() - nextKey.getPosition()) == 0;
        }
    }
} else {
    return false;
}
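The nextKeyIsSame check above uses the grouping comparator to decide whether two consecutive records belong to the same reduce() call. That comparator can be replaced from the driver; a hedged sketch (FirstFieldGroupingComparator is an illustrative class, not part of Hadoop) that groups keys by their first tab-separated field only:

import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;

public class FirstFieldGroupingComparator extends WritableComparator {
    protected FirstFieldGroupingComparator() {
        super(Text.class, true);   // true: create key instances so compare(WritableComparable, ...) is used
    }

    @Override
    public int compare(WritableComparable a, WritableComparable b) {
        String fieldA = a.toString().split("\t", 2)[0];
        String fieldB = b.toString().split("\t", 2)[0];
        return fieldA.compareTo(fieldB);
    }
}

// Registered in the driver with:
// job.setGroupingComparatorClass(FirstFieldGroupingComparator.class);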
