MR源码流程
--main
--job
--submit()
--connect();[new Cluster(getConfiguration());创建集群对象,本地模式或者yarn模式]
--submitter.submitJobInternal(Job.this, cluster);[向系统提交作业的内部方法]
--JobSubmitter
--checkSpecs(job);[检查输出规格]
--jConf.getNumReduceTasks() == 0 ?
jConf.getUseNewMapper() : jConf.getUseNewReducer()[如果reducerTask的个数为0,则没有reducer阶段]
--int maps = writeSplits(job, submitJobDir);
--InputFormat<?, ?> input =
ReflectionUtils.newInstance(job.getInputFormatClass(), conf);[为给定的类创建一个对象,并从conf初始化它]
--List<InputSplit> splits = input.getSplits(job);[生成文件列表并将其转换为文件集。切片]
--Arrays.sort(array, new SplitComparator());根据大小将分割的部分排序,以便最大的先走
--JobSplitWriter.createSplitFiles(jobSubmitDir, conf,
jobSubmitDir.getFileSystem(conf), array);[可以去目录下查看切片文件]
--conf.setInt(MRJobConfig.NUM_MAPS, maps);[设置MR的map job数]
--writeConf(conf, submitJobFile);[写入作业文件以提交目录,会在目录下产生xml文件,里面包含了此次job的各种参数]
--printTokens(jobId, job.getCredentials());[实际提交作业]
--JobState:RUNNING
--monitorAndPrintJob();[该方法结束后map阶段完成]
--map context.write(k, v);[读一行数据,根据map方法处理数据]
--MapTask
--init()
--keySerializer = serializationFactory.getSerializer(keyClass);[序列化]
valSerializer = serializationFactory.getSerializer(valClass);
--job.getCompressMapOutput();[压缩]
--write()
--collector.collect(key, value,
partitioner.getPartition(key, value, partitions));
--flush()
--spillLock.lock();[加锁开始溢写,其中需要序列化]
--Task
--progress()[指示需要发送进度更新]
--String report = (" map " + StringUtils.formatPercent(mapProgress(), 0)+ " reduce " + StringUtils.formatPercent(reduceProgress(), 0));[记录map和reduce的进度]
--LineReader
--readDefaultLine() str.clear();[清除之前数据,开始读下面的数据]
--RecordReader nextKeyValue()[读取接下来的数据]
--sortAndSpill()[排序溢写,在目录中可以看到.out文件产生]
--writer = new Writer<K, V>(job, partitionOut, keyClass, valClass, codec,
spilledRecordsCounter);[溢写到目录]
--combinerRunner.combine(kvIter, combineCollector);[合并]
--spillLock.unlock();[溢写完成,释放锁]
--mergeParts();[在合并之前释放排序缓冲区]
--Shuffle
--init()
--scheduler = new ShuffleSchedulerImpl[创建shuffle调度器]
--merger = createMergeManager(context);
--rIter = shuffleConsumerPlugin.run()[为减轻ApplicationMaster上的OOM问题,对每个RPC调用获取的最大事件进行伸缩]
--finalMerge()
--Merger.writeFile(rIter, writer, reporter, job);[归并,然后将数据写入本地磁盘,reducer开始]
--Collections.sort(diskSegments, new Comparator<Segment<K,V>>()
--diskSegments.addAll(0, memDiskSegments);
memDiskSegments.clear();
--OutputFormat
--getRecordWriter(TaskAttemptContext job)
--reducer context.write(key,v);[最后写入最终文件]
以wordcount为例
package mapreduce;
import java.io.IOException;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.lib.HashPartitioner;
import org.apache.hadoop.mapreduce.Mapper;
/**
 * WordCount mapper: for each input line, emits a (word, 1) pair for every
 * whitespace-separated token.
 *
 * <p>Input key is the byte offset of the line (unused); input value is the
 * line text. Output is (Text word, IntWritable 1).
 */
public class WordcountMapper extends Mapper<LongWritable, Text, Text, IntWritable> {

    // Reused output objects to avoid allocating per record (standard Hadoop idiom).
    private final Text k = new Text();
    private final IntWritable v = new IntWritable(1);

    /**
     * Tokenizes one line and writes (word, 1) for each non-empty token.
     *
     * @param key     byte offset of the line within the split (ignored)
     * @param value   the line of text
     * @param context Hadoop context used to emit output pairs
     * @throws IOException          if the underlying write fails
     * @throws InterruptedException if the task is interrupted
     */
    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        // Get the line as a Java string.
        String line = value.toString();
        // Split on runs of whitespace: the original split(" ") produced empty
        // tokens for consecutive spaces and did not handle tabs, so "" would
        // be counted as a word.
        for (String word : line.split("\\s+")) {
            if (word.isEmpty()) {
                // A leading-whitespace line yields one empty first token; skip it.
                continue;
            }
            k.set(word);
            context.write(k, v);
        }
    }
}
package mapreduce;
import java.io.IOException;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
/**
 * WordCount reducer: sums the counts of each word.
 *
 * <p>Input is (Text word, Iterable of IntWritable counts); output is
 * (Text word, IntWritable total).
 */
public class WordcountReducer extends Reducer<Text, IntWritable, Text, IntWritable> {

    // Reused output value object to avoid allocating per key.
    private final IntWritable v = new IntWritable();

    /**
     * Accumulates all counts for {@code key} and emits the total.
     *
     * @param key     the word
     * @param values  partial counts for this word (one per mapper emission)
     * @param context Hadoop context used to emit the (word, total) pair
     * @throws IOException          if the underlying write fails
     * @throws InterruptedException if the task is interrupted
     */
    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Context context)
            throws IOException, InterruptedException {
        // Use a local accumulator instead of a mutable instance field: the
        // sum is per-invocation state and does not belong on the object.
        int sum = 0;
        for (IntWritable count : values) {
            sum += count.get();
        }
        v.set(sum);
        context.write(key, v);
    }
}
package mapreduce;
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
/**
 * WordCount driver: configures and submits the MapReduce job.
 *
 * <p>Usage: {@code WordcountDriver [inputPath outputPath]}. When no paths are
 * supplied on the command line, defaults to {@code input} / {@code output}.
 */
public class WordcountDriver {

    public static void main(String[] args)
            throws IOException, ClassNotFoundException, InterruptedException {
        // Fall back to default paths only when the caller supplied none.
        // The original code unconditionally overwrote args, making it
        // impossible to pass real input/output paths on the command line.
        if (args.length < 2) {
            args = new String[] { "input", "output" };
        }

        // Build the job from a fresh configuration.
        Configuration configuration = new Configuration();
        Job job = Job.getInstance(configuration);

        // Locate the jar containing this driver class.
        job.setJarByClass(WordcountDriver.class);

        // Wire up the mapper and reducer implementations.
        job.setMapperClass(WordcountMapper.class);
        job.setReducerClass(WordcountReducer.class);

        // Map output key/value types.
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);

        // Final (reducer) output key/value types.
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        // Input and output paths.
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        // Submit the job and block until completion, printing progress.
        boolean result = job.waitForCompletion(true);

        // Exit 0 on success, 1 on failure, so shell scripts can detect the outcome.
        System.exit(result ? 0 : 1);
    }
}