MR源码流程
--main
--job
--submit()
--connect();[new Cluster(getConfiguration());创建集群对象,本地模式或者yarn模式]
--submitter.submitJobInternal(Job.this, cluster);[向系统提交作业的内部方法]
--JobSubmitter
--checkSpecs(job);[检查输出规格]
--jConf.getNumReduceTasks() == 0 ?
jConf.getUseNewMapper() : jConf.getUseNewReducer()[如果reducerTask的个数为0,则没有reducer阶段]
--int maps = writeSplits(job, submitJobDir);
--InputFormat<?, ?> input =
ReflectionUtils.newInstance(job.getInputFormatClass(), conf);[为给定的类创建一个对象,并从conf初始化它]
--List<InputSplit> splits = input.getSplits(job);[生成文件列表并将其转换为文件集。切片]
--Arrays.sort(array, new SplitComparator());根据大小将分割的部分排序,以便最大的先走
--JobSplitWriter.createSplitFiles(jobSubmitDir, conf,
jobSubmitDir.getFileSystem(conf), array);[可以去目录下查看切片文件]
--conf.setInt(MRJobConfig.NUM_MAPS, maps);[设置MR的map job数]
--writeConf(conf, submitJobFile);[写入作业文件以提交目录,会在目录下产生xml文件,里面包含了此次job的各种参数]
--printTokens(jobId, job.getCredentials());[实际提交作业]
--JobState:RUNNING
--monitorAndPrintJob();[该方法结束后map阶段完成]
--map context.write(k, v);[读一行数据,根据map方法处理数据]
--MapTask
--init()
--keySerializer = serializationFactory.getSerializer(keyClass);[序列化]
valSerializer = serializationFactory.getSerializer(valClass);
--job.getCompressMapOutput();[压缩]
--write()
--collector.collect(key, value,
partitioner.getPartition(key, value, partitions));
--flush()
--spillLock.lock();[加锁开始溢写,其中需要序列化]
--Task
--progress()[指示需要发送进度更新]
--String report = (" map " + StringUtils.formatPercent(mapProgress(), 0)+ " reduce " + StringUtils.formatPercent(reduceProgress(), 0));[记录map和reduce的进度]
--LineReader
--readDefaultLine() str.clear();[清除之前数据,开始读下面的数据]
--RecordReader nextKeyValue()[读取接下来的数据]
--sortAndSpill()[排序溢写,在目录中可以看到.out文件产生]
--writer = new Writer<K, V>(job, partitionOut, keyClass, valClass, codec,
spilledRecordsCounter);[溢写到目录]
--combinerRunner.combine(kvIter, combineCollector);[合并]
--spillLock.unlock();[溢写完成,释放锁]
--mergeParts();[在合并之前释放排序缓冲区]
--Shuffle
--init()
--scheduler = new ShuffleSchedulerImpl[创建shuffle调度器]
--merger = createMergeManager(context);
--rIter = shuffleConsumerPlugin.run()[为减轻ApplicationMaster上的OOM问题,对每个RPC调用获取的最大事件进行伸缩]
--finalMerge()
--Merger.writeFile(rIter, writer, reporter, job);[归并,然后将数据写入本地磁盘,reducer开始]
--Collections.sort(diskSegments, new Comparator<Segment<K,V>>()
--diskSegments.addAll(0, memDiskSegments);
memDiskSegments.clear();
--OutputFormat
--getRecordWriter(TaskAttemptContext job)
--reducer context.write(key,v);[最后写入最终文件]
以wordcount为例
package mapreduce;
import java.io.IOException;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.lib.HashPartitioner;
import org.apache.hadoop.mapreduce.Mapper;
/**
 * WordCount mapper: for each input line, emits a (word, 1) pair for every
 * whitespace-separated token.
 *
 * <p>Input key is the byte offset of the line (unused); input value is the
 * line text. Output is (Text word, IntWritable 1).
 */
public class WordcountMapper extends Mapper<LongWritable, Text, Text, IntWritable> {

    // Reused output objects to avoid allocating per record (standard Hadoop idiom).
    private final Text k = new Text();
    private final IntWritable v = new IntWritable(1);

    /**
     * Tokenizes one line and writes (word, 1) for each non-empty token.
     *
     * @param key     byte offset of the line within the split (ignored)
     * @param value   the line of text
     * @param context Hadoop context used to emit output pairs
     * @throws IOException          if the underlying write fails
     * @throws InterruptedException if the task is interrupted
     */
    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        // Get the line as a Java string.
        String line = value.toString();
        // Split on runs of whitespace: the original split(" ") produced empty
        // tokens for consecutive spaces and did not handle tabs, so "" would
        // be counted as a word.
        for (String word : line.split("\\s+")) {
            if (word.isEmpty()) {
                // A leading-whitespace line yields one empty first token; skip it.
                continue;
            }
            k.set(word);
            context.write(k, v);
        }
    }
}
package mapreduce;
import java.io.IOException;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
/**
 * WordCount reducer: sums the counts of each word.
 *
 * <p>Input is (Text word, Iterable of IntWritable counts); output is
 * (Text word, IntWritable total).
 */
public class WordcountReducer extends Reducer<Text, IntWritable, Text, IntWritable> {

    // Reused output value object to avoid allocating per key.
    private final IntWritable v = new IntWritable();

    /**
     * Accumulates all counts for {@code key} and emits the total.
     *
     * @param key     the word
     * @param values  partial counts for this word (one per mapper emission)
     * @param context Hadoop context used to emit the (word, total) pair
     * @throws IOException          if the underlying write fails
     * @throws InterruptedException if the task is interrupted
     */
    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Context context)
            throws IOException, InterruptedException {
        // Use a local accumulator instead of a mutable instance field: the
        // sum is per-invocation state and does not belong on the object.
        int sum = 0;
        for (IntWritable count : values) {
            sum += count.get();
        }
        v.set(sum);
        context.write(key, v);
    }
}
package mapreduce;
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
/**
 * WordCount driver: configures and submits the MapReduce job.
 *
 * <p>Usage: {@code WordcountDriver [inputPath outputPath]}. When no paths are
 * supplied on the command line, defaults to {@code input} / {@code output}.
 */
public class WordcountDriver {

    public static void main(String[] args)
            throws IOException, ClassNotFoundException, InterruptedException {
        // Fall back to default paths only when the caller supplied none.
        // The original code unconditionally overwrote args, making it
        // impossible to pass real input/output paths on the command line.
        if (args.length < 2) {
            args = new String[] { "input", "output" };
        }

        // Build the job from a fresh configuration.
        Configuration configuration = new Configuration();
        Job job = Job.getInstance(configuration);

        // Locate the jar containing this driver class.
        job.setJarByClass(WordcountDriver.class);

        // Wire up the mapper and reducer implementations.
        job.setMapperClass(WordcountMapper.class);
        job.setReducerClass(WordcountReducer.class);

        // Map output key/value types.
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);

        // Final (reducer) output key/value types.
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        // Input and output paths.
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        // Submit the job and block until completion, printing progress.
        boolean result = job.waitForCompletion(true);

        // Exit 0 on success, 1 on failure, so shell scripts can detect the outcome.
        System.exit(result ? 0 : 1);
    }
}