MapReduce整体源码解析:
前言:本文章分析的源码基于Hadoop 3.1.3,主要是针对MR流程过一遍源码,源码的详细解释只涉及一些重要部分,鉴于水平有限,免不了有些说明不够正确,欢迎各位大佬指正。
废话不多说,开搞!
job提交流程:
1. waitForCompletion()提交,进入submit方法,通过connect方法获取当前job需要执行到本地还是Yarn
//1.waitForCompletion后,进入到job.java中,源码1587行
if (state == JobState.DEFINE) { //判断是否是定义的job
submit();
}
//2.进入submit
public void submit() //源码1560行
throws IOException, InterruptedException, ClassNotFoundException {
ensureState(JobState.DEFINE); //再次确认是否是定义的job
setUseNewAPI(); //使用新版API
connect();
//进入connect,步骤:
//return new Cluster(getConfiguration());
//初始化操作initialize(jobTrackAddr, conf); Cluster类中109行
//根据配置文件的参数信息,依次尝试YarnClientProtocolProvider和LocalClientProtocolProvider,哪个能创建出客户端就使用哪个
//获取当前job需要执行到本地还是Yarn
//最终:LocalClientProtocolProvider ==> LocalJobRunner
final JobSubmitter submitter =
getJobSubmitter(cluster.getFileSystem(), cluster.getClient());
status = ugi.doAs(new PrivilegedExceptionAction<JobStatus>() {
public JobStatus run() throws IOException, InterruptedException,
ClassNotFoundException {
return submitter.submitJobInternal(Job.this, cluster); //回到job.java,提交job, 1570行
}
});
2. return submitter.submitJobInternal(Job.this, cluster); 提交job
2.1先通过checkSpecs(job),jobSubmitter中143行; 检查job的输出路径。
// jobSubmitter中268行
private void checkSpecs(Job job) throws ClassNotFoundException,
InterruptedException, IOException {
JobConf jConf = (JobConf)job.getConfiguration();
// Check the output specification
if (jConf.getNumReduceTasks() == 0 ?
jConf.getUseNewMapper() : jConf.getUseNewReducer()) {
org.apache.hadoop.mapreduce.OutputFormat<?, ?> output =
ReflectionUtils.newInstance(job.getOutputFormatClass(),
job.getConfiguration());
output.checkOutputSpecs(job); //确认输出路径
} else {
jConf.getOutputFormat().checkOutputSpecs(jtFs, jConf);
}
}
//进入checkOutputSpecs(job),FileOutputFormat中151行
public void checkOutputSpecs(JobContext job
) throws FileAlreadyExistsException, IOException{
// Ensure that the output directory is set and not already there
Path outDir = getOutputPath(job);
if (outDir == null) { //假如输出路径没有设置,报错
throw new InvalidJobConfException("Output directory not set.");
}
// get delegation token for outDir's file system
TokenCache.obtainTokensForNamenodes(job.getCredentials(),
new Path[] { outDir }, job.getConfiguration());
if (outDir.getFileSystem(job.getConfiguration()).exists(outDir)) { //路径已存在,报错
throw new FileAlreadyExistsException("Output directory " + outDir +
" already exists");
}
}
2.2 生成Job提交的临时目录
Path jobStagingArea = JobSubmissionFiles.getStagingDir(cluster, conf); //jobSubmitter中148行
file:/tmp/hadoop/mapred/staging/yong1727053970/.staging
2.3为当前Job生成Id
JobID jobId = submitClient.getNewJobID(); //jobSubmitter中157行
2.4确定Job的提交路径
Path submitJobDir = new Path(jobStagingArea, jobId.toString()); //jobSubmitter中159行,尚未创建
2.5创建job的提交路径
copyAndConfigureFiles(job, submitJobDir);//源码第194行
2.6生成切片信息 ,并返回切片的个数(重要,小心!)
int maps = writeSplits(job, submitJobDir); //源码第200行
//具体切片方法在FileInputFormat下的getSplits(), 389行。
public List<InputSplit> getSplits(JobContext job) throws IOException {
StopWatch sw = new StopWatch().start();
//getFormatMinSplitSize()返回1,getMinSplitSize(job)默认为0,可以设置参数修改
long minSize = Math.max(getFormatMinSplitSize(), getMinSplitSize(job));//即默认为1
//默认为Long.MAX_VALUE,很大
long maxSize = getMaxSplitSize(job);
if (isSplitable(job, path)) { //如果文件是可切分的, 414行
long blockSize = file.getBlockSize(); //块大小,集群模式下默认128M,本地模式下33554432B(32M)
long splitSize = computeSplitSize(blockSize, minSize, maxSize); //计算切片大小
}
//minsize默认为1,maxsize默认很大,blocksize默认为128M,则此方法默认返回128M
protected long computeSplitSize(long blockSize, long minSize, long maxSize) {
return Math.max(minSize, Math.min(maxSize, blockSize));
} //即,如果要增大切片大小,可修改配置,把minsize调到大于blockSize;反之要减小切片大小,可把maxsize调到小于blockSize。
//418行
long bytesRemaining = length; //文件剩余大小
//只有当 文件剩余大小 / 切片大小 > SPLIT_SLOP 时,才继续切片。SPLIT_SLOP为1.1
//啥意思呢,就是说如果文件被切了多次后剩余129M,此时切片大小是128M的话,129/128≈1.008不大于1.1,那这剩余的部分不会再进行切分,而是被整体当作最后一个切片。这个设定是为了避免产生过小的切片,防止各MapTask处理的数据量过于不均。
while (((double) bytesRemaining)/splitSize > SPLIT_SLOP) {
int blkIndex = getBlockIndex(blkLocations, length-bytesRemaining);
splits.add(makeSplit(path, length-bytesRemaining, splitSize,
blkLocations[blkIndex].getHosts(),
blkLocations[blkIndex].getCachedHosts()));
bytesRemaining -= splitSize;
}
2.7通过切片的个数设置MapTask的个数
conf.setInt(MRJobConfig.NUM_MAPS, maps); //201行
2.8将当前Job相关的配置信息写到job提交路径下
writeConf(conf, submitJobFile); //245行, 生成job.xml
2.9真正提交Job
status = submitClient.submitJob(
jobId, submitJobDir.toString(), job.getCredentials()); //251行
//LocalJobRunner下788行
Job job = new Job(JobID.downgrade(jobid), jobSubmitDir); //构造真正执行的job,之前的job是以用户角度看的
2.10 若提交失败(status为null),则在finally中删除Job的临时提交目录
jtFs.delete(submitJobDir, true); //262行
MapTask的工作机制
1.接Job提交流程2.9中,转到LocalJobRunner的内部类Job 的run()方法
//537行,读取job.splitmetainfo
try {
TaskSplitMetaInfo[] taskSplitMetaInfos =
SplitMetaInfoReader.readSplitMetaInfo(jobId, localFs, conf, systemJobDir);
//获得reduceTask个数
int numReduceTasks = job.getNumReduceTasks();
2.根据切片的个数, 创建执行MapTask的 MapTaskRunnable(线程)
//547行
List<RunnableWithThrowable> mapRunnables = getMapTaskRunnables(
taskSplitMetaInfos, jobId, mapOutputFiles);
3.创建线程池
若是有多个切片,则需要多个maptask,就需要多个线程去执行
ExecutorService mapService = createMapExecutor();
4.执行 MapTaskRunnable
runTasks(mapRunnables, mapService, "map");
5.因为Runnable提交给线程池执行,接下来会执行MapTaskRunnable的run方法。
6.执行 LocalJobRunner的内部类Job的内部类MapTaskRunnable 中的run()方法.
//LocalJobRunner中254行
MapTask map = new MapTask(systemJobFile.toString(), mapId, taskId,
info.getSplitIndex(), 1); //创建MapTask对象
7.执行MapTask中的run方法
//271行
map.run(localConf, Job.this);
//run方法中执行runNewMapper, 347行
runNewMapper(job, splitMetaInfo, umbilical, reporter);
//进到runNewMapper
//创建一个context,MapTask中752行
org.apache.hadoop.mapreduce.TaskAttemptContext taskContext =
new org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl(job, getTaskID(), reporter);
//获取mapper,MapTask中757行
org.apache.hadoop.mapreduce.Mapper<INKEY,INVALUE,OUTKEY,OUTVALUE> mapper =
(org.apache.hadoop.mapreduce.Mapper<INKEY,INVALUE,OUTKEY,OUTVALUE>)
ReflectionUtils.newInstance(taskContext.getMapperClass(), job);
//创建inputFormat 761行
org.apache.hadoop.mapreduce.InputFormat<INKEY,INVALUE> inputFormat =
(org.apache.hadoop.mapreduce.InputFormat<INKEY,INVALUE>)
ReflectionUtils.newInstance(taskContext.getInputFormatClass(), job);
//重新构建切片对象 765行
org.apache.hadoop.mapreduce.InputSplit split = null;
split = getSplitDetails(new Path(splitIndex.getSplitLocation()),
splitIndex.getStartOffset());
LOG.info("Processing split: " + split);
//获得读取器RecordReader,770行
org.apache.hadoop.mapreduce.RecordReader<INKEY,INVALUE> input =
new NewTrackingRecordReader<INKEY,INVALUE>
(split, inputFormat, reporter, taskContext);
//构造缓冲区对象,778行
if (job.getNumReduceTasks() == 0) {
output = //如果NumReduceTasks=0,则直接往本地文件写,不走shuffle流程
new NewDirectOutputCollector(taskContext, job, umbilical, reporter);
} else { //不为0,则构建缓冲区
output = new NewOutputCollector(taskContext, job, umbilical, reporter);
}
//进入NewOutputCollector 710行
进入collector = createSortingCollector(job, reporter);
collector.init(context);//初始化缓冲区 408行, 进入init方法
final float spillper = //908行
job.getFloat(JobContext.MAP_SORT_SPILL_PERCENT, (float)0.8); //溢写百分比0.8
final int sortmb = job.getInt(MRJobConfig.IO_SORT_MB,
MRJobConfig.DEFAULT_IO_SORT_MB); //缓冲区大小100M
sorter = ReflectionUtils.newInstance(job.getClass( //994行
MRJobConfig.MAP_SORT_CLASS, QuickSort.class, //获取快速排序对象
IndexedSorter.class), job); //不改变元素位置,只根据索引排序
kvbuffer = new byte[maxMemUsage]; //kvbuffer创建 1000行
bufvoid = kvbuffer.length;
kvmeta = ByteBuffer.wrap(kvbuffer).order(ByteOrder.nativeOrder()).asIntBuffer(); //kvmeta
// k/v serialization kv序列化 1017行
// output counters 计数器 1027行
// compression 压缩 1034行
// combiner 1043行
------缓冲区构建完成--------
//缓冲区构建完成后,根据NumReduceTasks获得分区器
partitions = jobContext.getNumReduceTasks(); //711行
if (partitions > 1) {
partitioner = (org.apache.hadoop.mapreduce.Partitioner<K,V>)
ReflectionUtils.newInstance(jobContext.getPartitionerClass(), job);
} else {
partitioner = new org.apache.hadoop.mapreduce.Partitioner<K,V>() {
@Override
public int getPartition(K key, V value, int numPartitions) {
return partitions - 1;
}
};
// 执行WordCountMapper中的run方法。 实际执行的是WordCountMapper继承的Mapper中的run方法。 799行
mapper.run(mapperContext);
//进入Mapper中的run方法
public void run(Context context) throws IOException, InterruptedException {
setup(context);
try {
while (context.nextKeyValue()) {
map(context.getCurrentKey(), context.getCurrentValue(), context); //执行用户的map方法
}
//不断执行context.write(outK,outV),每次写出一对KV都会被缓冲区收集
Shuffle流程
1.进入context.write(outK,outV)
//WrappedMapper类下,是Mapper的包装类
mapContext.write(key, value); //112行
//再进入
output.write(key, value); //89行
//再进入 MapTask下,将map写出的kv 计算好分区后,收集到缓冲区中。
collector.collect(key, value, partitioner.getPartition(key, value, partitions)); //727行
//进入collect方法,若满足溢写条件,开始发生溢写
startSpill(); //MapTask下1126行
spillReady.signal(); //线程间的通信,通知溢写线程开始溢写 1602行
//溢写线程调用 sortAndSpill() 方法发生溢写操作
private void sortAndSpill() throws IOException, ClassNotFoundException, InterruptedException{//1609
//approximate the length of the output file to be the length of the
//buffer + header lengths for the partitions
final long size = distanceTo(bufstart, bufend, bufvoid) +
partitions * APPROX_HEADER_LENGTH;
FSDataOutputStream out = null;
FSDataOutputStream partitionOut = null;
try {
// create spill file
final SpillRecord spillRec = new SpillRecord(partitions); //按分区数创建溢写索引记录(SpillRecord),溢写文件本身在下面通过rfs.create创建
final Path filename =
mapOutputFile.getSpillFileForWrite(numSpills, size);
out = rfs.create(filename);
}
}
//溢写前先进行排序 1625行
sorter.sort(MapOutputBuffer.this, mstart, mend, reporter);
// 进行溢写操作,1629行
for (int i = 0; i < partitions; ++i) { //根据分区溢写
IFile.Writer<K, V> writer = null;
try {
long segmentStart = out.getPos();
partitionOut = CryptoUtils.wrapIfNecessary(job, out, false);
writer = new Writer<K, V>(job, partitionOut, keyClass, valClass, codec,
spilledRecordsCounter);
if (combinerRunner == null) { //如果没有设置combiner,则直接溢写
// spill directly
DataInputBuffer key = new DataInputBuffer();
while (spindex < mend &&
kvmeta.get(offsetFor(spindex % maxRec) + PARTITION) == i) {
final int kvoff = offsetFor(spindex % maxRec);
int keystart = kvmeta.get(kvoff + KEYSTART);
int valstart = kvmeta.get(kvoff + VALSTART);
key.reset(kvbuffer, keystart, valstart - keystart);
getVBytesForOffset(kvoff, value);
writer.append(key, value);
++spindex;
}
} else {
int spstart = spindex;
while (spindex < mend &&
kvmeta.get(offsetFor(spindex % maxRec)
+ PARTITION) == i) {
++spindex;
}
}
}
}
//溢写完成,关闭
writer.close(); //1667行
//map持续往缓冲区写,达到溢写条件,就继续溢写 ........ 可能整个过程中发生N次溢写。
//假如上一次溢写完后,剩余进入到缓冲区的数据没有达到溢写条件,那么当map中的所有数据都已经处理完后,在关闭output时,会把缓冲区中的数据刷到磁盘中(其实就是没有达到溢写条件的数据也要写到磁盘)
output.close(mapperContext); //805行
//进入close,数据刷写,735行
collector.flush();
//进入flush,通过溢写的方法进行剩余数据的刷写 1505行
sortAndSpill();
//溢写完成后,进行归并 1527行
mergeParts();
for(int i = 0; i < numSpills; i++) {
filename[i] = mapOutputFile.getSpillFile(i);
finalOutFileSize += rfs.getFileStatus(filename[i]).getLen();
} //根据溢写的次数,得到要归并多少个溢写文件 //1852行
//生成最终存储数据的两个文件(目前是空的)
Path finalOutputFile =
mapOutputFile.getOutputFileForWrite(finalOutFileSize); //内容
Path finalIndexFile =
mapOutputFile.getOutputIndexFileForWrite(finalIndexFileSize); //索引
//按照分区进行归并 1925行
for (int parts = 0; parts < partitions; parts++) {
//create the segments to be merged
List<Segment<K,V>> segmentList =
new ArrayList<Segment<K, V>>(numSpills);
}
//归并
RawKeyValueIterator kvIter = Merger.merge(job, rfs, //1950行
keyClass, valClass, codec,
segmentList, mergeFactor,
new Path(mapId.toString()),
job.getOutputKeyComparator(), reporter, sortSegments,
null, spilledRecordsCounter, sortPhase.phase(),
TaskType.MAP);
//将归并的文件写入磁盘 1959行
long segmentStart = finalOut.getPos();
finalPartitionOut = CryptoUtils.wrapIfNecessary(job, finalOut, false);
Writer<K, V> writer = //创建输出流
new Writer<K, V>(job, finalPartitionOut, keyClass, valClass, codec, spilledRecordsCounter);
if (combinerRunner == null || numSpills < minSpillsForCombine) { //如果combiner为空或者溢写次数少于
Merger.writeFile(kvIter, writer, reporter, job); //minSpillsForCombine(默认为3),则不走combine
} else {
combineCollector.setWriter(writer);
combinerRunner.combine(kvIter, combineCollector);
}
//归并完成,关流 1972
writer.close();
//删除溢写文件 1991
for(int i = 0; i < numSpills; i++) {
rfs.delete(filename[i],true);
}
//剩余file.out file.out.index,等待reduce的拷贝
ReduceTask机制
1.在LocalJobRunner的内部类Job中的run()方法中
try {
if (numReduceTasks > 0) {//根据设置的reduceTask的个数,创建对应个数 LocalJobRunner$Job$ReduceTaskRunnable
List<RunnableWithThrowable> reduceRunnables = getReduceTaskRunnables(
jobId, mapOutputFiles);
ExecutorService reduceService = createReduceExecutor();
runTasks(reduceRunnables, reduceService, "reduce");
}
}
//执行LocalJobRunner$Job$ReduceTaskRunnable 中的run方法
//创建ReduceTask对象 334行
ReduceTask reduce = new ReduceTask(systemJobFile.toString(),reduceId, taskId, mapIds.size(), 1);
// 执行ReduceTask的run方法 347行
reduce.run(localConf, Job.this);
//进入ReduceTask类
//run方法中,先设置步骤 324行
if (isMapOrReduce()) {
copyPhase = getProgress().addPhase("copy");
sortPhase = getProgress().addPhase("sort");
reducePhase = getProgress().addPhase("reduce");
}
//copy,sort过程省略吧,好累了
// 然后runNewReducer //390行
if (useNewApi) {
runNewReducer(job, umbilical, reporter, rIter, comparator, keyClass, valueClass);
}
//执行WordCountReducer的run方法 ,实际执行的是WordCountReducer继承的Reducer类中的run方法. 628行
reducer.run(reducerContext);
//进入run方法 171行
reduce(context.getCurrentKey(), context.getValues(), context); //终于执行到用户WordCountReducer中的 reduce方法.
context.write(k,v) //用户的reduce中将处理完的kv写出.
reduceContext.write(key, value); //105行
output.write(key, value); //89行
real.write(key,value); // 通过RecordWriter将kv写出 559行
out.write(NEWLINE); //通过输出流将数据写到结果文件中 101行
//终于结束了,写这些框架的人都是怎么样的天才!