MapReduce源码分析
MapReduce总过程
(一)待处理的文本
(二)submit()提交前获取待处理信息,然后根据参数配置形成任务规划
(三)提交信息
(四)计算出MapTask的数量
(五)MapTask默认使用TextInputFormat读取文本信息。默认使用RecordReader读取K和V
(六)使用Mapper根据需要读取K,V。Context.write(K,V)写出到收集器outputCollector中
(七)收集器将KV向环形缓冲区写入数据
(八)环形缓冲区满后,对其中的数据排序分区,
(九)溢出到磁盘中(分区内有序)
(十)将多个溢写文件Merge归并排序
(十一)之后可使用Combiner合并,再使用merge归并排序
(十二)MrAppMaster在所有MapTask任务完成后,启动相应数量的ReduceTask处理数据范围
(十三)Reduce主动将MapTask中归并过的数据按照分区copy到本地。并合并文件和排序
(十四)一次读取一组到Reduce中。
(十五)根据Key分组
(十六)context.write写出。默认使用OutputFormat的实现TextOutputFormat写出KV
Job提交流程
建立连接
创建job提交代理
分析本地代理或者集群代理
job提交
创建集群提交路径stag
生成JobId并在stag路径下创建jobId路径
(集群模式下)拷贝jar包到集群
获取切片信息,生成切片规划文件
获取配置信息,写XML配置文件
提交Job
job阶段源码
(一)待处理文本
(二)submit()前获取处理数据信息
// Excerpt from Job.waitForCompletion(): a job is only submitted while still in DEFINE state.
if (state == JobState.DEFINE) {//check the job state; it is DEFINE on the first call
submit();//submit the job!! (breakpoint location)
}
创建连接
// Excerpt from Job.submit() (truncated): verify state, switch to the new API, then connect to the cluster.
public void submit()
throws IOException, InterruptedException, ClassNotFoundException {
ensureState(JobState.DEFINE);
setUseNewAPI();//rewrite old-API configuration keys to the new API
connect();//create the connection!! (breakpoint location)
获取集群对象
// Excerpt from Job.connect() (truncated): lazily builds the Cluster handle under the job's UGI.
private synchronized void connect()
throws IOException, InterruptedException, ClassNotFoundException {
if (cluster == null) {//cluster is null on the first call
cluster =
ugi.doAs(new PrivilegedExceptionAction<Cluster>() {
public Cluster run()
throws IOException, InterruptedException,
ClassNotFoundException {
return new Cluster(getConfiguration());//build the Cluster from the job
//configuration (breakpoint location)
// Single-argument Cluster constructor: no JobTracker address is given, so null is passed through.
public Cluster(Configuration conf) throws IOException {
this(null, conf);
}//delegates to the two-argument constructor below
// Two-argument Cluster constructor: stores the configuration, resolves the current user, then initializes.
public Cluster(InetSocketAddress jobTrackAddr, Configuration conf)
throws IOException {//the two parameters are the JobTracker socket address and the configuration
this.conf = conf;
this.ugi = UserGroupInformation.getCurrentUser();
initialize(jobTrackAddr, conf);//cluster initialization; jobTrackAddr is null here
}
// Excerpt from Cluster.initialize() (truncated): first step is building the protocol-provider list.
private void initialize(InetSocketAddress jobTrackAddr, Configuration conf)
throws IOException {
initProviderList();//initialize the provider list
// Double-checked-locking style lazy build of the ClientProtocolProvider list via ServiceLoader.
private void initProviderList() {//providerList is null on the first call
if (providerList == null) {
synchronized (frameworkLoader) {
if (providerList == null) {
List<ClientProtocolProvider> localProviderList =
new ArrayList<ClientProtocolProvider>();
try {
for (ClientProtocolProvider provider : frameworkLoader) {//framework service loader
localProviderList.add(provider);//first iteration: adds the YARN (cluster) provider;
}//second iteration: adds the local provider. List size is 2
//--------------------------- many lines omitted
providerList = localProviderList;//publish the provider list
}
}
//--> provider-list initialization finished
//-->初始化完毕
// Excerpt from Cluster.initialize() (truncated): tries each provider in turn and keeps the first
// one that returns a non-null ClientProtocol (YARN first, then the local runner).
private void initialize(InetSocketAddress jobTrackAddr, Configuration conf)
throws IOException {
initProviderList();
//-------------------- many lines omitted -->
for (ClientProtocolProvider provider : providerList) {//list size is 2 at this point
LOG.debug("Trying ClientProtocolProvider : "//fetch a client protocol provider
//1. first pass gets the YARN client's name; 4. second pass gets the local client protocol
+ provider.getClass().getName());
ClientProtocol clientProtocol = null;
try {//jobTrackAddr is null
if (jobTrackAddr == null) {//2. read the YARN configuration -->3; 5. read the local configuration
clientProtocol = provider.create(conf);//6. local configuration obtained
} else {
clientProtocol = provider.create(jobTrackAddr, conf);
}
if (clientProtocol != null) {
clientProtocolProvider = provider;//7. clientProtocolProvider ends up as the local provider
client = clientProtocol;
LOG.debug("Picked " + provider.getClass().getName()
+ " as the ClientProtocolProvider");
break;//8. exit the loop; initialization succeeded with the local runner
} else {//3. reading the YARN configuration failed (returned null)
LOG.debug("Cannot pick " + provider.getClass().getName()
+ " as the ClientProtocolProvider - returned null protocol");
}
} catch (Exception e) {
final String errMsg = "Failed to use " + provider.getClass().getName()
+ " due to error: ";
initEx.addSuppressed(new IOException(errMsg, e));
LOG.info(errMsg, e);
}
}
// Job.submit() again (truncated): after connecting, a JobSubmitter performs the real submission
// under the job owner's UGI.
public void submit()
throws IOException, InterruptedException, ClassNotFoundException {
ensureState(JobState.DEFINE);
setUseNewAPI();
connect();
final JobSubmitter submitter =
getJobSubmitter(cluster.getFileSystem(), cluster.getClient());
status = ugi.doAs(new PrivilegedExceptionAction<JobStatus>() {
public JobStatus run() throws IOException, InterruptedException,
ClassNotFoundException {
return submitter.submitJobInternal(Job.this, cluster);//the actual job-submission call (breakpoint location)
}
// Excerpt from JobSubmitter.submitJobInternal() (truncated): staging dir, job id, and jar upload.
JobStatus submitJobInternal(Job job, Cluster cluster)
throws ClassNotFoundException, InterruptedException, IOException {
//validate the jobs output specs
checkSpecs(job);
Configuration conf = job.getConfiguration();
addMRFrameworkToDistributedCache(conf);//add the MR framework to the distributed cache
// 1) create the staging path used to submit data to the cluster (breakpoint location)
Path jobStagingArea = JobSubmissionFiles.getStagingDir(cluster, conf);
//configure the command line options correctly on the submitting dfs
InetAddress ip = InetAddress.getLocalHost();
if (ip != null) {
submitHostAddress = ip.getHostAddress();
submitHostName = ip.getHostName();
conf.set(MRJobConfig.JOB_SUBMITHOST,submitHostName);
conf.set(MRJobConfig.JOB_SUBMITHOSTADDR,submitHostAddress);
}
// 2) obtain the job id and create the job path
JobID jobId = submitClient.getNewJobID();
job.setJobID(jobId);
//build the submit path: the jobId is appended to the staging path
Path submitJobDir = new Path(jobStagingArea, jobId.toString());
JobStatus status = null;
//------------- some code omitted
// 3) copy the jar package to the cluster
copyAndConfigureFiles(job, submitJobDir);
// Excerpt from JobSubmitter.copyAndConfigureFiles() (truncated): uploads job resources
// (jar, libjars, files) to the submit directory via a JobResourceUploader.
private void copyAndConfigureFiles(Job job, Path jobSubmitDir)
throws IOException {
Configuration conf = job.getConfiguration();
boolean useWildcards = conf.getBoolean(Job.USE_WILDCARD_FOR_LIBJARS,
Job.DEFAULT_USE_WILDCARD_FOR_LIBJARS);
JobResourceUploader rUploader = new JobResourceUploader(jtFs, useWildcards);
//upload the resources
rUploader.uploadResources(job, jobSubmitDir);
// Excerpt from JobSubmitter.writeSplits() (truncated): dispatches to the new-API split writer.
// The returned split count determines the number of MapTasks.
private int writeSplits(org.apache.hadoop.mapreduce.JobContext job,
Path jobSubmitDir) throws IOException,
InterruptedException, ClassNotFoundException {
// 4) compute the input splits and generate the split-plan file
JobConf jConf = (JobConf)job.getConfiguration();
int maps;
if (jConf.getUseNewMapper()) {
maps = writeNewSplits(job, jobSubmitDir);
// JobSubmitter.writeNewSplits(): asks the configured InputFormat for the splits, sorts them
// largest-first, writes the split files to the submit dir, and returns the split (MapTask) count.
private <T extends InputSplit>
int writeNewSplits(JobContext job, Path jobSubmitDir) throws IOException,
InterruptedException, ClassNotFoundException {
Configuration conf = job.getConfiguration();
// instantiate the InputFormat declared in the job configuration (TextInputFormat by default)
InputFormat<?, ?> input =
ReflectionUtils.newInstance(job.getInputFormatClass(), conf);
List<InputSplit> splits = input.getSplits(job);
// unchecked cast from List<InputSplit> to T[] — safe here by construction
T[] array = (T[]) splits.toArray(new InputSplit[splits.size()]);
// sort the splits into order based on size, so that the biggest
// go first
Arrays.sort(array, new SplitComparator());
JobSplitWriter.createSplitFiles(jobSubmitDir, conf,
jobSubmitDir.getFileSystem(conf), array);
return array.length;
}
// Write job file to submit dir
// 5) write the XML configuration file to the staging path
writeConf(conf, submitJobFile);
// JobSubmitter.writeConf() (truncated): serializes the job Configuration as XML on the job-tracker fs.
private void writeConf(Configuration conf, Path jobFile)
throws IOException {
// Write job file to JobTracker's fs
FSDataOutputStream out =
FileSystem.create(jtFs, jobFile,
new FsPermission(JobSubmissionFiles.JOB_FILE_PERMISSION));
try {
conf.writeXml(out);
// 6) submit the job and return the submission status
status = submitClient.submitJob(
jobId, submitJobDir.toString(), job.getCredentials());
// User Mapper code emits a key/value pair; the wrapped context delegates to the map context.
context.write(outK,outV);
public void write(KEYOUT key, VALUEOUT value) throws IOException,
InterruptedException {
mapContext.write(key, value);
}
//TaskInputOutputContextImpl: forwards the pair to the underlying RecordWriter (the output collector)
public void write(KEYOUT key, VALUEOUT value
) throws IOException, InterruptedException {
output.write(key, value);
}
//MapTask (NewOutputCollector): computes the partition for the pair, then hands it to the
//in-memory collector (the circular buffer)
public void write(K key, V value) throws IOException, InterruptedException {
collector.collect(key, value,
partitioner.getPartition(key, value, partitions));
}
partitions = jobContext.getNumReduceTasks();//the default number of ReduceTasks is 1
if (partitions > 1) {
// more than one reducer: instantiate the Partitioner class configured for the job
partitioner = (org.apache.hadoop.mapreduce.Partitioner<K,V>)
ReflectionUtils.newInstance(jobContext.getPartitionerClass(), job);
} else {
// single reducer: a trivial partitioner that always answers partition 0
partitioner = new org.apache.hadoop.mapreduce.Partitioner<K,V>() {
@Override
public int getPartition(K key, V value, int numPartitions) {
return partitions - 1;//partitions is 1 here, so this is always partition 0
}
//Mapper.run(): the map-side driver loop — setup once, map each key/value, cleanup in finally
public void run(Context context) throws IOException, InterruptedException {
setup(context);
try {
while (context.nextKeyValue()) {//advance to the next input record
map(context.getCurrentKey(), context.getCurrentValue(), context);
}
} finally {
cleanup(context);//always runs, even if map() throws
}
}