MapReduce Source Code Analysis (1): The Overall Process and the Job Submission Process



The Overall MapReduce Process

(1) The input text to be processed.

(2) Before submit() is called, information about the input is gathered and a task plan is formed from the configured parameters (a minimal driver that kicks off this whole flow is sketched right after this list).

(3) The submission information is submitted.

(4) The number of MapTasks is computed.

(5) Each MapTask reads the text with TextInputFormat by default, which uses a RecordReader to read the K and V pairs.

(6) The Mapper processes the K,V pairs as needed; Context.write(K,V) writes the output to the collector (outputCollector).

(7) The collector writes the K,V data into the ring (circular) buffer.

(8) When the ring buffer fills up, the data in it is partitioned and sorted within each partition,

(9) and is then spilled to disk (ordered within each partition).

(10) The multiple spill files are merged with a merge sort.

(11) A Combiner can optionally be applied afterwards, followed by another merge sort.

(12) After all MapTasks have finished, MrAppMaster starts the corresponding number of ReduceTasks and assigns each one the range (partition) of data to process.

(13) Each ReduceTask actively copies the merged MapTask output for its partition to local storage, then merges and sorts the copied files.

(14) One group of key-value pairs at a time is read into the Reducer.

(15) Grouping is done by Key.

(16) context.write writes the result out; by default TextOutputFormat (the default OutputFormat) writes the K,V pairs.
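Before diving into the source, here is a minimal driver sketch that triggers the whole flow above. The WordCountMapper and WordCountReducer classes are hypothetical placeholders (a Mapper sketch appears at the end of this post); everything else is the standard Hadoop MapReduce API.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class WordCountDriver {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    Job job = Job.getInstance(conf, "word count");
    job.setJarByClass(WordCountDriver.class);     // the jar copied to the cluster during submission
    job.setMapperClass(WordCountMapper.class);    // hypothetical Mapper
    job.setReducerClass(WordCountReducer.class);  // hypothetical Reducer
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);
    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));
    // waitForCompletion() is where the submit() walkthrough below begins
    System.exit(job.waitForCompletion(true) ? 0 : 1);
  }
}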

The Job Submission Flow

Establish the connection

    Create the job submission proxy

    Decide whether it is a local proxy or a cluster proxy

Submit the job

    Create the staging path (stag) for submitting to the cluster

    Generate a JobId and create a jobId directory under the staging path

    (In cluster mode) copy the jar to the cluster

    Compute the split information and generate the split planning files

    Gather the configuration and write the XML configuration file

Submit the Job
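To make this outline concrete, the call chain traced in the rest of this post can be summarized as follows (all method names appear in the Hadoop source excerpts below):

Job.waitForCompletion()
  -> Job.submit()
       -> connect()                               // pick a YARNRunner or a LocalJobRunner
       -> JobSubmitter.submitJobInternal()
            -> JobSubmissionFiles.getStagingDir() // 1) staging path
            -> submitClient.getNewJobID()         // 2) jobId
            -> copyAndConfigureFiles()            // 3) upload the jar and other resources
            -> writeSplits()                      // 4) split planning files
            -> writeConf()                        // 5) job.xml
            -> submitClient.submitJob()           // 6) the actual submission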

Source Code of the Job Submission Phase

(1) The text to be processed

(2) Before submit(), gather information about the data to be processed

if (state == JobState.DEFINE) {// check the job state; on the first call it is DEFINE
  submit();// submit!! (breakpoint location)
}

Create the connection

public void submit() 
       throws IOException, InterruptedException, ClassNotFoundException {
  ensureState(JobState.DEFINE);
  setUseNewAPI();// switch the old API settings over to the new API
  connect();// create the connection!! (breakpoint location)

Obtain the cluster object

private synchronized void connect()
        throws IOException, InterruptedException, ClassNotFoundException {
  if (cluster == null) {// the cluster is still null on the first call
    cluster = 
      ugi.doAs(new PrivilegedExceptionAction<Cluster>() {
                 public Cluster run()
                        throws IOException, InterruptedException, 
                               ClassNotFoundException {
                   return new Cluster(getConfiguration());// build the Cluster
                   // object from the configuration (breakpoint location)
public Cluster(Configuration conf) throws IOException {
    this(null, conf);
  }// delegates to the two-argument constructor below
public Cluster(InetSocketAddress jobTrackAddr, Configuration conf) 
    throws IOException {// the two arguments are a socket address and the configuration
  this.conf = conf;
  this.ugi = UserGroupInformation.getCurrentUser();
  initialize(jobTrackAddr, conf);// initialize the cluster; jobTrackAddr is null here
}
private void initialize(InetSocketAddress jobTrackAddr, Configuration conf)
    throws IOException {

  initProviderList();// initialize the provider list
private void initProviderList() {// the list is null on the first call
  if (providerList == null) {
    synchronized (frameworkLoader) {
      if (providerList == null) {
        List<ClientProtocolProvider> localProviderList =
            new ArrayList<ClientProtocolProvider>();
        try {
            for (ClientProtocolProvider provider : frameworkLoader) {// framework loader
              localProviderList.add(provider);// first iteration: adds the YARN (cluster) provider;
            }// second iteration: adds the local provider, so the list size is 2
            //--------------------------- many lines of code omitted
               providerList = localProviderList;// publish the provider list
      }
    }
    //--> provider-list initialization is complete
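Where do those two providers come from? In the Hadoop source, frameworkLoader is a java.util.ServiceLoader over ClientProtocolProvider, so implementations are discovered from META-INF/services entries on the classpath. The standalone sketch below (an illustration, not the Hadoop source itself) uses the same mechanism; on a typical MapReduce classpath it prints the YARN provider and the local provider, which is why the list ends up with size 2.

import java.util.ServiceLoader;
import org.apache.hadoop.mapreduce.protocol.ClientProtocolProvider;

public class ProviderDiscoveryDemo {
  public static void main(String[] args) {
    // Same mechanism as Cluster.frameworkLoader: discover every
    // ClientProtocolProvider registered on the classpath.
    ServiceLoader<ClientProtocolProvider> loader =
        ServiceLoader.load(ClientProtocolProvider.class);
    for (ClientProtocolProvider provider : loader) {
      System.out.println(provider.getClass().getName());
    }
  }
}

Back in initialize(), the loop over these two providers then decides which one supplies the client protocol: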
private void initialize(InetSocketAddress jobTrackAddr, Configuration conf)
      throws IOException {

    initProviderList();
//-------------------- many lines of code omitted -->
    for (ClientProtocolProvider provider : providerList) {// the list size is 2 here
      LOG.debug("Trying ClientProtocolProvider : "// get the client protocol provider
                // 1. first iteration: the name of the YARN client provider; 4. second iteration: the local client provider
          + provider.getClass().getName());
      ClientProtocol clientProtocol = null;
      try {// jobTrackAddr is null
        if (jobTrackAddr == null) {// 2. try to create the YARN protocol --> 3; 5. then try the local protocol
          clientProtocol = provider.create(conf);// 6. the local protocol is obtained
        } else {
          clientProtocol = provider.create(jobTrackAddr, conf);
        }

        if (clientProtocol != null) {
          clientProtocolProvider = provider;// 7. clientProtocolProvider is now the local provider
          client = clientProtocol;
          LOG.debug("Picked " + provider.getClass().getName()
              + " as the ClientProtocolProvider");
          break;// 8. break out; initialization succeeds with the local runner
        } else {// 3. the YARN provider returned null (the framework is not configured as yarn)
          LOG.debug("Cannot pick " + provider.getClass().getName()
              + " as the ClientProtocolProvider - returned null protocol");
        }
      } catch (Exception e) {
        final String errMsg = "Failed to use " + provider.getClass().getName()
            + " due to error: ";
        initEx.addSuppressed(new IOException(errMsg, e));
        LOG.info(errMsg, e);
      }
    }
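Which provider wins in this loop is governed by the mapreduce.framework.name property (whose default is local): the YARN provider returns a non-null protocol (a YARNRunner) only when the value is yarn, otherwise the local provider supplies a LocalJobRunner, which is exactly the trace numbered above. A driver fragment to force cluster mode, assuming the cluster addresses come from the usual *-site.xml files:

Configuration conf = new Configuration();
conf.set("mapreduce.framework.name", "yarn");  // default "local" -> LocalJobRunner; "yarn" -> YARNRunner
Job job = Job.getInstance(conf, "run on yarn");

With the client protocol chosen, control returns to submit(), which hands the job off to a JobSubmitter: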
public void submit() 
       throws IOException, InterruptedException, ClassNotFoundException {
  ensureState(JobState.DEFINE);
  setUseNewAPI();
  connect();
  final JobSubmitter submitter = 
      getJobSubmitter(cluster.getFileSystem(), cluster.getClient());
  status = ugi.doAs(new PrivilegedExceptionAction<JobStatus>() {
    public JobStatus run() throws IOException, InterruptedException, 
    ClassNotFoundException {
      return submitter.submitJobInternal(Job.this, cluster);// the job submission proper (breakpoint location)
    }
JobStatus submitJobInternal(Job job, Cluster cluster) 
throws ClassNotFoundException, InterruptedException, IOException {

  //validate the job's output specs
  checkSpecs(job);

  Configuration conf = job.getConfiguration();
  addMRFrameworkToDistributedCache(conf);// add the MR framework to the distributed cache
	// 1) create the staging path used to submit data to the cluster (breakpoint location)
  Path jobStagingArea = JobSubmissionFiles.getStagingDir(cluster, conf);
  //configure the command line options correctly on the submitting dfs
  InetAddress ip = InetAddress.getLocalHost();
    if (ip != null) {
      submitHostAddress = ip.getHostAddress();
      submitHostName = ip.getHostName();
      conf.set(MRJobConfig.JOB_SUBMITHOST,submitHostName);
      conf.set(MRJobConfig.JOB_SUBMITHOSTADDR,submitHostAddress);
    }
    // 2) get the jobId and create the job path
    JobID jobId = submitClient.getNewJobID();
    job.setJobID(jobId);
    // build the submit path by appending the jobId to the staging path
    Path submitJobDir = new Path(jobStagingArea, jobId.toString());
    JobStatus status = null;
    //------------- several lines of code omitted
    // 3) copy the jar (and other resources) to the cluster
    copyAndConfigureFiles(job, submitJobDir);
    
private void copyAndConfigureFiles(Job job, Path jobSubmitDir) 
throws IOException {
  Configuration conf = job.getConfiguration();
  boolean useWildcards = conf.getBoolean(Job.USE_WILDCARD_FOR_LIBJARS,
      Job.DEFAULT_USE_WILDCARD_FOR_LIBJARS);
  JobResourceUploader rUploader = new JobResourceUploader(jtFs, useWildcards);
//upload the resources to the submit directory
  rUploader.uploadResources(job, jobSubmitDir);
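What uploadResources() pushes to the submit directory depends on what was attached to the job before submission: primarily the job jar set in the driver, plus any extra files, libjars, or archives passed on the command line. A hypothetical example (the class and file names are placeholders):

// In the driver: the jar containing this class is what is copied as job.jar in cluster mode.
job.setJarByClass(WordCountDriver.class);
// Extra resources passed through GenericOptionsParser are uploaded as well, e.g.:
//   hadoop jar wc.jar WordCountDriver -libjars extra-lib.jar -files stopwords.txt /input /output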
private int writeSplits(org.apache.hadoop.mapreduce.JobContext job,
    Path jobSubmitDir) throws IOException,
    InterruptedException, ClassNotFoundException {
    // 4) compute the input splits and generate the split planning files
  JobConf jConf = (JobConf)job.getConfiguration();
  int maps;
  if (jConf.getUseNewMapper()) {
    maps = writeNewSplits(job, jobSubmitDir);
private <T extends InputSplit>
int writeNewSplits(JobContext job, Path jobSubmitDir) throws IOException,
    InterruptedException, ClassNotFoundException {
  Configuration conf = job.getConfiguration();
  InputFormat<?, ?> input =
    ReflectionUtils.newInstance(job.getInputFormatClass(), conf);

  List<InputSplit> splits = input.getSplits(job);
  T[] array = (T[]) splits.toArray(new InputSplit[splits.size()]);

  // sort the splits into order based on size, so that the biggest
  // go first
  Arrays.sort(array, new SplitComparator());
  JobSplitWriter.createSplitFiles(jobSubmitDir, conf, 
      jobSubmitDir.getFileSystem(conf), array);
  return array.length;
}
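The number of MapTasks is therefore the number of splits returned by getSplits(). For the default TextInputFormat (a FileInputFormat), each split's size is the file's block size clamped between a configurable minimum and maximum, so by default there is roughly one split per HDFS block. A sketch of that sizing logic, using the standard property names:

long blockSize = 128L * 1024 * 1024;  // HDFS block size of the input file (commonly 128 MB)
long minSize   = 1L;                  // mapreduce.input.fileinputformat.split.minsize
long maxSize   = Long.MAX_VALUE;      // mapreduce.input.fileinputformat.split.maxsize
long splitSize = Math.max(minSize, Math.min(maxSize, blockSize));
// splitSize == blockSize by default, so the MapTask count is roughly
// totalInputSize / blockSize (a small trailing remainder is folded into the last split).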
// Write job file to submit dir
// 5) write the XML configuration file to the staging path
writeConf(conf, submitJobFile);
private void writeConf(Configuration conf, Path jobFile) 
    throws IOException {
  // Write job file to JobTracker's fs        
  FSDataOutputStream out = 
    FileSystem.create(jtFs, jobFile, 
                      new FsPermission(JobSubmissionFiles.JOB_FILE_PERMISSION));
  try {
    conf.writeXml(out);
// 6) submit the job and return the submission status
status = submitClient.submitJob(
    jobId, submitJobDir.toString(), job.getCredentials());
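At this point the submit directory (the jobId directory under the staging path) holds the artifacts produced by steps 3-5; the file names below are the standard ones defined in JobSubmissionFiles:

job.jar            - the application jar (cluster mode only, from copyAndConfigureFiles)
job.split          - the serialized input splits (from writeSplits)
job.splitmetainfo  - the split metadata later read by MrAppMaster (from writeSplits)
job.xml            - the complete job configuration (from writeConf)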

The Mapper write chain: context.write

//in the user's map(): write out one key/value pair
context.write(outK,outV);
//WrappedMapper.Context
public void write(KEYOUT key, VALUEOUT value) throws IOException,
    InterruptedException {
  mapContext.write(key, value);
}
//TaskInputOutputContextImpl
public void write(KEYOUT key, VALUEOUT value
                  ) throws IOException, InterruptedException {
  output.write(key, value);
}
//MapTask
public void write(K key, V value) throws IOException, InterruptedException {
  collector.collect(key, value,
                    partitioner.getPartition(key, value, partitions));
}
partitions = jobContext.getNumReduceTasks();// the default number of ReduceTasks is 1
if (partitions > 1) {
  partitioner = (org.apache.hadoop.mapreduce.Partitioner<K,V>)
    ReflectionUtils.newInstance(jobContext.getPartitionerClass(), job);
} else {
  partitioner = new org.apache.hadoop.mapreduce.Partitioner<K,V>() {
    @Override
    public int getPartition(K key, V value, int numPartitions) {
      return partitions - 1;// with a single ReduceTask this always returns partition 0
    }
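For comparison, when there is more than one ReduceTask and no custom partitioner is configured, getPartitionerClass() falls back to HashPartitioner, which spreads keys across partitions by hash:

// HashPartitioner's getPartition, shown for reference:
public int getPartition(K key, V value, int numReduceTasks) {
  return (key.hashCode() & Integer.MAX_VALUE) % numReduceTasks;
}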

Fetching the next record: the Mapper.run() loop

public void run(Context context) throws IOException, InterruptedException {
    setup(context);
    try {
      while (context.nextKeyValue()) {
        map(context.getCurrentKey(), context.getCurrentValue(), context);
      }
    } finally {
      cleanup(context);
    }
  }
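Finally, a minimal sketch of the Mapper that run() drives (the hypothetical WordCountMapper from the driver sketch at the top): map() is invoked once per (key, value) pair produced by the RecordReader, and each context.write() call goes through the write chain traced above.

import java.io.IOException;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

public class WordCountMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
  private final Text outK = new Text();
  private final IntWritable outV = new IntWritable(1);

  @Override
  protected void map(LongWritable key, Text value, Context context)
      throws IOException, InterruptedException {
    // TextInputFormat / LineRecordReader deliver (byte offset, line of text).
    for (String word : value.toString().split("\\s+")) {
      if (!word.isEmpty()) {
        outK.set(word);
        context.write(outK, outV);  // enters the collector / ring buffer path
      }
    }
  }
}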