MapReduce Programming: A Source Code Walkthrough of the WordCount Execution Flow

1. How the data is read in

Take a look at the source of Mapper.java:

package org.apache.hadoop.mapreduce;

import java.io.IOException;

import org.apache.hadoop.classification.InterfaceAudience;
import org.apache.hadoop.classification.InterfaceStability;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.RawComparator;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.mapreduce.task.MapContextImpl;

/** 
 * // Maps takes input key/value pairs and turns them into intermediate key/value pairs
 * Maps input key/value pairs to a set of intermediate key/value pairs.  
 * 
 * // The input key/value pair means: key = the byte offset of the read position, value = one line of text
 * // The output key/value pair is the mapper's result, e.g. key = word, value = 1
 * // Maps are a set of independent tasks that transform our input records into intermediate
 * // records. The intermediate records need not be of the same type as the input records;
 * // an input record may map to zero or more output records (zero when the input is empty).
 * <p>Maps are the individual tasks which transform input records into a 
 * intermediate records. The transformed intermediate records need not be of 
 * the same type as the input records. A given input pair may map to zero or 
 * many output pairs.</p> 
 * 

 * @see InputFormat
 * @see JobContext
 * @see Partitioner  
 * @see Reducer
 */
@InterfaceAudience.Public
@InterfaceStability.Stable
public class Mapper<KEYIN, VALUEIN, KEYOUT, VALUEOUT> {

  /**
   * The <code>Context</code> passed on to the {@link Mapper} implementations.
   */
  public abstract class Context
    implements MapContext<KEYIN,VALUEIN,KEYOUT,VALUEOUT> {
  }
  
  /**
   * Called once at the beginning of the task.
   * Initialization hook (e.g. set up a database connection)
   */
  protected void setup(Context context
                       ) throws IOException, InterruptedException {
    // NOTHING
  }

  /**
   * Called once for each key/value pair in the input split. Most applications
   * should override this, but the default is the identity function.
   * The map() method that user code overrides.
   */
  @SuppressWarnings("unchecked")
  protected void map(KEYIN key, VALUEIN value, 
                     Context context) throws IOException, InterruptedException {
    context.write((KEYOUT) key, (VALUEOUT) value);
  }

  /**
   * Tear-down hook: clean up and release resources.
   * Called once at the end of the task.
   */
  protected void cleanup(Context context
                         ) throws IOException, InterruptedException {
    // NOTHING
  }
  
  /**
   * run() drives the task: setup(context) --> map(...) for every record --> cleanup(context)
   * A classic template method pattern:
   *        the whole execution flow is defined up front,
   *        and at runtime the steps are executed in that predefined order.
   *        It defines a lifecycle shared by both Mapper and Reducer:
   *            setup    initialization, e.g. obtaining a connection or opening I/O streams
   *            map/reduce
   *            cleanup  releasing resources, e.g. closing the connection or I/O streams
   * Expert users can override this method for more complete control over the
   * execution of the Mapper.
   * @param context
   * @throws IOException
   */
  public void run(Context context) throws IOException, InterruptedException {
    setup(context);
    try {
      while (context.nextKeyValue()) {
        map(context.getCurrentKey(), context.getCurrentValue(), context);
      }
    } finally {
      cleanup(context);
    }
  }
}
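
To make the template-method lifecycle enforced by run() concrete, here is a minimal sketch of a Mapper subclass. The class name and the per-split record counter are purely illustrative; the point is only the setup --> map-per-record --> cleanup order:

import java.io.IOException;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

// Illustrative only: demonstrates the setup --> map (once per record) --> cleanup order driven by run()
public class LifecycleDemoMapper extends Mapper<LongWritable, Text, Text, LongWritable> {

  private long records; // state created in setup(), updated in map(), reported in cleanup()

  @Override
  protected void setup(Context context) {
    records = 0; // in real code: obtain a connection, open I/O streams, ...
  }

  @Override
  protected void map(LongWritable key, Text value, Context context)
      throws IOException, InterruptedException {
    records++; // run() calls this once per key/value pair in the split
  }

  @Override
  protected void cleanup(Context context)
      throws IOException, InterruptedException {
    // in real code: release the connection, close streams, ...
    context.write(new Text("records-in-this-split"), new LongWritable(records));
  }
}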

Now step through the reading with a local debug session (reading source code without a debugger is cheating). Suppose the file we read contains:

haohao,haohao
spark,spark
hadoop,hadoop

First read: key = 0 (the offset at the start of the file), value = "haohao,haohao"
Second read: key = 15, value = "spark,spark"
The first line "haohao,haohao" is 13 characters; with its \r\n line terminator it occupies 15 bytes, so the second read starts at offset 15.
Third read: key = 28, value = "hadoop,hadoop"
The first two lines occupy 15 + 13 = 28 bytes in total, so the third read starts at offset 28.
At this point the Mapper's reading and emitting of data is done. One pass through the debugger makes it crystal clear!
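
To tie those three (offset, line) pairs back to WordCount, here is a minimal mapper sketch. The class name and the "," separator are assumptions for illustration; the actual job may tokenize differently. Each line is turned into (word, 1) pairs:

import java.io.IOException;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

public class WordCountMapper extends Mapper<LongWritable, Text, Text, IntWritable> {

  private static final IntWritable ONE = new IntWritable(1);
  private final Text word = new Text();

  @Override
  protected void map(LongWritable key, Text value, Context context)
      throws IOException, InterruptedException {
    // key   = byte offset of the line: 0, 15, 28 in the debug session above
    // value = the line itself, e.g. "haohao,haohao"
    for (String token : value.toString().split(",")) {
      word.set(token);
      context.write(word, ONE); // e.g. the first line emits ("haohao", 1) twice
    }
  }
}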

2. How the job is submitted (source code walkthrough)
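
Before walking through Job.submit() below, recall where submission starts on the user side: the driver calls job.waitForCompletion(true), which in turn calls submit(). A minimal driver sketch (WordCountMapper is the sketch from section 1; the nested WordCountReducer and the input/output paths are placeholders for illustration):

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class WordCountDriver {

  // Placeholder sum reducer so the sketch is self-contained
  public static class WordCountReducer
      extends Reducer<Text, IntWritable, Text, IntWritable> {
    private final IntWritable result = new IntWritable();

    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Context context)
        throws IOException, InterruptedException {
      int sum = 0;
      for (IntWritable v : values) {
        sum += v.get();
      }
      result.set(sum);
      context.write(key, result); // (word, total count)
    }
  }

  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    Job job = Job.getInstance(conf, "word count");

    job.setJarByClass(WordCountDriver.class);
    job.setMapperClass(WordCountMapper.class);
    job.setReducerClass(WordCountReducer.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);

    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    // waitForCompletion() calls submit(), which is where the walkthrough below picks up
    System.exit(job.waitForCompletion(true) ? 0 : 1);
  }
}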

public void submit() 
       throws IOException, InterruptedException, ClassNotFoundException {
  ensureState(JobState.DEFINE);
  setUseNewAPI(); // reconcile the old and new MapReduce APIs
  connect(); // connect to the cluster: local runner or a real cluster
  final JobSubmitter submitter = // used below to actually submit the job
      getJobSubmitter(cluster.getFileSystem(), cluster.getClient());
  status = ugi.doAs(new PrivilegedExceptionAction<JobStatus>() {
    public JobStatus run() throws IOException, InterruptedException, 
    ClassNotFoundException {
      return submitter.submitJobInternal(Job.this, cluster);
    }
  });
  state = JobState.RUNNING;
  LOG.info("The url to track the job: " + getTrackingURL());
 }

submit() hands the real work to JobSubmitter#submitJobInternal():

JobStatus submitJobInternal(Job job, Cluster cluster) 
throws ClassNotFoundException, InterruptedException, IOException {

  //validate the jobs output specs 
  checkSpecs(job); // validate the output spec: is an output configured, does the output path already exist, ...

  Configuration conf = job.getConfiguration();
  // add the MapReduce framework archive (if one is configured) to the distributed cache
  addMRFrameworkToDistributedCache(conf);

  Path jobStagingArea = JobSubmissionFiles.getStagingDir(cluster, conf);
  //configure the command line options correctly on the submitting dfs
  InetAddress ip = InetAddress.getLocalHost();
  if (ip != null) {
    submitHostAddress = ip.getHostAddress();
    submitHostName = ip.getHostName();
    conf.set(MRJobConfig.JOB_SUBMITHOST,submitHostName);
    conf.set(MRJobConfig.JOB_SUBMITHOSTADDR,submitHostAddress);
  }
  // get a new ID for this job (a local job ID, or one derived from the YARN application ID)
  JobID jobId = submitClient.getNewJobID();
  job.setJobID(jobId);
  // jobStagingArea/jobid
  Path submitJobDir = new Path(jobStagingArea, jobId.toString());
  JobStatus status = null;
  try {
    conf.set(MRJobConfig.USER_NAME,
        UserGroupInformation.getCurrentUser().getShortUserName());
    conf.set("hadoop.http.filter.initializers", 
        "org.apache.hadoop.yarn.server.webproxy.amfilter.AmFilterInitializer");
    conf.set(MRJobConfig.MAPREDUCE_JOB_DIR, submitJobDir.toString());
    LOG.debug("Configuring job " + jobId + " with " + submitJobDir 
        + " as the submit dir");
    // get delegation token for the dir
    TokenCache.obtainTokensForNamenodes(job.getCredentials(),
        new Path[] { submitJobDir }, conf);
    
    populateTokenCache(conf, job.getCredentials());

    // generate a secret to authenticate shuffle transfers
    if (TokenCache.getShuffleSecretKey(job.getCredentials()) == null) {
      KeyGenerator keyGen;
      try {
        keyGen = KeyGenerator.getInstance(SHUFFLE_KEYGEN_ALGORITHM);
        keyGen.init(SHUFFLE_KEY_LENGTH);
      } catch (NoSuchAlgorithmException e) {
        throw new IOException("Error generating shuffle secret key", e);
      }
      SecretKey shuffleKey = keyGen.generateKey();
      TokenCache.setShuffleSecretKey(shuffleKey.getEncoded(),
          job.getCredentials());
    }
    if (CryptoUtils.isEncryptedSpillEnabled(conf)) {
      conf.setInt(MRJobConfig.MR_AM_MAX_ATTEMPTS, 1);
      LOG.warn("Max job attempts set to 1 since encrypted intermediate" +
              "data spill is enabled");
    }
    // copy the job's resources (jar, files, archives) to jobStagingArea/jobid
    copyAndConfigureFiles(job, submitJobDir);

    Path submitJobFile = JobSubmissionFiles.getJobConfPath(submitJobDir);
    
    // Create the splits for the job
    LOG.debug("Creating splits at " + jtFs.makeQualified(submitJobDir));
      
    // ***** the core method: this is where the input data is split into input splits (covered in the next post)
    int maps = writeSplits(job, submitJobDir);
    conf.setInt(MRJobConfig.NUM_MAPS, maps);
    LOG.info("number of splits:" + maps);

    // write "queue admins of the queue to which job is being submitted"
    // to job file.
    String queue = conf.get(MRJobConfig.QUEUE_NAME,
        JobConf.DEFAULT_QUEUE_NAME);
    AccessControlList acl = submitClient.getQueueAdmins(queue);
    conf.set(toFullPropertyName(queue,
        QueueACL.ADMINISTER_JOBS.getAclName()), acl.getAclString());

    // removing jobtoken referrals before copying the jobconf to HDFS
    // as the tasks don't need this setting, actually they may break
    // because of it if present as the referral will point to a
    // different job.
    TokenCache.cleanUpTokenReferral(conf);

    if (conf.getBoolean(
        MRJobConfig.JOB_TOKEN_TRACKING_IDS_ENABLED,
        MRJobConfig.DEFAULT_JOB_TOKEN_TRACKING_IDS_ENABLED)) {
      // Add HDFS tracking ids
      ArrayList<String> trackingIds = new ArrayList<String>();
      for (Token<? extends TokenIdentifier> t :
          job.getCredentials().getAllTokens()) {
        trackingIds.add(t.decodeIdentifier().getTrackingId());
      }
      conf.setStrings(MRJobConfig.JOB_TOKEN_TRACKING_IDS,
          trackingIds.toArray(new String[trackingIds.size()]));
    }

    // Set reservation info if it exists
    ReservationId reservationId = job.getReservationId();
    if (reservationId != null) {
      conf.set(MRJobConfig.RESERVATION_ID, reservationId.toString());
    }

    // Write job file to submit dir
    writeConf(conf, submitJobFile);
    
    //
    // Now, actually submit the job (using the submit name)
    //
    printTokens(jobId, job.getCredentials());
    // finally submit the job
    status = submitClient.submitJob(
        jobId, submitJobDir.toString(), job.getCredentials());
    if (status != null) {
      return status;
    } else {
      throw new IOException("Could not launch job");
    }
  } finally {
    if (status == null) {
      LOG.info("Cleaning up the staging area " + submitJobDir);
      if (jtFs != null && submitJobDir != null)
        jtFs.delete(submitJobDir, true);

    }
  }
}

In the next post we will analyze how the input data is split.
