The first program anyone writes when learning to code is hello world, and Hadoop is no exception: its hello world is WordCount, the word-counting example.
package org.apache.hadoop.examples;

import java.io.IOException;
import java.util.StringTokenizer;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;

public class WordCount {

  public static class TokenizerMapper
       extends Mapper<Object, Text, Text, IntWritable>{

    private final static IntWritable one = new IntWritable(1);
    private Text word = new Text();

    public void map(Object key, Text value, Context context
                    ) throws IOException, InterruptedException {
      StringTokenizer itr = new StringTokenizer(value.toString());
      while (itr.hasMoreTokens()) {
        word.set(itr.nextToken());
        context.write(word, one);
      }
    }
  }

  public static class IntSumReducer
       extends Reducer<Text,IntWritable,Text,IntWritable> {
    private IntWritable result = new IntWritable();

    public void reduce(Text key, Iterable<IntWritable> values,
                       Context context
                       ) throws IOException, InterruptedException {
      int sum = 0;
      for (IntWritable val : values) {
        sum += val.get();
      }
      result.set(sum);
      context.write(key, result);
    }
  }

  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
    if (otherArgs.length != 2) {
      System.err.println("Usage: wordcount <in> <out>");
      System.exit(2);
    }
    Job job = new Job(conf, "word count");
    job.setJarByClass(WordCount.class);
    job.setMapperClass(TokenizerMapper.class);
    job.setCombinerClass(IntSumReducer.class);
    job.setReducerClass(IntSumReducer.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);
    FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
    FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
    System.exit(job.waitForCompletion(true) ? 0 : 1);
  }
}
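To make the data flow concrete, here is a tiny hypothetical run (the input line is invented for illustration): map() emits a (word, 1) pair per token, and after the shuffle each reduce() call receives one word together with all of its 1s and sums them.

input line    : hello hadoop hello world
map output    : (hello, 1) (hadoop, 1) (hello, 1) (world, 1)
reduce input  : (hadoop, [1])   (hello, [1, 1])   (world, [1])
final output  : hadoop 1, hello 2, world 1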
Both the map() in Mapper and the reduce() in Reducer take a parameter of type Context:
public void map(Object key, Text value, Context context)
    throws IOException, InterruptedException {
  StringTokenizer itr = new StringTokenizer(value.toString());
  while (itr.hasMoreTokens()) {
    word.set(itr.nextToken());
    context.write(word, one);
  }
}

public void reduce(Text key, Iterable<IntWritable> values, Context context)
    throws IOException, InterruptedException {
  int sum = 0;
  for (IntWritable val : values) {
    sum += val.get();
  }
  result.set(sum);
  context.write(key, result);
}
So what exactly is this Context for? Taken literally it is just a "context". From the call in map():
context.write(word, one);
and the call in reduce():
context.write(key, result);
we can tell that context is used to carry data as well as other runtime state: map() writes its key/value pairs into the context, which delivers them to the Reducer to be reduced; after reduce() has processed them, the results are written into the context again, and from there Hadoop writes them out to HDFS.
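Beyond key/value pairs, the context also exposes job-level state such as the Configuration (this comes from JobContext, which we will reach at the end of this post). A minimal sketch of what that looks like in practice — the class name and the property key my.wordcount.min.length are invented for illustration, not part of the example above:

import java.io.IOException;
import java.util.StringTokenizer;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

// Hypothetical variant of TokenizerMapper that pulls a tuning parameter
// out of the job Configuration through the context in setup().
public class MinLengthTokenizerMapper
    extends Mapper<Object, Text, Text, IntWritable> {

  private final static IntWritable one = new IntWritable(1);
  private Text word = new Text();
  private int minLength;

  @Override
  protected void setup(Context context) {
    // "my.wordcount.min.length" is an assumed, application-defined key;
    // the driver would have called conf.setInt(...) before submitting the job.
    minLength = context.getConfiguration().getInt("my.wordcount.min.length", 1);
  }

  @Override
  protected void map(Object key, Text value, Context context)
      throws IOException, InterruptedException {
    StringTokenizer itr = new StringTokenizer(value.toString());
    while (itr.hasMoreTokens()) {
      String token = itr.nextToken();
      if (token.length() >= minLength) {
        word.set(token);
        context.write(word, one);
      }
    }
  }
}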
So what does Context actually look like? Let's examine its inheritance structure. Although Mapper and Reducer each contain a Context class, the two are not identical. Here is the Mapper source (Reducer is organized the same way, with its own inner Context):
public class Mapper<KEYIN, VALUEIN, KEYOUT, VALUEOUT> {

  public class Context
    extends MapContext<KEYIN,VALUEIN,KEYOUT,VALUEOUT> {
    public Context(Configuration conf, TaskAttemptID taskid,
                   RecordReader<KEYIN,VALUEIN> reader,
                   RecordWriter<KEYOUT,VALUEOUT> writer,
                   OutputCommitter committer,
                   StatusReporter reporter,
                   InputSplit split) throws IOException, InterruptedException {
      super(conf, taskid, reader, writer, committer, reporter, split);
    }
  }

  /**
   * Called once at the beginning of the task.
   */
  protected void setup(Context context
                       ) throws IOException, InterruptedException {
    // NOTHING
  }

  /**
   * Called once for each key/value pair in the input split. Most applications
   * should override this, but the default is the identity function.
   */
  @SuppressWarnings("unchecked")
  protected void map(KEYIN key, VALUEIN value,
                     Context context) throws IOException, InterruptedException {
    context.write((KEYOUT) key, (VALUEOUT) value);
  }

  /**
   * Called once at the end of the task.
   */
  protected void cleanup(Context context
                         ) throws IOException, InterruptedException {
    // NOTHING
  }

  /**
   * Expert users can override this method for more complete control over the
   * execution of the Mapper.
   * @param context
   * @throws IOException
   */
  public void run(Context context) throws IOException, InterruptedException {
    setup(context);
    try {
      while (context.nextKeyValue()) {
        map(context.getCurrentKey(), context.getCurrentValue(), context);
      }
    } finally {
      cleanup(context);
    }
  }
}
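In passing, note the run() method and its Javadoc: expert users may override it to take full control of the mapper's execution loop. A small hedged sketch of what such an override could look like (the class name and the record limit are invented for illustration):

import java.io.IOException;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

// Hypothetical "expert" mapper that overrides run() to stop early after a
// fixed number of records, while keeping the default setup/map/cleanup flow.
public class FirstNRecordsMapper
    extends Mapper<Object, Text, Text, IntWritable> {

  private static final int LIMIT = 1000;  // assumed cap, purely illustrative

  @Override
  public void run(Context context) throws IOException, InterruptedException {
    setup(context);
    try {
      int seen = 0;
      // Same driving loop as the default run(), but with an extra guard.
      while (seen < LIMIT && context.nextKeyValue()) {
        map(context.getCurrentKey(), context.getCurrentValue(), context);
        seen++;
      }
    } finally {
      cleanup(context);
    }
  }
}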
Coming back to Context: both Contexts turn out to be inner classes — Mapper's Context extends MapContext, while Reducer's Context extends ReduceContext. Pulling out just the two declarations, Mapper.Context first and then Reducer.Context:
public class Context
  extends MapContext<KEYIN,VALUEIN,KEYOUT,VALUEOUT> {
  public Context(Configuration conf, TaskAttemptID taskid,
                 RecordReader<KEYIN,VALUEIN> reader,
                 RecordWriter<KEYOUT,VALUEOUT> writer,
                 OutputCommitter committer,
                 StatusReporter reporter,
                 InputSplit split) throws IOException, InterruptedException {
    super(conf, taskid, reader, writer, committer, reporter, split);
  }
}
public class Context
  extends ReduceContext<KEYIN,VALUEIN,KEYOUT,VALUEOUT> {
  public Context(Configuration conf, TaskAttemptID taskid,
                 RawKeyValueIterator input,
                 Counter inputKeyCounter,
                 Counter inputValueCounter,
                 RecordWriter<KEYOUT,VALUEOUT> output,
                 OutputCommitter committer,
                 StatusReporter reporter,
                 RawComparator<KEYIN> comparator,
                 Class<KEYIN> keyClass,
                 Class<VALUEIN> valueClass
                 ) throws IOException, InterruptedException {
    super(conf, taskid, input, inputKeyCounter, inputValueCounter,
          output, committer, reporter,
          comparator, keyClass, valueClass);
  }
}
Compared with the classes they extend, Mapper.Context and Reducer.Context add no members or methods and override nothing; they simply re-wrap MapContext and ReduceContext. So the real things to analyze are MapContext and ReduceContext:
package org.apache.hadoop.mapreduce;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;

public class MapContext<KEYIN,VALUEIN,KEYOUT,VALUEOUT>
    extends TaskInputOutputContext<KEYIN,VALUEIN,KEYOUT,VALUEOUT> {
  private RecordReader<KEYIN,VALUEIN> reader;
  private InputSplit split;

  public MapContext(Configuration conf, TaskAttemptID taskid,
                    RecordReader<KEYIN,VALUEIN> reader,
                    RecordWriter<KEYOUT,VALUEOUT> writer,
                    OutputCommitter committer,
                    StatusReporter reporter,
                    InputSplit split) {
    super(conf, taskid, writer, committer, reporter);
    this.reader = reader;
    this.split = split;
  }

  public InputSplit getInputSplit() {
    return split;
  }

  public KEYIN getCurrentKey() throws IOException, InterruptedException {
    return reader.getCurrentKey();
  }

  public VALUEIN getCurrentValue() throws IOException, InterruptedException {
    return reader.getCurrentValue();
  }

  public boolean nextKeyValue() throws IOException, InterruptedException {
    return reader.nextKeyValue();
  }

}

package org.apache.hadoop.mapreduce;

import java.io.IOException;
import java.util.Iterator;
import java.util.NoSuchElementException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.DataInputBuffer;
import org.apache.hadoop.io.RawComparator;
import org.apache.hadoop.io.serializer.Deserializer;
import org.apache.hadoop.io.serializer.SerializationFactory;
import org.apache.hadoop.mapred.RawKeyValueIterator;
import org.apache.hadoop.util.Progressable;

public class ReduceContext<KEYIN,VALUEIN,KEYOUT,VALUEOUT>
    extends TaskInputOutputContext<KEYIN,VALUEIN,KEYOUT,VALUEOUT> {
  private RawKeyValueIterator input;
  private Counter inputKeyCounter;
  private Counter inputValueCounter;
  private RawComparator<KEYIN> comparator;
  private KEYIN key;                        // current key
  private VALUEIN value;                    // current value
  private boolean firstValue = false;       // first value in key
  private boolean nextKeyIsSame = false;    // more w/ this key
  private boolean hasMore;                  // more in file
  protected Progressable reporter;
  private Deserializer<KEYIN> keyDeserializer;
  private Deserializer<VALUEIN> valueDeserializer;
  private DataInputBuffer buffer = new DataInputBuffer();
  private BytesWritable currentRawKey = new BytesWritable();
  private ValueIterable iterable = new ValueIterable();

  public ReduceContext(Configuration conf, TaskAttemptID taskid,
                       RawKeyValueIterator input,
                       Counter inputKeyCounter,
                       Counter inputValueCounter,
                       RecordWriter<KEYOUT,VALUEOUT> output,
                       OutputCommitter committer,
                       StatusReporter reporter,
                       RawComparator<KEYIN> comparator,
                       Class<KEYIN> keyClass,
                       Class<VALUEIN> valueClass
                       ) throws InterruptedException, IOException{
    super(conf, taskid, output, committer, reporter);
    this.input = input;
    this.inputKeyCounter = inputKeyCounter;
    this.inputValueCounter = inputValueCounter;
    this.comparator = comparator;
    SerializationFactory serializationFactory = new SerializationFactory(conf);
    this.keyDeserializer = serializationFactory.getDeserializer(keyClass);
    this.keyDeserializer.open(buffer);
    this.valueDeserializer = serializationFactory.getDeserializer(valueClass);
    this.valueDeserializer.open(buffer);
    hasMore = input.next();
  }

  /** Start processing next unique key. */
  public boolean nextKey() throws IOException,InterruptedException {
    while (hasMore && nextKeyIsSame) {
      nextKeyValue();
    }
    if (hasMore) {
      if (inputKeyCounter != null) {
        inputKeyCounter.increment(1);
      }
      return nextKeyValue();
    } else {
      return false;
    }
  }

  public boolean nextKeyValue() throws IOException, InterruptedException {
    if (!hasMore) {
      key = null;
      value = null;
      return false;
    }
    firstValue = !nextKeyIsSame;
    DataInputBuffer next = input.getKey();
    currentRawKey.set(next.getData(), next.getPosition(),
                      next.getLength() - next.getPosition());
    buffer.reset(currentRawKey.getBytes(), 0, currentRawKey.getLength());
    key = keyDeserializer.deserialize(key);
    next = input.getValue();
    buffer.reset(next.getData(), next.getPosition(),
                 next.getLength() - next.getPosition());
    value = valueDeserializer.deserialize(value);
    hasMore = input.next();
    if (hasMore) {
      next = input.getKey();
      nextKeyIsSame = comparator.compare(currentRawKey.getBytes(), 0,
                                         currentRawKey.getLength(),
                                         next.getData(),
                                         next.getPosition(),
                                         next.getLength() - next.getPosition()
                                         ) == 0;
    } else {
      nextKeyIsSame = false;
    }
    inputValueCounter.increment(1);
    return true;
  }

  public KEYIN getCurrentKey() {
    return key;
  }

  public VALUEIN getCurrentValue() {
    return value;
  }

  protected class ValueIterator implements Iterator<VALUEIN> {

    public boolean hasNext() {
      return firstValue || nextKeyIsSame;
    }

    @Override
    public VALUEIN next() {
      // if this is the first record, we don't need to advance
      if (firstValue) {
        firstValue = false;
        return value;
      }
      // if this isn't the first record and the next key is different, they
      // can't advance it here.
      if (!nextKeyIsSame) {
        throw new NoSuchElementException("iterate past last value");
      }
      // otherwise, go to the next key/value pair
      try {
        nextKeyValue();
        return value;
      } catch (IOException ie) {
        throw new RuntimeException("next value iterator failed", ie);
      } catch (InterruptedException ie) {
        // this is bad, but we can't modify the exception list of java.util
        throw new RuntimeException("next value iterator interrupted", ie);
      }
    }

    public void remove() {
      throw new UnsupportedOperationException("remove not implemented");
    }

  }

  protected class ValueIterable implements Iterable<VALUEIN> {
    private ValueIterator iterator = new ValueIterator();
    @Override
    public Iterator<VALUEIN> iterator() {
      return iterator;
    }
  }

  public
  Iterable<VALUEIN> getValues() throws IOException, InterruptedException {
    return iterable;
  }
}
Their inheritance structure is as follows: Mapper.Context extends MapContext and Reducer.Context extends ReduceContext; both of those extend TaskInputOutputContext, which extends TaskAttemptContext, which in turn extends JobContext.
A summary of MapContext's own methods (not counting what it inherits: as the structure above shows, MapContext and ReduceContext both extend TaskInputOutputContext and neither overrides an inherited method, so everything they inherit is identical): getInputSplit(), getCurrentKey(), getCurrentValue() and nextKeyValue(), all of which simply delegate to the RecordReader.
Likewise, ReduceContext's own methods: nextKey(), nextKeyValue(), getCurrentKey(), getCurrentValue() and getValues(), backed by the ValueIterator/ValueIterable inner classes.
The usage of these methods is mostly self-evident. The one subtlety worth calling out is that getValues() always returns the same ValueIterable, whose ValueIterator simply drives nextKeyValue() forward, so the values of a key can be traversed only once (and with Writable types the value instance handed back is typically reused); a short sketch of the practical consequence follows.
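Here is a hedged sketch of the defensive copy a reducer needs if it wants to look at the values more than once (the class name and the two-pass logic are invented for illustration; this is not part of the original WordCount):

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

// Hypothetical reducer that needs the values twice (max, then how many values
// equal the max). Because getValues() is single-pass and may reuse the value
// instance, the first pass copies each value out of the Writable.
public class TwoPassReducer
    extends Reducer<Text, IntWritable, Text, IntWritable> {

  private final IntWritable result = new IntWritable();

  @Override
  protected void reduce(Text key, Iterable<IntWritable> values, Context context)
      throws IOException, InterruptedException {
    List<Integer> copy = new ArrayList<Integer>();
    int max = Integer.MIN_VALUE;
    for (IntWritable val : values) {     // first (and only) real pass
      copy.add(val.get());               // copy the int, not the Writable
      max = Math.max(max, val.get());
    }
    int countOfMax = 0;
    for (int v : copy) {                 // second pass over our own copy
      if (v == max) {
        countOfMax++;
      }
    }
    result.set(countOfMax);
    context.write(key, result);
  }
}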
Next, look at the parent class they share, TaskInputOutputContext. It declares a few abstract methods — getCurrentKey(), getCurrentValue() and nextKeyValue() — which are exactly the methods MapContext and ReduceContext have in common, and which each of them must implement for itself. write(KEYOUT key, VALUEOUT value), on the other hand, writes a key/value pair into the underlying output stream. When writing MapReduce programs you never manage that stream yourself: write() is already wrapped up for you, so you simply call it and Hadoop hands the pair on to the next stage of processing.
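Because write() is usable wherever a Context is in scope, output does not have to happen inside map() itself. One common pattern, sketched here with invented names and not taken from the original example, is to aggregate in memory during map() and emit everything from cleanup() (so-called in-mapper combining):

import java.io.IOException;
import java.util.HashMap;
import java.util.Map;
import java.util.StringTokenizer;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

// Hypothetical "in-mapper combining" variant of TokenizerMapper: counts are
// accumulated in a HashMap and only written to the context in cleanup().
public class InMapperCombiningMapper
    extends Mapper<Object, Text, Text, IntWritable> {

  private final Map<String, Integer> counts = new HashMap<String, Integer>();

  @Override
  protected void map(Object key, Text value, Context context) {
    StringTokenizer itr = new StringTokenizer(value.toString());
    while (itr.hasMoreTokens()) {
      String token = itr.nextToken();
      Integer c = counts.get(token);
      counts.put(token, c == null ? 1 : c + 1);
    }
  }

  @Override
  protected void cleanup(Context context)
      throws IOException, InterruptedException {
    // write() works here exactly as it does inside map(); the framework
    // still routes the pairs into the shuffle.
    for (Map.Entry<String, Integer> e : counts.entrySet()) {
      context.write(new Text(e.getKey()), new IntWritable(e.getValue()));
    }
  }
}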
None of the classes covered so far — Mapper.Context and Reducer.Context, MapContext and ReduceContext, TaskInputOutputContext, TaskAttemptContext — introduces job-level state of its own beyond the readers, writers, committers and iteration bookkeeping passed through their constructors; the job-level members all live in the ancestor JobContext, which looks like this:
package org.apache.hadoop.mapreduce;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.RawComparator;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.mapreduce.lib.partition.HashPartitioner;
import org.apache.hadoop.security.Credentials;
import org.apache.hadoop.security.UserGroupInformation;

/**
 * A read-only view of the job that is provided to the tasks while they
 * are running.
 */
public class JobContext {
  // Put all of the attribute names in here so that Job and JobContext are
  // consistent.
  protected static final String INPUT_FORMAT_CLASS_ATTR =
    "mapreduce.inputformat.class";
  protected static final String MAP_CLASS_ATTR = "mapreduce.map.class";
  protected static final String COMBINE_CLASS_ATTR = "mapreduce.combine.class";
  protected static final String REDUCE_CLASS_ATTR = "mapreduce.reduce.class";
  protected static final String OUTPUT_FORMAT_CLASS_ATTR =
    "mapreduce.outputformat.class";
  protected static final String PARTITIONER_CLASS_ATTR =
    "mapreduce.partitioner.class";

  protected final org.apache.hadoop.mapred.JobConf conf;
  protected final Credentials credentials;
  private JobID jobId;

  public static final String JOB_NAMENODES = "mapreduce.job.hdfs-servers";

  public static final String JOB_ACL_VIEW_JOB = "mapreduce.job.acl-view-job";
  public static final String JOB_ACL_MODIFY_JOB =
    "mapreduce.job.acl-modify-job";

  public static final String CACHE_FILE_VISIBILITIES =
    "mapreduce.job.cache.files.visibilities";
  public static final String CACHE_ARCHIVES_VISIBILITIES =
    "mapreduce.job.cache.archives.visibilities";

  public static final String JOB_CANCEL_DELEGATION_TOKEN =
    "mapreduce.job.complete.cancel.delegation.tokens";
  public static final String USER_LOG_RETAIN_HOURS =
    "mapred.userlog.retain.hours";

  /**
   * The UserGroupInformation object that has a reference to the current user
   */
  protected UserGroupInformation ugi;

  public JobContext(Configuration conf, JobID jobId) {
    this.conf = new org.apache.hadoop.mapred.JobConf(conf);
    this.credentials = this.conf.getCredentials();
    this.jobId = jobId;
    try {
      this.ugi = UserGroupInformation.getCurrentUser();
    } catch (IOException e) {
      throw new RuntimeException(e);
    }
  }

  void setJobID(JobID jobId) {
    this.jobId = jobId;
  }

  /**
   * Return the configuration for the job.
   * @return the shared configuration object
   */
  public Configuration getConfiguration() {
    return conf;
  }

  /**
   * Get credentials for the job.
   * @return credentials for the job
   */
  public Credentials getCredentials() {
    return credentials;
  }

  /**
   * Get the unique ID for the job.
   * @return the object with the job id
   */
  public JobID getJobID() {
    return jobId;
  }

  /**
   * Get configured the number of reduce tasks for this job. Defaults to
   * <code>1</code>.
   * @return the number of reduce tasks for this job.
   */
  public int getNumReduceTasks() {
    return conf.getNumReduceTasks();
  }

  /**
   * Get the current working directory for the default file system.
   *
   * @return the directory name.
   */
  public Path getWorkingDirectory() throws IOException {
    return conf.getWorkingDirectory();
  }

  /**
   * Get the key class for the job output data.
   * @return the key class for the job output data.
   */
  public Class<?> getOutputKeyClass() {
    return conf.getOutputKeyClass();
  }

  /**
   * Get the value class for job outputs.
   * @return the value class for job outputs.
   */
  public Class<?> getOutputValueClass() {
    return conf.getOutputValueClass();
  }

  /**
   * Get the key class for the map output data. If it is not set, use the
   * (final) output key class. This allows the map output key class to be
   * different than the final output key class.
   * @return the map output key class.
   */
  public Class<?> getMapOutputKeyClass() {
    return conf.getMapOutputKeyClass();
  }

  /**
   * Get the value class for the map output data. If it is not set, use the
   * (final) output value class This allows the map output value class to be
   * different than the final output value class.
   *
   * @return the map output value class.
   */
  public Class<?> getMapOutputValueClass() {
    return conf.getMapOutputValueClass();
  }

  /**
   * Get the user-specified job name. This is only used to identify the
   * job to the user.
   *
   * @return the job's name, defaulting to "".
   */
  public String getJobName() {
    return conf.getJobName();
  }

  /**
   * Get the {@link InputFormat} class for the job.
   *
   * @return the {@link InputFormat} class for the job.
   */
  @SuppressWarnings("unchecked")
  public Class<? extends InputFormat<?,?>> getInputFormatClass()
     throws ClassNotFoundException {
    return (Class<? extends InputFormat<?,?>>)
      conf.getClass(INPUT_FORMAT_CLASS_ATTR, TextInputFormat.class);
  }

  /**
   * Get the {@link Mapper} class for the job.
   *
   * @return the {@link Mapper} class for the job.
   */
  @SuppressWarnings("unchecked")
  public Class<? extends Mapper<?,?,?,?>> getMapperClass()
     throws ClassNotFoundException {
    return (Class<? extends Mapper<?,?,?,?>>)
      conf.getClass(MAP_CLASS_ATTR, Mapper.class);
  }

  /**
   * Get the combiner class for the job.
   *
   * @return the combiner class for the job.
   */
  @SuppressWarnings("unchecked")
  public Class<? extends Reducer<?,?,?,?>> getCombinerClass()
     throws ClassNotFoundException {
    return (Class<? extends Reducer<?,?,?,?>>)
      conf.getClass(COMBINE_CLASS_ATTR, null);
  }

  /**
   * Get the {@link Reducer} class for the job.
   *
   * @return the {@link Reducer} class for the job.
   */
  @SuppressWarnings("unchecked")
  public Class<? extends Reducer<?,?,?,?>> getReducerClass()
     throws ClassNotFoundException {
    return (Class<? extends Reducer<?,?,?,?>>)
      conf.getClass(REDUCE_CLASS_ATTR, Reducer.class);
  }

  /**
   * Get the {@link OutputFormat} class for the job.
   *
   * @return the {@link OutputFormat} class for the job.
   */
  @SuppressWarnings("unchecked")
  public Class<? extends OutputFormat<?,?>> getOutputFormatClass()
     throws ClassNotFoundException {
    return (Class<? extends OutputFormat<?,?>>)
      conf.getClass(OUTPUT_FORMAT_CLASS_ATTR, TextOutputFormat.class);
  }

  /**
   * Get the {@link Partitioner} class for the job.
   *
   * @return the {@link Partitioner} class for the job.
   */
  @SuppressWarnings("unchecked")
  public Class<? extends Partitioner<?,?>> getPartitionerClass()
     throws ClassNotFoundException {
    return (Class<? extends Partitioner<?,?>>)
      conf.getClass(PARTITIONER_CLASS_ATTR, HashPartitioner.class);
  }

  /**
   * Get the {@link RawComparator} comparator used to compare keys.
   *
   * @return the {@link RawComparator} comparator used to compare keys.
   */
  public RawComparator<?> getSortComparator() {
    return conf.getOutputKeyComparator();
  }

  /**
   * Get the pathname of the job's jar.
   * @return the pathname
   */
  public String getJar() {
    return conf.getJar();
  }

  /**
   * Get the user defined {@link RawComparator} comparator for
   * grouping keys of inputs to the reduce.
   *
   * @return comparator set by the user for grouping values.
   * @see Job#setGroupingComparatorClass(Class) for details.
   */
  public RawComparator<?> getGroupingComparator() {
    return conf.getOutputValueGroupingComparator();
  }
}
Most of these members are static final constants with preset values, or are assigned once in the constructor and essentially never change afterwards. JobContext also exposes accessors for them, namely all of those get*() methods.
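Since every Context ultimately is a JobContext, those get*() accessors can also be called from inside a task through the context parameter. A small sketch (class name invented for illustration) that simply prints a few of them in setup():

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

// Hypothetical reducer that inspects job-level settings through its Context,
// using the JobContext getters shown above.
public class IntrospectingReducer
    extends Reducer<Text, IntWritable, Text, IntWritable> {

  @Override
  protected void setup(Context context) {
    System.out.println("job name     : " + context.getJobName());
    System.out.println("reduce tasks : " + context.getNumReduceTasks());
    System.out.println("output key   : " + context.getOutputKeyClass().getName());
    System.out.println("output value : " + context.getOutputValueClass().getName());
  }
}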
That wraps up the inheritance and implementation of Context. There is honestly nothing deep here — it is just the API and the source code, tidied up in one place.