package org.apache.hadoop.mapreduce;
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.RawComparator;
import org.apache.hadoop.io.compress.CompressionCodec;
/**
 * Simplified excerpt of Hadoop's {@code Mapper}, which maps input key/value
 * pairs to intermediate key/value pairs.
 *
 * <p>Fix over the pasted version: the paste dropped the generic type
 * parameters, so {@code KEYIN}/{@code VALUEIN}/{@code KEYOUT}/{@code VALUEOUT}
 * were undeclared identifiers and the class could not compile. They are
 * declared here, matching the real Hadoop signature.
 */
public class Mapper<KEYIN, VALUEIN, KEYOUT, VALUEOUT> {

  /**
   * The context handed to {@link #map}; its constructor simply forwards the
   * task plumbing (reader, writer, committer, reporter, split) to the
   * {@code MapContext} superclass.
   */
  public class Context
      extends MapContext<KEYIN, VALUEIN, KEYOUT, VALUEOUT> {
    public Context(Configuration conf, TaskAttemptID taskid,
                   RecordReader<KEYIN, VALUEIN> reader,
                   RecordWriter<KEYOUT, VALUEOUT> writer,
                   OutputCommitter committer,
                   StatusReporter reporter,
                   InputSplit split) throws IOException, InterruptedException {
      super(conf, taskid, reader, writer, committer, reporter, split);
    }
  }

  /**
   * Called once at the start of the task (see {@link #run}), before any call
   * to {@link #map}. Empty by default; override to set up per-task state.
   */
  protected void setup(Context context
                       ) throws IOException, InterruptedException {
    // NOTHING
  }

  /**
   * Called once for each key/value pair in the input split. The default is
   * the identity mapping: each input pair is written unchanged to the output.
   * The casts are unchecked because KEYIN/KEYOUT (and VALUEIN/VALUEOUT) are
   * unrelated type parameters; the default only works when they coincide.
   */
  @SuppressWarnings("unchecked")
  protected void map(KEYIN key, VALUEIN value,
                     Context context) throws IOException, InterruptedException {
    context.write((KEYOUT) key, (VALUEOUT) value);
  }

  /**
   * Called once after all input pairs have been processed (see {@link #run}).
   * Empty by default; override to release per-task resources.
   */
  protected void cleanup(Context context
                         ) throws IOException, InterruptedException {
    // NOTHING
  }

  /**
   * Drives the map task: {@link #setup}, then {@link #map} for every
   * key/value pair in the split, then {@link #cleanup}. Applications may
   * override this for finer control, e.g. multi-threaded mappers.
   */
  public void run(Context context) throws IOException, InterruptedException {
    setup(context);
    while (context.nextKeyValue()) {
      map(context.getCurrentKey(), context.getCurrentValue(), context);
    }
    cleanup(context);
  }
}
以下是我看了源码之后的个人理解,很多地方我也不理解,第一次读源码,本身基础也不扎实,今后看了别人的说法之后还会渐渐补充,我也不知道对与错,希望看到的人对我的说法予以指正,感激不尽!
这段代码首先是一个内部类,
// Excerpt quoted from the class above: Context is an inner class whose
// constructor only forwards the task plumbing objects to MapContext.
public class Context
extends MapContext {
// NOTE(review): raw types here — the real Hadoop source declares
// Context extends MapContext<KEYIN, VALUEIN, KEYOUT, VALUEOUT>.
public Context(Configuration conf, TaskAttemptID taskid,
RecordReader reader,
RecordWriter writer,
OutputCommitter committer,
StatusReporter reporter,
InputSplit split) throws IOException, InterruptedException {
super(conf, taskid, reader, writer, committer, reporter, split);
}
}
这个类继承了MapContext,而MapContext又继承了TaskInputOutputContext,TaskInputOutputContext继承了TaskAttemptContext,TaskAttemptContext继承了JobContext,这几个类逐渐把context中的内容初始化了,具体的初始化以后再慢慢理解。
接下来是里面的4个方法体,
// Quoted excerpt: called once at the start of run(), before any map() call.
// The default implementation is intentionally empty.
protected void setup(Context context
) throws IOException, InterruptedException {
// NOTHING
}
// Quoted excerpt: the default map() is the identity mapping — each input
// pair is written straight to the output. The casts between the in/out type
// parameters are unchecked, hence the suppression.
@SuppressWarnings("unchecked")
protected void map(KEYIN key, VALUEIN value,
Context context) throws IOException, InterruptedException {
context.write((KEYOUT) key, (VALUEOUT) value);
}
// Quoted excerpt: called once by run() after the input is exhausted;
// empty by default.
protected void cleanup(Context context
) throws IOException, InterruptedException {
// NOTHING
}
// Quoted excerpt: the task driver — setup() once, then map() for every
// key/value pair the context yields, then cleanup() once.
public void run(Context context) throws IOException, InterruptedException {
setup(context);
while (context.nextKeyValue()) {
map(context.getCurrentKey(), context.getCurrentValue(), context);
}
cleanup(context);
}
按照里面给的说明,我有了一个初步的理解:从run方法可以看出,setup是在每个map任务开始处理输入之前被调用一次(并不是Mapper类一被加载就调用),默认实现是空的,你不覆写它,它就什么也不做;
接下来是map函数,这个方法已经有了一个默认实现:context.write((KEYOUT) key, (VALUEOUT) value);也就是你输入什么键值对,它就原样输出什么(恒等映射),不过我们一般都要覆写这个方法,这个是Mapper的主要方法。
在所有键值对都处理完之后会执行一次cleanup方法,这个不覆写的话,里面也是什么都没有。
最后是run方法,它里面是这么说明的,Applications may override the {@link #run(Context)} method to exert greater control on map processing e.g. multi-threaded Mappers etc.看了这个源码之后才知道,原来Mapper里面真正执行的是run函数。
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.RawComparator;
import org.apache.hadoop.io.compress.CompressionCodec;
/**
 * Simplified excerpt of Hadoop's {@code Mapper}, which maps input key/value
 * pairs to intermediate key/value pairs.
 *
 * <p>Fix over the pasted version: the paste dropped the generic type
 * parameters, so {@code KEYIN}/{@code VALUEIN}/{@code KEYOUT}/{@code VALUEOUT}
 * were undeclared identifiers and the class could not compile. They are
 * declared here, matching the real Hadoop signature.
 */
public class Mapper<KEYIN, VALUEIN, KEYOUT, VALUEOUT> {

  /**
   * The context handed to {@link #map}; its constructor simply forwards the
   * task plumbing (reader, writer, committer, reporter, split) to the
   * {@code MapContext} superclass.
   */
  public class Context
      extends MapContext<KEYIN, VALUEIN, KEYOUT, VALUEOUT> {
    public Context(Configuration conf, TaskAttemptID taskid,
                   RecordReader<KEYIN, VALUEIN> reader,
                   RecordWriter<KEYOUT, VALUEOUT> writer,
                   OutputCommitter committer,
                   StatusReporter reporter,
                   InputSplit split) throws IOException, InterruptedException {
      super(conf, taskid, reader, writer, committer, reporter, split);
    }
  }

  /**
   * Called once at the start of the task (see {@link #run}), before any call
   * to {@link #map}. Empty by default; override to set up per-task state.
   */
  protected void setup(Context context
                       ) throws IOException, InterruptedException {
    // NOTHING
  }

  /**
   * Called once for each key/value pair in the input split. The default is
   * the identity mapping: each input pair is written unchanged to the output.
   * The casts are unchecked because KEYIN/KEYOUT (and VALUEIN/VALUEOUT) are
   * unrelated type parameters; the default only works when they coincide.
   */
  @SuppressWarnings("unchecked")
  protected void map(KEYIN key, VALUEIN value,
                     Context context) throws IOException, InterruptedException {
    context.write((KEYOUT) key, (VALUEOUT) value);
  }

  /**
   * Called once after all input pairs have been processed (see {@link #run}).
   * Empty by default; override to release per-task resources.
   */
  protected void cleanup(Context context
                         ) throws IOException, InterruptedException {
    // NOTHING
  }

  /**
   * Drives the map task: {@link #setup}, then {@link #map} for every
   * key/value pair in the split, then {@link #cleanup}. Applications may
   * override this for finer control, e.g. multi-threaded mappers.
   */
  public void run(Context context) throws IOException, InterruptedException {
    setup(context);
    while (context.nextKeyValue()) {
      map(context.getCurrentKey(), context.getCurrentValue(), context);
    }
    cleanup(context);
  }
}
以下是我看了源码之后的个人理解,很多地方我也不理解,第一次读源码,本身基础也不扎实,今后看了别人的说法之后还会渐渐补充,我也不知道对与错,希望看到的人对我的说法予以指正,感激不尽!
这段代码首先是一个内部类,
// Excerpt quoted from the class above: Context is an inner class whose
// constructor only forwards the task plumbing objects to MapContext.
public class Context
extends MapContext {
// NOTE(review): raw types here — the real Hadoop source declares
// Context extends MapContext<KEYIN, VALUEIN, KEYOUT, VALUEOUT>.
public Context(Configuration conf, TaskAttemptID taskid,
RecordReader reader,
RecordWriter writer,
OutputCommitter committer,
StatusReporter reporter,
InputSplit split) throws IOException, InterruptedException {
super(conf, taskid, reader, writer, committer, reporter, split);
}
}
这个类继承了MapContext,而MapContext又继承了TaskInputOutputContext,TaskInputOutputContext继承了TaskAttemptContext,TaskAttemptContext继承了JobContext,这几个类逐渐把context中的内容初始化了,具体的初始化以后再慢慢理解。
接下来是里面的4个方法体,
// Quoted excerpt: called once at the start of run(), before any map() call.
// The default implementation is intentionally empty.
protected void setup(Context context
) throws IOException, InterruptedException {
// NOTHING
}
// Quoted excerpt: the default map() is the identity mapping — each input
// pair is written straight to the output. The casts between the in/out type
// parameters are unchecked, hence the suppression.
@SuppressWarnings("unchecked")
protected void map(KEYIN key, VALUEIN value,
Context context) throws IOException, InterruptedException {
context.write((KEYOUT) key, (VALUEOUT) value);
}
// Quoted excerpt: called once by run() after the input is exhausted;
// empty by default.
protected void cleanup(Context context
) throws IOException, InterruptedException {
// NOTHING
}
// Quoted excerpt: the task driver — setup() once, then map() for every
// key/value pair the context yields, then cleanup() once.
public void run(Context context) throws IOException, InterruptedException {
setup(context);
while (context.nextKeyValue()) {
map(context.getCurrentKey(), context.getCurrentValue(), context);
}
cleanup(context);
}
按照里面给的说明,我有了一个初步的理解:从run方法可以看出,setup是在每个map任务开始处理输入之前被调用一次(并不是Mapper类一被加载就调用),默认实现是空的,你不覆写它,它就什么也不做;
接下来是map函数,这个方法已经有了一个默认实现:context.write((KEYOUT) key, (VALUEOUT) value);也就是你输入什么键值对,它就原样输出什么(恒等映射),不过我们一般都要覆写这个方法,这个是Mapper的主要方法。
在所有键值对都处理完之后会执行一次cleanup方法,这个不覆写的话,里面也是什么都没有。
最后是run方法,它里面是这么说明的,Applications may override the {@link #run(Context)} method to exert greater control on map processing e.g. multi-threaded Mappers etc.看了这个源码之后才知道,原来Mapper里面真正执行的是run函数。