Map
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
public class MyMapper extends Mapper<LongWritable, Text, LongWritable, NullWritable> {

    private long max = Long.MIN_VALUE;

    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws java.io.IOException, InterruptedException {
        long current = Long.parseLong(value.toString());
        if (current > max) {
            max = current;
        }
    }

    // Runs once when the mapper finishes. Because map() is called once for every input line,
    // we must not call context.write() inside map(); only the local maximum is emitted here.
    @Override
    protected void cleanup(Context context)
            throws java.io.IOException, InterruptedException {
        context.write(new LongWritable(max), NullWritable.get());
    }
}
Note that both the Mapper and Reducer classes provide a cleanup method, which the framework calls exactly once after the map or reduce task has finished processing all of its input.
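To see where cleanup fits in, this is roughly how the framework drives a Mapper (a simplified sketch of Mapper.run() from the org.apache.hadoop.mapreduce API; the Reducer follows the same setup/reduce/cleanup pattern):

public void run(Context context) throws IOException, InterruptedException {
    setup(context);                       // called once, before the first record
    try {
        while (context.nextKeyValue()) {  // one map() call per input record
            map(context.getCurrentKey(), context.getCurrentValue(), context);
        }
    } finally {
        cleanup(context);                 // called once, after the last record
    }
}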
Reduce
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.Reducer;
/**
 * Because the reducer receives the output of every map task (each mapper's local maximum),
 * it has to compare those values again to find the overall maximum.
 */
public class MyReduce extends Reducer<LongWritable, NullWritable, LongWritable, NullWritable> {

    private long max = Long.MIN_VALUE;

    @Override
    protected void reduce(LongWritable key, Iterable<NullWritable> values, Context context)
            throws java.io.IOException, InterruptedException {
        long current = key.get();
        if (current > max) {
            max = current;
        }
    }

    // reduce() is called once per distinct key coming from the mappers, so the overall
    // maximum is only known after every reduce() call has finished; cleanup() then emits it.
    @Override
    protected void cleanup(Context context)
            throws java.io.IOException, InterruptedException {
        context.write(new LongWritable(max), NullWritable.get());
    }
}
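As a concrete illustration of why the reducer must compare again, here is a minimal plain-Java sketch (no Hadoop involved, with hypothetical per-mapper results) of what the single reduce task effectively does:

public class MaxFlowSketch {
    public static void main(String[] args) {
        // Hypothetical local maxima emitted by three map tasks' cleanup() methods
        long[] mapperOutputs = {42L, 97L, 13L};
        long max = Long.MIN_VALUE;
        for (long key : mapperOutputs) {   // one reduce() call per key
            if (key > max) {
                max = key;
            }
        }
        System.out.println(max);           // cleanup() would write 97 as the final result
    }
}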
Driver class
import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.mapreduce.lib.partition.HashPartitioner;
public class TestMaxNum {

    private static final String INPUT_PATH = "hdfs://xxc:9000/input";
    private static final String OUT_PATH = "hdfs://xxc:9000/out";

    public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException, URISyntaxException {
        Configuration conf = new Configuration();

        // Delete the output directory if it already exists, otherwise the job refuses to start
        FileSystem fileSystem = FileSystem.get(new URI(INPUT_PATH), conf);
        Path outPath = new Path(OUT_PATH);
        if (fileSystem.exists(outPath)) {
            fileSystem.delete(outPath, true);
        }

        Job job = new Job(conf, TestMaxNum.class.getSimpleName());

        // 1.1 Specify the input path
        FileInputFormat.setInputPaths(job, INPUT_PATH);
        // Specify how the input files are parsed: each line becomes one <key, value> pair
        job.setInputFormatClass(TextInputFormat.class);

        // 1.2 Specify the custom Mapper class
        job.setMapperClass(MyMapper.class);
        // Specify the map output <k, v> types. If <k3, v3> has the same types as <k2, v2>, these calls can be omitted
        job.setMapOutputKeyClass(LongWritable.class);
        job.setMapOutputValueClass(NullWritable.class);

        // 1.3 Partitioning
        job.setPartitionerClass(HashPartitioner.class);
        // Run a single reduce task
        job.setNumReduceTasks(1);

        // 1.4 TODO sorting and grouping
        // 1.5 Combiner: the Reducer doubles as the combiner
        job.setCombinerClass(MyReduce.class);

        // 2.2 Specify the custom Reducer class
        job.setReducerClass(MyReduce.class);
        // Specify the reduce output types
        job.setOutputKeyClass(LongWritable.class);
        job.setOutputValueClass(NullWritable.class);

        // 2.3 Specify the output path
        FileOutputFormat.setOutputPath(job, new Path(OUT_PATH));
        job.setOutputFormatClass(TextOutputFormat.class);

        // Submit the job to the JobTracker and wait for it to finish
        job.waitForCompletion(true);
    }
}
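On newer Hadoop releases (2.x and later), the new Job(conf, ...) constructor is deprecated and drivers are usually written with Job.getInstance plus ToolRunner. A hedged sketch of the same driver in that style, with a hypothetical class name MaxNumDriver and the assumption that the input and output paths are passed as args[0] and args[1]:

import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

public class MaxNumDriver extends Configured implements Tool {

    @Override
    public int run(String[] args) throws Exception {
        // Assumed calling convention: args[0] = input path, args[1] = output path
        Job job = Job.getInstance(getConf(), MaxNumDriver.class.getSimpleName());
        job.setJarByClass(MaxNumDriver.class);

        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        job.setMapperClass(MyMapper.class);
        job.setCombinerClass(MyReduce.class);
        job.setReducerClass(MyReduce.class);
        job.setNumReduceTasks(1);

        job.setOutputKeyClass(LongWritable.class);
        job.setOutputValueClass(NullWritable.class);

        // Return a proper exit code instead of ignoring the job result
        return job.waitForCompletion(true) ? 0 : 1;
    }

    public static void main(String[] args) throws Exception {
        System.exit(ToolRunner.run(new MaxNumDriver(), args));
    }
}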