https://blog.csdn.net/groovy2007/article/details/44408583
《hadoop权威指南》里的例子:有许多气象记录,需要找出每年的最高温度,通过secondary sort实现。(这个任务其实没有必要使用secondary sort,这里只是为了演示)
map-reduce的shuffle阶段,只会根据key进行排序,而同一个key的value是无序的,所以要把年份和温度都放在key里面。map的输出:key为year-temperature pair,value为null。job的partitioner设为根据年份进行hash(将同一年份的数据发送到同一个reducer)。job的sortComparator根据年份和温度进行比较。job的groupingComparator只根据年份进行比较。完整代码在此。
reducer的代码很简单,只有一行:将key写出即可。输出就是每个年份以及当年的最高气温。
- // Secondary-sort reducer: the sort comparator ordered the (year, temperature) keys so
- // that the first key of each year-group carries that year's maximum temperature, and the
- // grouping comparator makes one reduce() call per year -- so emitting the group's key is
- // all the work needed. The NullWritable values are never consumed.
- void reduce(IntPair key, Iterable<NullWritable> values, Context context) {
- context.write(key, NullWritable.get());
- }
groupingComparator的作用是什么?书中对此语焉不详,只说将相同年份的数据分到一组。网上的解释也大多是错误的。
误解:reducer会根据groupingComparator对数据进行排序。略一思考就会发现这不合逻辑,如果reduce需要排序的话,那么map阶段的排序就没有必要了,反正到了reducer这里还会被打乱。
hadoop官方文档对groupingComparator的解释是:controls which keys are grouped together for a single call to Reducer.reduce()。那么到底是如何control的呢?看下面的例子。样本数据如下,第一列是key,第二列是value。
1 12 4 41 5 52 8 82 1 11 1 13 8 83 8 81 5 51 5 53 4 43 4 42
比较一下三种操作的输出:
不指定groupingComparator: 1:12,11,13, 4:41,43,42, 5:52,51,53, 8:82,83,81, 指定Less5GroupComparator,即将小于5的key分为一组,其余一组: 1:12,11,13,41,43,42, 5:52,51,53,82,83,81, 指定Sum9GroupComparator,即相加等于9的key分为一组: 1:12,11,13, 4:41,43,42,52,51,53, 8:82,83,81,
代码如下:
- import java.io.IOException;
- import org.apache.hadoop.fs.Path;
- import org.apache.hadoop.io.IntWritable;
- import org.apache.hadoop.io.LongWritable;
- import org.apache.hadoop.io.Text;
- import org.apache.hadoop.io.WritableComparable;
- import org.apache.hadoop.io.WritableComparator;
- import org.apache.hadoop.mapreduce.Job;
- import org.apache.hadoop.mapreduce.Mapper;
- import org.apache.hadoop.mapreduce.Reducer;
- import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
- import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
- public class GroupingComparatorExample {
- public static class MyMapper
- extends Mapper {
- public void map(LongWritable key, Text value, Context context)
- throws IOException, InterruptedException {
- String[] fields = value.toString().split(" ");
- if(fields.length==2) {
- int k = Integer.parseInt(fields[0]);
- int v = Integer.parseInt(fields[1]);
- context.write(new IntWritable(k), new IntWritable(v));
- }
- }
- }
- public static class MyReducer
- extends Reducer {
- public void reduce(IntWritable key, Iterable values, Context context)
- throws IOException, InterruptedException {
- System.out.print(key+":");
- for(IntWritable v : values)
- System.out.print(v+",");
- System.out.println();
- }
- }
- public static class Sum9GroupComparator extends WritableComparator {
- protected Sum9GroupComparator() {
- super(IntWritable.class, true);
- }
- public int compare(WritableComparable w1, WritableComparable w2) {
- IntWritable i1 = (IntWritable) w1;
- IntWritable i2 = (IntWritable) w2;
- if(i1.get()+i2.get() == 9) return 0;
- return i1.compareTo(i2);
- }
- }
- public static class Less5GroupComparator extends WritableComparator {
- protected Less5GroupComparator() {
- super(IntWritable.class, true);
- }
- public int compare(WritableComparable w1, WritableComparable w2) {
- int i1 = ((IntWritable) w1).get();
- int i2 = ((IntWritable) w2).get();
- if(i1<5 && i2<5) return 0;
- if(i1>=5 && i2>=5) return 0;
- return Integer.compare(i1, i2);
- }
- }
- public static void main(String[] args) throws Exception {
- if (args.length != 2) {
- System.err.println("Usage: GroupingComparatorExample <input>");
- System.exit(-1);
- }
- Job job = new Job();
- job.setJarByClass(GroupingComparatorExample.class);
- job.setJobName("GroupingComparatorExample");
- FileInputFormat.addInputPath(job, new Path(args[0]));
- FileOutputFormat.setOutputPath(job, new Path(args[1]));
- job.setMapperClass(MyMapper.class);
- job.setReducerClass(MyReducer.class);
- job.setOutputKeyClass(IntWritable.class);
- job.setOutputValueClass(IntWritable.class);
- //job.setGroupingComparatorClass(Less5GroupComparator.class);
- //job.setGroupingComparatorClass(Sum9GroupComparator.class);
- System.exit(job.waitForCompletion(true) ? 0 : 1);
- }
- }
从第一组输出可以看到:数据已经按key排序,同一个key里的value是无序的。另外可以推测,map端shuffle使用的排序算法是稳定的,因为同一个key的values的顺序与输入相同。
第二组输出已经正确地将数据分成了两组,并且每一组的key是该组里面key的第一个值。(即key为1和5,而不是4和8)
第三组,key为4和5的已经正确地归为了一组,但是1和8呢?它们相加同样等于9,却没有被归为一组。实际上groupingComparator并不进行任何排序操作,只是依次取出reducer收到的key-value对,然后比较当前key与前一个key(即排序后相邻的两个key),如果比较的结果为0,就认为是同一个group,对同一个group里的数据进行一次reduce调用。由于1和8在排序后并不相邻,它们之间从未被比较过,所以不会被归为一组。
org.apache.hadoop.mapreduce.Reducer的实现已经明白无误地说明了这一点:
- //The following code is from hadoop-2.4.1
- //Main body of org.apache.hadoop.mapreduce.Reducer.run()
- public void run(Context context) throws IOException, InterruptedException {
- setup(context);
- try {
- //One reduce() call per key group; nextKey() advances to the start of the next group.
- while (context.nextKey()) {
- reduce(context.getCurrentKey(), context.getValues(), context);
- }
- } finally {
- //cleanup() runs even if reduce() throws
- cleanup(context);
- }
- }
- //context.getValues() is traversed with the following iterator
- //ReduceContextImpl.ValueIterator
- protected class ValueIterator implements ReduceContext.ValueIterator {
- public boolean hasNext() {
- //The group continues as long as the NEXT key compares equal to the current one
- //under the grouping comparator (see nextKeyIsSame in nextKeyValue() below).
- return firstValue || nextKeyIsSame;
- }
- public VALUEIN next() {
- // if this is the first record, we don't need to advance
- if (firstValue) {
- firstValue = false;
- return value;
- }
- // if this isn't the first record and the next key is different, they
- // can't advance it here.
- if (!nextKeyIsSame) {
- throw new NoSuchElementException("iterate past last value");
- }
- // otherwise, go to the next key/value pair
- nextKeyValue();
- return value;
- }
- }
- //The comparator here is the one set via job.setGroupingComparator()
- //org.apache.hadoop.mapreduce.task.ReduceContextImpl.nextKeyValue()
- public boolean nextKeyValue() throws IOException, InterruptedException {
- //... (snippet abridged -- the full method also deserializes the pair and returns a boolean)
- hasMore = input.next();
- if (hasMore) {
- nextKey = input.getKey();
- //A group boundary is detected by comparing ONLY the current key with the next key
- //in sorted order (raw bytes) -- no re-sorting and no global grouping ever happens.
- nextKeyIsSame = comparator.compare(currentRawKey.getBytes(), 0, currentRawKey.getLength(),
- nextKey.getData(), nextKey.getPosition(), nextKey.getLength() - nextKey.getPosition()
- ) == 0;
- } else {
- nextKeyIsSame = false;
- }
- inputValueCounter.increment(1);
- }
另外,如果不设置groupingComparator的话,使用的就是map阶段排序用的comparator。
- // org.apache.hadoop.mapreduce.Job
- //NOTE(review): this method appears to live in org.apache.hadoop.mapred.JobConf,
- //not Job -- verify against the 2.4.1 source tree.
- //Falls back to the map-side sort comparator when no grouping comparator was set.
- public RawComparator getOutputValueGroupingComparator() {
- Class<? extends RawComparator> theClass = getClass(
- JobContext.GROUP_COMPARATOR_CLASS, null, RawComparator.class);
- if (theClass == null) {
- return getOutputKeyComparator();
- }
- return ReflectionUtils.newInstance(theClass, this);
- }
package com.lyl.hello;

import java.io.IOException;
import java.util.Arrays;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

/**
 * Runnable local-mode variant of the grouping-comparator demo: reads and writes
 * fixed file:// paths and enables Sum9GroupComparator.
 */
public class GroupingComparatorExample {

    /** Parses "key value" lines into (IntWritable, IntWritable) pairs. */
    public static class MyMapper
            extends Mapper<LongWritable, Text, IntWritable, IntWritable> {
        @Override
        public void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            String[] fields = value.toString().split(" ");
            // Fixed: printing the array directly showed only its identity hash code.
            System.out.println("fields is :" + Arrays.toString(fields));
            if (fields.length == 2) {
                int k = Integer.parseInt(fields[0]);
                int v = Integer.parseInt(fields[1]);
                context.write(new IntWritable(k), new IntWritable(v));
            }
        }
    }

    /**
     * Prints "currentKey&value," for every value; since the key object is updated as the
     * iterator advances, this exposes which underlying keys were merged into one group.
     */
    public static class MyReducer
            extends Reducer<IntWritable, IntWritable, IntWritable, IntWritable> {
        @Override
        public void reduce(IntWritable key, Iterable<IntWritable> values, Context context)
                throws IOException, InterruptedException {
            System.out.print(key + ":");
            for (IntWritable v : values)
                System.out.print(key + "&" + v + ",");
            System.out.println();
        }
    }

    /** Treats two keys as equal when they sum to 9 (only adjacent sorted keys are compared). */
    public static class Sum9GroupComparator extends WritableComparator {
        protected Sum9GroupComparator() {
            super(IntWritable.class, true);
        }
        @Override
        public int compare(WritableComparable w1, WritableComparable w2) {
            IntWritable i1 = (IntWritable) w1;
            IntWritable i2 = (IntWritable) w2;
            if (i1.get() + i2.get() == 9) return 0;
            return i1.compareTo(i2);
        }
    }

    /** Groups keys below 5 together and keys >= 5 together. */
    public static class Less5GroupComparator extends WritableComparator {
        protected Less5GroupComparator() {
            super(IntWritable.class, true);
        }
        @Override
        public int compare(WritableComparable w1, WritableComparable w2) {
            int i1 = ((IntWritable) w1).get();
            int i2 = ((IntWritable) w2).get();
            if (i1 < 5 && i2 < 5) return 0;
            if (i1 >= 5 && i2 >= 5) return 0;
            return Integer.compare(i1, i2);
        }
    }

    /** Configures and runs the job in local mode against the hard-coded paths below. */
    public void run() throws IOException, InterruptedException, ClassNotFoundException {
        String inputPath = "file:///Users/user/Documents/LYL/GroupingComparatorExample.txt";
        // Fixed typo: was "outpuPath".
        String outputPath = "file:///Users/user/Documents/LYL/GroupingComparatorExample_out";
        System.out.println("inpath is :" + inputPath);
        Job job = Job.getInstance();  // replaces the deprecated new Job()
        job.setJarByClass(GroupingComparatorExample.class);
        job.setJobName("GroupingComparatorExample");
        FileInputFormat.addInputPath(job, new Path(inputPath));
        FileOutputFormat.setOutputPath(job, new Path(outputPath));
        job.setMapperClass(MyMapper.class);
        job.setReducerClass(MyReducer.class);
        job.setOutputKeyClass(IntWritable.class);
        job.setOutputValueClass(IntWritable.class);
        //job.setGroupingComparatorClass(Less5GroupComparator.class);
        job.setGroupingComparatorClass(Sum9GroupComparator.class);
        System.out.println("*********** :");
        job.waitForCompletion(true);
    }

    public static void main(String[] args) throws Exception {
        // Paths are hard-coded in run(), so no CLI arguments are required here.
        GroupingComparatorExample gce = new GroupingComparatorExample();
        gce.run();
    }
}