MapReduce实现二次排序及GroupingComparator的误区

最新推荐文章于 2020-06-30 15:29:14 发布

some_321

最新推荐文章于 2020-06-30 15:29:14 发布

阅读量379

点赞数

分类专栏： MapReduce 文章标签：大数据 java

MapReduce 专栏收录该内容

6 篇文章 0 订阅

订阅专栏

之前在网上看到了一篇使用MapReduce实现二次排序的博客，自己尝试实现了，并测试成功，代码有所改动。链接如下：

http://blog.csdn.net/zyj8170/article/details/7530728

所谓的二次排序：对Key和Val都进行排序（比如升序），并输出。对Key的自动排序，MapReduce可以替我们解决，但是同时对Val进行排序，则需要其他的做法。

做法一：对每个Key的所有Val，添加到ArrayList，使用Collections.sort方法进行排序，虽然能够实现，但是存在隐患，如果key的val值很多，那么对每个Key的ArrayList消耗的内存就很大，效率不高；

做法二：定制MapReduce的IO类型，定制GroupingComparator类，实现二次排序，通过定制，将默认的MR执行的方式改成自定义的，效率比较高，易于扩展；

现在开始实现做法二。

一、准备待排序的数据（数据跟原博客一样），见第三部分的结果对比：

二、代码如下

package com;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.*;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.util.StringTokenizer;

public class SecondSort extends Configured implements Tool {

//自定义的类型，（参见本博客的MapReduce定制类型）
    static class IntPair implements WritableComparable<IntPair> {
        private int a;
        private int b;

        public IntPair() {
            a = 0;
            b = 0;
        }

        public int getA() {
            return a;
        }

        public void setA(int a) {
            this.a = a;
        }

        public int getB() {
            return b;
        }

        public void setB(int b) {
            this.b = b;
        }

        public void set(int a, int b) {
            this.a = a;
            this.b = b;
        }

        @Override
        public int compareTo(IntPair o) {
            if (this.a == o.a) {
                if (this.b == o.b)
                    return 0;
                else
                    return this.b > o.b ? 1 : -1;
            } else
                return this.a > o.a ? 1 : -1;
        }

        @Override
        public void write(DataOutput dataOutput) throws IOException {
            dataOutput.writeInt(a);
            dataOutput.writeInt(b);
        }

        @Override
        public void readFields(DataInput dataInput) throws IOException {
            a = dataInput.readInt();
            b = dataInput.readInt();
        }
    }

    static class Map extends Mapper<LongWritable, Text, IntPair, IntWritable> {
        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            StringTokenizer strTok = new StringTokenizer(value.toString());
            int a = Integer.parseInt(strTok.nextToken());
            int b = Integer.parseInt(strTok.nextToken());
            IntPair mykey = new IntPair();
            mykey.set(a, b);
            context.write(mykey, new IntWritable(b));
        }
    }

    static class MyKeyGroupComparator extends WritableComparator {
        MyKeyGroupComparator() {
            super(IntPair.class, true);
        }

        @Override
        public int compare(WritableComparable a, WritableComparable b) {
            IntPair ip1 = (IntPair) a;
            IntPair ip2 = (IntPair) b;
            if (ip1.a == ip2.a)
                return 0;
            else
                return ip1.a > ip2.a ? 1 : -1;
        }
    }

    static class Reduce extends Reducer<IntPair, IntWritable, IntWritable, IntWritable> {
        @Override
        protected void reduce(IntPair key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
            IntWritable myKey = new IntWritable(key.getA());
          //显式的分隔分组，便于查看
            context.write(new IntWritable(999999999), null);
            for(IntWritable i :values){
                context.write(myKey, i);
            }
        }
    }

    @Override
    public int run(String[] strings) throws Exception {
        //path是HDFS的路径字符串
        String path = "/my/inputTest/Test_SecondSort.txt";
        if (strings.length != 1) {
            System.out.println("input:" + path);
            System.out.print("arg:<out>");
            return 1;
        }
        Configuration conf = getConf();
        Job job = new Job(conf, "SecondSort");
        job.setJarByClass(SecondSort.class);
        job.setMapperClass(Map.class);
        job.setReducerClass(Reduce.class);
        job.setGroupingComparatorClass(MyKeyGroupComparator.class);

        job.setMapOutputKeyClass(IntPair.class);
        job.setMapOutputValueClass(IntWritable.class);
        job.setOutputKeyClass(IntWritable.class);
        job.setOutputValueClass(IntWritable.class);

        job.setNumReduceTasks(1);

        FileInputFormat.addInputPath(job, new Path(path));
        FileOutputFormat.setOutputPath(job, new Path(strings[0]));

        boolean success = job.waitForCompletion(true);
        return success ? 0 : 1;

    }

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        int rst = ToolRunner.run(conf, new SecondSort(), args);
        System.exit(rst);
    }
}

三、见结果对比表，结果为结果数据1（999999999是Group分隔符）：

四、如果将Group的标准从根据Inpair的a的值判断改成b的值判断，Group部分代码修改如下（其余不变）：

    static class MyKeyGroupComparator extends WritableComparator {
        MyKeyGroupComparator() {
            super(IntPair.class, true);
        }

        @Override
        public int compare(WritableComparable a, WritableComparable b) {
            IntPair ip1 = (IntPair) a;
            IntPair ip2 = (IntPair) b;
//            if (ip1.a == ip2.a)
//                return 0;
//            else
//                return ip1.a > ip2.a ? 1 : -1;
            if (ip1.b == ip2.b)
                return 0;
            else
                return ip1.b > ip2.b ? 1 : -1;
        }
    }

结果见结果对比表的结果数据2.

五、结果对比表：

原始数据

结果数据1

结果数据2

结果说明

20 21

50 51

50 52

50 53

50 54

60 51