hadoop 二次排序 group函数的作用的说明

最新推荐文章于 2022-08-29 23:30:00 发布

古巴与八股

最新推荐文章于 2022-08-29 23:30:00 发布

阅读量633

点赞数

分类专栏： hadoop 大数据 mapreduce

本文链接：https://blog.csdn.net/xuedingkai/article/details/78197354

版权

大数据同时被 3 个专栏收录

9 篇文章 0 订阅

订阅专栏

hadoop

6 篇文章 0 订阅

订阅专栏

mapreduce

2 篇文章 0 订阅

订阅专栏

hadoop mapreduce作业通过组合key实现二次排序的过程中，只要实现组合key的类就可以了。mapreduce框架本身会基于key对输出进行排序。

而partition函数只是为了在数据规模较大时，对map的输出进行分区，为启动多个reduce任务做准备。

group函数也是可有可无的。

group函数的作用是对key进行分组，例如对于map的结果：

[(k1, k21), v1]

[(k1, k22), v2]

[(k1, k23), v3]

通过设定group函数可以做到按组合key的k1进行分组：

[[(k1, k21), (k1, k22), (k1, k23)], [v1, v2, v3]]

下面验证结果记录：

输入文件file1：

输入文件file2：

实现的完整代码：

package hadoop;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.Cell;
import org.apache.hadoop.hbase.KeyValue.RawBytesComparator;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.RawComparator;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Partitioner;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;

public class DDSort {
    public static class Map extends Mapper<LongWritable, Text, DataPair, IntWritable>{
        @Override
        protected void map(
                LongWritable key,
                Text value,
                Mapper<LongWritable, Text, DataPair, IntWritable>.Context context)
                throws IOException, InterruptedException {
            String str = value.toString();
            String [] sz = str.split("\t");
            if(sz.length == 2){
                int v1 = Integer.parseInt(sz[0]);
                int v2 = Integer.parseInt(sz[1]);
                DataPair dp = new DataPair(v1, v2);
                //context.write(new IntWritable(v1), new IntWritable(v2));
                context.write(dp, new IntWritable(v2));
            }
        }
    }
    
    public static class DataPair implements WritableComparable<DataPair> {
        private int v1;
        private int v2;
        
        public DataPair(){
            
        }
        
        public DataPair(int arg1, int arg2){
            v1 = arg1;
            v2 = arg2;
        }
        
        public int getV1() {
            return v1;
        }

        public void setV1(int v1) {
            this.v1 = v1;
        }

        public int getV2() {
            return v2;
        }

        public void setV2(int v2) {
            this.v2 = v2;
        }
        
        @Override
        public String toString() {
            return new Integer(v1).toString() + " " + new Integer(v2).toString(); 
        }

        @Override
        public void readFields(DataInput in) throws IOException {
            // TODO Auto-generated method stub
            v1 = in.readInt();
            v2 = in.readInt();
        }

        @Override
        public void write(DataOutput out) throws IOException {
            // TODO Auto-generated method stub
            out.writeInt(v1);
            out.writeInt(v2);
        }

        @Override
        public int compareTo(DataPair o) {
            int tmp = v1 - o.getV1();
            if(tmp != 0){
                return tmp;
            }
            return v2 - o.getV2();
        }
        
        
        static {
            //WritableComparator.define(DataPair.class, new Comparator());
        }
    }
    
//    public class Comparator extends WritableComparator{
//        public Comparator() {
//            super(DataPair.class);
//        }
//        
//        @Override
//        public int compare(byte[] arg0, int arg1, int arg2, byte[] arg3,
//                int arg4, int arg5) {
//            // TODO Auto-generated method stub
//            return super.compare(arg0, arg1, arg2, arg3, arg4, arg5);
//        }     
//    }
    
    public static class Prt extends Partitioner<DataPair, IntWritable>{
        @Override
        public int getPartition(DataPair arg0, IntWritable arg1, int arg2) {
            
            return arg0.getV1()%arg2;
        }
    }
    
    public static class Grp implements RawComparator<DataPair>{
        @Override
        public int compare(DataPair o1, DataPair o2) {
            return o1.v1 - o2.v1;
        }
        
        @Override
        public int compare(byte[] arg0, int arg1, int arg2, byte[] arg3,
                int arg4, int arg5) {

            return WritableComparator.compareBytes(arg0, arg1, 4, arg3, arg4, 4);
        }
    }
    
    public static class Grp1 extends WritableComparator{

        protected Grp1() {
            super(DataPair.class);
        }
        
        @Override
        public int compare(WritableComparable a, WritableComparable b) {
            DataPair d1 = (DataPair)a;
            DataPair d2 = (DataPair)b;
            return Integer.compare(d1.getV1(), d2.getV1());
        }
    }
    
    public static class Reduce extends Reducer<DataPair, IntWritable, DataPair, IntWritable>{
        @Override
        protected void reduce(
                DataPair key,
                Iterable<IntWritable> value,
                Reducer<DataPair, IntWritable, DataPair, IntWritable>.Context context)
                throws IOException, InterruptedException {
            //context.write(key, new IntWritable(1));
            int test = 1000; //for test
            for(IntWritable i : value){
                context.write(key, i);
            }
            System.out.println(++test); //for test
            context.write(key, new IntWritable(test)); //for test
        }
    }
    
    private static String inputPath = "in-ddsort";
    private static String outputPath = "out-ddsort";
    
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        String[] otherArgs = new GenericOptionsParser(conf, args)
                .getRemainingArgs();
        
        Job job = new Job(conf, "DDSort");
        job.setJarByClass(DDSort.class);
        job.setMapperClass(Map.class);
        job.setReducerClass(Reduce.class);
        
        job.setPartitionerClass(Prt.class);
        job.setNumReduceTasks(6);
        //job.setGroupingComparatorClass(Grp.class);
        
        job.setMapOutputKeyClass(DataPair.class);
        job.setMapOutputValueClass(IntWritable.class);
        job.setOutputKeyClass(DataPair.class);
        job.setOutputValueClass(IntWritable.class);
        
        
        FileSystem fs = FileSystem.get(conf);
        Path outPath = new Path(outputPath);
        if (fs.exists(outPath)) {
            fs.delete(outPath, true);
        }
        
        FileInputFormat.addInputPath(job, new Path(inputPath));
        FileOutputFormat.setOutputPath(job, new Path(outputPath));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}

1、不配置partition和group函数的情况：

	/**
	 * Scenario 1: runs DDSort with neither a custom partitioner nor a grouping
	 * comparator.
	 *
	 * @param args generic Hadoop options followed by {@code <in> <out>}
	 */
	public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        String[] otherArgs = new GenericOptionsParser(conf, args)
                .getRemainingArgs();
        if (otherArgs.length != 2) {
            // Name this program in the usage text (it previously said "wordcount").
            System.err.println("Usage: DDSort <in> <out>");
            System.exit(2);
        }
        Job job = new Job(conf, "DDSort");
        job.setJarByClass(DDSort.class);
        job.setMapperClass(Map.class);
        job.setReducerClass(Reduce.class);
        
        // Partitioner, reducer count and grouping comparator are all disabled
        // in this scenario.
        //job.setPartitionerClass(Prt.class);
        //job.setNumReduceTasks(6);
        //job.setGroupingComparatorClass(Grp.class);
        
        job.setMapOutputKeyClass(DataPair.class);
        job.setMapOutputValueClass(IntWritable.class);
        job.setOutputKeyClass(DataPair.class);
        job.setOutputValueClass(IntWritable.class);
        FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
        FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }

对应的输出结果为只有一个输出文件：

-rw-r--r--   3 root supergroup          0 2017-10-10 21:02 /user/root/out-ddsort21/_SUCCESS
drwxr-xr-x   - root supergroup          0 2017-10-10 21:01 /user/root/out-ddsort21/_logs
-rw-r--r--   3 root supergroup         78 2017-10-10 21:02 /user/root/out-ddsort21/part-r-00000
part-r-00000内容为：

1    2   2
1    2   1001
1    4   4
1    4   1001
1    9   9
1    9   1001
2    1   1
2    1   1001
2    5   5
2    5   1001
2    6   6
2    6   1001
3    1   1
3    1   1001
4    1   1
4    1   1001
4    3    3
4    3   1001
5    5   5
5    5   1001
6    1   1
6    1   1001
6    4   4
6    4   1001
6    8   8
6    8   1001

可见每一条[(k1, k21), v1]记录各自成为一组（每个组合key单独调用一次reduce）。

2、只配置partition

	/**
	 * Scenario 2: runs DDSort with the custom partitioner and six reduce
	 * tasks, but without a grouping comparator.
	 *
	 * @param args generic Hadoop options followed by {@code <in> <out>}
	 */
	public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        String[] otherArgs = new GenericOptionsParser(conf, args)
                .getRemainingArgs();
        if (otherArgs.length != 2) {
            // Name this program in the usage text (it previously said "wordcount").
            System.err.println("Usage: DDSort <in> <out>");
            System.exit(2);
        }
        Job job = new Job(conf, "DDSort");
        job.setJarByClass(DDSort.class);
        job.setMapperClass(Map.class);
        job.setReducerClass(Reduce.class);
        
        // Partition by v1 across six reducers; grouping stays at the default.
        job.setPartitionerClass(Prt.class);
        job.setNumReduceTasks(6);
        //job.setGroupingComparatorClass(Grp.class);
        
        job.setMapOutputKeyClass(DataPair.class);
        job.setMapOutputValueClass(IntWritable.class);
        job.setOutputKeyClass(DataPair.class);
        job.setOutputValueClass(IntWritable.class);
        FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
        FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }

对应的输出结果为：

Found 8 items
-rw-r--r--   3 root supergroup          0 2017-10-10 21:34 /user/root/out-ddsort22/_SUCCESS
drwxr-xr-x   - root supergroup          0 2017-10-10 21:33 /user/root/out-ddsort22/_logs
-rw-r--r--   3 root supergroup         18 2017-10-10 21:33 /user/root/out-ddsort22/part-r-00000
-rw-r--r--   3 root supergroup         18 2017-10-10 21:33 /user/root/out-ddsort22/part-r-00001
-rw-r--r--   3 root supergroup         18 2017-10-10 21:33 /user/root/out-ddsort22/part-r-00002
-rw-r--r--   3 root supergroup          6 2017-10-10 21:33 /user/root/out-ddsort22/part-r-00003
-rw-r--r--   3 root supergroup         12 2017-10-10 21:33 /user/root/out-ddsort22/part-r-00004
-rw-r--r--   3 root supergroup          6 2017-10-10 21:33 /user/root/out-ddsort22/part-r-00005

part-r-00000：

6    1   1
6    1   1001
6    4   4
6 4   1001
6    8   8
6    8   1001

part-r-00001：

1    2   2
1    2   1001
1    4   4
1    4   1001
1    9   9
1    9    1001

part-r-00002：

2    1   1
2    1   1001
2    5   5
2    5   1001
2    6   6
2    6   1001

.......

可见每一条[(k1, k21), v1]记录仍然各自成为一组，partition只决定记录进入哪个reduce任务（输出文件）。

3、配置partition和group

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        String[] otherArgs = new GenericOptionsParser(conf, args)
                .getRemainingArgs();
        if (otherArgs.length != 2) {
            System.err.println("Usage: wordcount <in> <out>");
            System.exit(2);
        }
        Job job = new Job(conf, "DDSort");
        job.setJarByClass(DDSort.class);
        job.setMapperClass(Map.class);
        job.setReducerClass(Reduce.class);
        
        job.setPartitionerClass(Prt.class);
        job.setNumReduceTasks(6);
        job.setGroupingComparatorClass(Grp.class);
        
        job.setMapOutputKeyClass(DataPair.class);
        job.setMapOutputValueClass(IntWritable.class);
        job.setOutputKeyClass(DataPair.class);
        job.setOutputValueClass(IntWritable.class);
        FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
        FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }

输出结果：

Found 8 items
-rw-r--r--   3 root supergroup          0 2017-10-10 22:04 /user/root/out-ddsort23/_SUCCESS
drwxr-xr-x   - root supergroup          0 2017-10-10 22:03 /user/root/out-ddsort23/_logs
-rw-r--r--   3 root supergroup         18 2017-10-10 22:03 /user/root/out-ddsort23/part-r-00000
-rw-r--r--   3 root supergroup         18 2017-10-10 22:03 /user/root/out-ddsort23/part-r-00001
-rw-r--r--   3 root supergroup         18 2017-10-10 22:03 /user/root/out-ddsort23/part-r-00002
-rw-r--r--   3 root supergroup          6 2017-10-10 22:03 /user/root/out-ddsort23/part-r-00003
-rw-r--r--   3 root supergroup         12 2017-10-10 22:03 /user/root/out-ddsort23/part-r-00004
-rw-r--r--   3 root supergroup          6 2017-10-10 22:03 /user/root/out-ddsort23/part-r-00005

part-r-00000：

6    1   1
6    4   4
6    8   8
6    8   1001

part-r-00001：

1    2   2
1    4   4
1    9   9
1    9   1001

part-r-00002：

2    1   1
2    5   5
2    6   6
2    6   1001

.......

可见k1相同的属于一组，以下的属于同一组：

[(k1, k21), v1]

[(k1, k22), v2]

[(k1, k23), v3]

这就是group函数的作用。

注意：

    public static class Reduce extends Reducer<DataPair, IntWritable, DataPair, IntWritable>{
        @Override
        protected void reduce(
                DataPair key,
                Iterable<IntWritable> value,
                Reducer<DataPair, IntWritable, DataPair, IntWritable>.Context context)
                throws IOException, InterruptedException {
            //context.write(key, new IntWritable(1));
            int test = 1000; //for test
            for(IntWritable i : value){ //只迭代了value
                context.write(key, i);
            }
            System.out.println(++test); //for test
            context.write(key, new IntWritable(test)); //for test
        }
    }

此处只迭代了value，可从输出来看，for循环中key的值也发生了变化!?

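原因是：Hadoop 在 reduce 阶段会复用同一个 key 对象——values 迭代器每前进一步，框架就把当前记录的 key 反序列化到这个对象里。因此在设置了 group 函数、同一组内包含多个不同的组合 key 时，循环中 key 的内容会随着迭代而变化。如有补充或不同看法，欢迎留言，谢谢。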

古巴与八股

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
hadoop 二次排序 group函数的作用的说明

hadoop mapreduce作业通过组合key实现二次排序的过程中，只要实现组合key的类就可以了。mapreduce框架本身会基于key对输出进行排序。而partion函数只为了是实现数据规模较大时，对map的输出实现分区。为启动多个reduce任务做准备。group函数也是可有可无的。group函数的作用是对key进行分组，例如对于map的结果：[(k1, k21), v1
复制链接

扫一扫

专栏目录