MapReduce -- Custom Global Sort and Partition Sort
1 Sort
- Sort happens after Map and before Reduce.
- Data processed by MapReduce first goes through the Map phase and then through the Shuffle phase.
- In the Shuffle phase, the data is spilled (spill), partitioned (partitioner), sorted (sort), combined (combine), compressed (compress), and grouped (group) before it reaches the Reduce side.
- A global sort is similar to ORDER BY in SQL: there is only one reduce output, which guarantees a total order.
- A partition (local) sort guarantees order within each partition.
- A secondary sort orders records by multiple criteria, as sketched right after this list.
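As a quick illustration of a multi-criteria comparison, here is a minimal plain-Java sketch; the TrafficKey class and its fields are hypothetical and not part of the walkthrough below. The first criterion decides unless it ties, then the next one breaks the tie:

public class TrafficKey implements Comparable<TrafficKey> {

    private final String phone;
    private final long sum;

    public TrafficKey(String phone, long sum) {
        this.phone = phone;
        this.sum = sum;
    }

    @Override
    public int compareTo(TrafficKey o) {
        // Primary criterion: total traffic, descending
        int bySum = Long.compare(o.sum, this.sum);
        if (bySum != 0) {
            return bySum;
        }
        // Secondary criterion: phone number, ascending
        return this.phone.compareTo(o.phone);
    }
}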
2 Custom MapReduce Global Sort
2.1 Requirement
Field 1: phone number
Field 2: upstream traffic
Field 3: downstream traffic
Field 4: total traffic
Sort the records by total traffic (descending, as the results below show).
2.2 Data
13480253104 180 180 360
13502468823 7335 110349 117684
13560436666 1116 954 2070
13560439658 2034 5892 7926
13602846565 1938 2910 4848
13660577991 6960 690 7650
13719199419 240 0 240
13726230503 2481 24681 27162
13726238888 12481 44681 57162
13760778710 120 120 240
13826544101 264 0 264
13922314466 3008 3720 6728
13925057413 11058 48243 59301
13926251106 240 0 240
13926435656 132 1512 1644
15013685858 3659 3538 7197
15920133257 3156 2936 6092
15989002119 1938 180 2118
18211575961 1527 2106 3633
18320173382 9531 2412 11943
84138413 4116 1432 5548
2.3 Code
2.3.1 AllSortDriver Code
package com.xk.bigata.hadoop.mapreduce.sort;

import com.xk.bigata.hadoop.mapreduce.domain.AccessAllSortDomain;
import com.xk.bigata.hadoop.utils.FileUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;

public class AllSortDriver {

    public static void main(String[] args) throws Exception {
        String input = "mapreduce-basic/data/sort.data";
        String output = "mapreduce-basic/out";

        // 1 Create the MapReduce job
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);

        // Delete the output path if it already exists
        FileUtils.deleteFile(job.getConfiguration(), output);

        // 2 Set the driver class
        job.setJarByClass(AllSortDriver.class);

        // 3 Set the Mapper and Reducer classes
        job.setMapperClass(MyMapper.class);
        job.setReducerClass(MyReducer.class);

        // 4 Set the Map output KEY and VALUE types
        job.setMapOutputKeyClass(AccessAllSortDomain.class);
        job.setMapOutputValueClass(Text.class);

        // 5 Set the Reduce output KEY and VALUE types
        job.setOutputKeyClass(AccessAllSortDomain.class);
        job.setOutputValueClass(NullWritable.class);

        // 6 Set the input and output paths
        FileInputFormat.setInputPaths(job, new Path(input));
        FileOutputFormat.setOutputPath(job, new Path(output));

        // A single reduce task is what makes the sort global
        job.setNumReduceTasks(1);

        // 7 Submit the job
        boolean result = job.waitForCompletion(true);
        System.exit(result ? 0 : 1);
    }

    public static class MyMapper extends Mapper<LongWritable, Text, AccessAllSortDomain, Text> {

        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            String[] splits = value.toString().split("\t");
            String phone = splits[0];
            long up = Long.parseLong(splits[1]);
            long down = Long.parseLong(splits[2]);
            long sum = Long.parseLong(splits[3]);
            // Emit the whole record as the key so the shuffle sorts by its compareTo
            context.write(new AccessAllSortDomain(phone, up, down, sum), new Text(phone));
        }
    }

    public static class MyReducer extends Reducer<AccessAllSortDomain, Text, AccessAllSortDomain, NullWritable> {

        @Override
        protected void reduce(AccessAllSortDomain key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
            for (Text value : values) {
                context.write(key, NullWritable.get());
            }
        }
    }
}
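FileUtils is the author's own helper from com.xk.bigata.hadoop.utils and its source is not shown in this section. Below is a minimal sketch of what deleteFile presumably does, assuming it simply removes the output directory through the Hadoop FileSystem API so the job does not abort with an existing-output error:

package com.xk.bigata.hadoop.utils;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class FileUtils {

    // Assumed behavior: recursively delete the output path if it exists
    public static void deleteFile(Configuration conf, String output) throws Exception {
        FileSystem fs = FileSystem.get(conf);
        Path outputPath = new Path(output);
        if (fs.exists(outputPath)) {
            fs.delete(outputPath, true);
        }
    }
}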
2.3.2 AccessAllSortDomain Code
package com.xk.bigata.hadoop.mapreduce.domain;

import org.apache.hadoop.io.WritableComparable;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

/**
 * Steps for a custom data type:
 * 1. Implement the WritableComparable/Writable interface
 * 2. Override the write, readFields, and compareTo methods
 * 3. Provide a no-argument constructor
 * 4. write and readFields must handle the fields in the same order
 * 5. Override toString to control the output format (optional)
 */
public class AccessAllSortDomain implements WritableComparable<AccessAllSortDomain> {

    private String phone;
    private Long up;
    private Long down;
    private Long sum;

    public String getPhone() {
        return phone;
    }

    public void setPhone(String phone) {
        this.phone = phone;
    }

    public Long getUp() {
        return up;
    }

    public void setUp(Long up) {
        this.up = up;
    }

    public Long getDown() {
        return down;
    }

    public void setDown(Long down) {
        this.down = down;
    }

    public Long getSum() {
        return sum;
    }

    public void setSum(Long sum) {
        this.sum = sum;
    }

    public AccessAllSortDomain() {
    }

    public AccessAllSortDomain(String phone, Long up, Long down, Long sum) {
        this.phone = phone;
        this.up = up;
        this.down = down;
        this.sum = sum;
    }

    @Override
    public String toString() {
        return phone + '\t' +
                up + '\t' +
                down + '\t' +
                sum;
    }

    @Override
    public void write(DataOutput out) throws IOException {
        out.writeUTF(phone);
        out.writeLong(up);
        out.writeLong(down);
        out.writeLong(sum);
    }

    @Override
    public void readFields(DataInput in) throws IOException {
        phone = in.readUTF();
        up = in.readLong();
        down = in.readLong();
        sum = in.readLong();
    }

    @Override
    public int compareTo(AccessAllSortDomain o) {
        // Descending by total traffic. Deliberately never returns 0: the
        // shuffle groups keys that compare equal, so returning 0 on ties
        // would merge different phones with the same total into one key.
        return this.getSum() > o.getSum() ? -1 : 1;
    }
}
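Why compareTo never returns 0: keys that compare equal are grouped into a single reduce call, and this data set has three phones with the same total (240). A small plain-JDK analogy of that effect, using nothing beyond the standard library:

import java.util.Comparator;
import java.util.TreeSet;

public class NoZeroCompareDemo {

    public static void main(String[] args) {
        long[] sums = {360, 240, 240, 240};

        // Returns 0 on ties: the three 240s collapse into one entry
        TreeSet<Long> grouping = new TreeSet<>(Comparator.reverseOrder());
        // Never returns 0, like AccessAllSortDomain.compareTo: all survive
        TreeSet<Long> nonGrouping = new TreeSet<>((a, b) -> a > b ? -1 : 1);

        for (long s : sums) {
            grouping.add(s);
            nonGrouping.add(s);
        }

        System.out.println(grouping.size());    // 2
        System.out.println(nonGrouping.size()); // 4
    }
}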
2.4 Results
13502468823 7335 110349 117684
13925057413 11058 48243 59301
13726238888 12481 44681 57162
13726230503 2481 24681 27162
18320173382 9531 2412 11943
13560439658 2034 5892 7926
13660577991 6960 690 7650
15013685858 3659 3538 7197
13922314466 3008 3720 6728
15920133257 3156 2936 6092
84138413 4116 1432 5548
13602846565 1938 2910 4848
18211575961 1527 2106 3633
15989002119 1938 180 2118
13560436666 1116 954 2070
13926435656 132 1512 1644
13480253104 180 180 360
13826544101 264 0 264
13926251106 240 0 240
13760778710 120 120 240
13719199419 240 0 240
3 Custom MapReduce Sort Within Partitions
3.1 Requirement
Field 1: phone number
Field 2: upstream traffic
Field 3: downstream traffic
Field 4: total traffic
Phone numbers starting with 13 go to one partition,
phone numbers starting with 18 go to a second partition,
and everything else goes to a third partition.
Within each partition, sort by total traffic (descending).
3.2 Data
13480253104 180 180 360
13502468823 7335 110349 117684
13560436666 1116 954 2070
13560439658 2034 5892 7926
13602846565 1938 2910 4848
13660577991 6960 690 7650
13719199419 240 0 240
13726230503 2481 24681 27162
13726238888 12481 44681 57162
13760778710 120 120 240
13826544101 264 0 264
13922314466 3008 3720 6728
13925057413 11058 48243 59301
13926251106 240 0 240
13926435656 132 1512 1644
15013685858 3659 3538 7197
15920133257 3156 2936 6092
15989002119 1938 180 2118
18211575961 1527 2106 3633
18320173382 9531 2412 11943
84138413 4116 1432 5548
3.3 Code
3.3.1 PartitionSortDriver Code
package com.xk.bigata.hadoop.mapreduce.sort;

import com.xk.bigata.hadoop.mapreduce.domain.AccessAllSortDomain;
import com.xk.bigata.hadoop.utils.FileUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;

public class PartitionSortDriver {

    public static void main(String[] args) throws Exception {
        String input = "mapreduce-basic/data/sort.data";
        String output = "mapreduce-basic/out";

        // 1 Create the MapReduce job
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);

        // Set the custom partitioner class
        job.setPartitionerClass(PhonePartitioner.class);

        // Delete the output path if it already exists
        FileUtils.deleteFile(job.getConfiguration(), output);

        // 2 Set the driver class
        job.setJarByClass(PartitionSortDriver.class);

        // 3 Set the Mapper and Reducer classes
        job.setMapperClass(MyMapper.class);
        job.setReducerClass(MyReducer.class);

        // 4 Set the Map output KEY and VALUE types
        job.setMapOutputKeyClass(AccessAllSortDomain.class);
        job.setMapOutputValueClass(Text.class);

        // 5 Set the Reduce output KEY and VALUE types
        job.setOutputKeyClass(AccessAllSortDomain.class);
        job.setOutputValueClass(NullWritable.class);

        // 6 Set the input and output paths
        FileInputFormat.setInputPaths(job, new Path(input));
        FileOutputFormat.setOutputPath(job, new Path(output));

        // Set the number of reduce tasks: one per partition
        job.setNumReduceTasks(3);

        // 7 Submit the job
        boolean result = job.waitForCompletion(true);
        System.exit(result ? 0 : 1);
    }

    public static class MyMapper extends Mapper<LongWritable, Text, AccessAllSortDomain, Text> {

        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            String[] splits = value.toString().split("\t");
            String phone = splits[0];
            long up = Long.parseLong(splits[1]);
            long down = Long.parseLong(splits[2]);
            long sum = Long.parseLong(splits[3]);
            context.write(new AccessAllSortDomain(phone, up, down, sum), new Text(phone));
        }
    }

    public static class MyReducer extends Reducer<AccessAllSortDomain, Text, AccessAllSortDomain, NullWritable> {

        @Override
        protected void reduce(AccessAllSortDomain key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
            for (Text value : values) {
                context.write(key, NullWritable.get());
            }
        }
    }
}
3.3.2 PhonePartitioner Code
package com.xk.bigata.hadoop.mapreduce.sort;

import com.xk.bigata.hadoop.mapreduce.domain.AccessAllSortDomain;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Partitioner;

public class PhonePartitioner extends Partitioner<AccessAllSortDomain, Text> {

    @Override
    public int getPartition(AccessAllSortDomain accessAllSortDomain, Text text, int numPartitions) {
        String phone = accessAllSortDomain.getPhone();
        // Partition 0: phones starting with 13; partition 1: starting with 18; partition 2: everything else
        if (phone.startsWith("13")) {
            return 0;
        } else if (phone.startsWith("18")) {
            return 1;
        } else {
            return 2;
        }
    }
}
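Note that job.setNumReduceTasks(3) in the driver matches the three partition indexes returned here: each partition feeds exactly one reduce task, which produces the part-r-00000 through part-r-00002 files below. With only one reduce task Hadoop bypasses the custom partitioner entirely, and with two reduce tasks the records routed to partition 2 would fail with an illegal-partition error.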
3.4 Results
3.4.1 part-r-00000
13502468823 7335 110349 117684
13925057413 11058 48243 59301
13726238888 12481 44681 57162
13726230503 2481 24681 27162
13560439658 2034 5892 7926
13660577991 6960 690 7650
13922314466 3008 3720 6728
13602846565 1938 2910 4848
13560436666 1116 954 2070
13926435656 132 1512 1644
13480253104 180 180 360
13826544101 264 0 264
13719199419 240 0 240
13926251106 240 0 240
13760778710 120 120 240
3.4.2 part-r-00001
18320173382 9531 2412 11943
18211575961 1527 2106 3633
3.4.3 part-r-00002
15013685858 3659 3538 7197
15920133257 3156 2936 6092
84138413 4116 1432 5548
15989002119 1938 180 2118