1. Partial sort
By default, MapReduce sorts records by key within each partition.
2. Total sort
Records are ordered globally across all partitions. Approaches:
1) Use a single reducer.
2) Use a custom partition function.
Different key ranges are routed to different partitions; since keys are sorted automatically within each partition, the output is totally ordered across partitions (see the partitioner below and the driver sketch after it).
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Partitioner;

public class PassPartition extends Partitioner<Text, IntWritable> {
    @Override
    public int getPartition(Text text, IntWritable intWritable, int numPartitions) {
        String key = text.toString();
        // The boundary strings must be tested in ascending order,
        // otherwise one of the partition ranges is unreachable.
        if (key.compareTo("aaaa") < 0) {
            return 0;
        } else if (key.compareTo("xxx") < 0) {
            return 1;
        } else {
            return 2;
        }
    }
}
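To use this partitioner, the driver registers it and sets a matching number of reduce tasks. A minimal driver fragment sketch (PassPartitionDriver is a hypothetical name for illustration; the mapper/reducer and I/O setup are elided):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.Job;

public class PassPartitionDriver {
    public static void main(String[] args) throws Exception {
        Job job = Job.getInstance(new Configuration());
        job.setJarByClass(PassPartitionDriver.class);
        // Route keys through the custom partitioner; the number of reduce
        // tasks must match the number of partitions it can return (3 here).
        job.setPartitionerClass(PassPartition.class);
        job.setNumReduceTasks(3);
        // ... mapper, reducer, input/output formats and paths as usual ...
        job.waitForCompletion(true);
    }
}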
3) Sampling // works poorly with plain-text data; for plain-text input, KeyValueTextInputFormat is recommended
//1. Set the partitioner class to TotalOrderPartitioner (shipped with MapReduce)
//2. Initialize a sampler => InputSampler.RandomSampler<Text,Text> sampler = new InputSampler.RandomSampler<Text,Text>(0.01,10);
//   Alternatives: SplitSampler, IntervalSampler
//3. Set the partition-file location => TotalOrderPartitioner.setPartitionFile(job.getConfiguration(),new Path("D:/"));
//4. Write the sample data => InputSampler.writePartitionFile(job,sampler);
//5. Note: steps 1-4 must come after the job configuration and before the job is executed
1. Random sampling: resource-intensive and wasteful of performance
2. Split sampling
3. Interval sampling: best performance
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.KeyValueTextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.partition.InputSampler;
import org.apache.hadoop.mapreduce.lib.partition.TotalOrderPartitioner;

public class PassApp {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        conf.set("fs.defaultFS", "file:///");
        FileSystem fs = FileSystem.get(conf);
        // Initialize the job from the configuration
        Job job = Job.getInstance(conf);
        // Set the job name
        job.setJobName("word count");
        // Entry-point class of the job
        job.setJarByClass(PassApp.class);
        // Set the mapper class
        job.setMapperClass(PassMapper.class);
        // Set the reducer class
        job.setReducerClass(PassReducer.class);
        // Set the total-order partitioner class
        job.setPartitionerClass(TotalOrderPartitioner.class);
        // Map output key/value types
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);
        // Reduce output key/value types
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        //FileInputFormat.setMaxInputSplitSize(job,10);
        //FileInputFormat.setMinInputSplitSize(job,10);
        job.setInputFormatClass(KeyValueTextInputFormat.class);
        // Input path
        FileInputFormat.addInputPath(job, new Path("D:/wc/out"));
        // Output path
        FileOutputFormat.setOutputPath(job, new Path("D:/wc/out4"));
        if (fs.exists(new Path("D:/wc/out4"))) {
            fs.delete(new Path("D:/wc/out4"), true);
        }
        // Use three reducers
        job.setNumReduceTasks(3);
        /**
         * Random sampling: resource-intensive and wasteful of performance.
         * @param freq       probability with which each key is selected; should be greater
         *                   than desired sample count (e.g. 2) / total key count (e.g. 100)
         * @param numSamples total number of keys to select across all splits
         */
        // Choose the sampler type
        InputSampler.RandomSampler<Text, Text> sampler = new InputSampler.RandomSampler<Text, Text>(0.001, 8800);
        //InputSampler.SplitSampler<Text, Text> sampler = new InputSampler.SplitSampler<Text, Text>(10, 3);
        //InputSampler.IntervalSampler<Text, Text> sampler = new InputSampler.IntervalSampler<Text, Text>(0.001);
        // Set the partition-file location
        TotalOrderPartitioner.setPartitionFile(job.getConfiguration(), new Path("D:/wc/par/"));
        // Write the sample data
        InputSampler.writePartitionFile(job, sampler);
        // Run the job
        boolean b = job.waitForCompletion(true);
    }
}
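PassMapper and PassReducer are referenced above but not shown in these notes. A minimal sketch of plausible implementations, assuming the input under D:/wc/out is earlier word-count output in word<TAB>count form, which KeyValueTextInputFormat delivers as (Text, Text) pairs (both class bodies are assumptions, not the original code):

// PassMapper.java (assumed): re-emit each word with its count parsed as an int
import java.io.IOException;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

public class PassMapper extends Mapper<Text, Text, Text, IntWritable> {
    @Override
    protected void map(Text key, Text value, Context context)
            throws IOException, InterruptedException {
        context.write(key, new IntWritable(Integer.parseInt(value.toString())));
    }
}

// PassReducer.java (assumed): sum the counts per word; with TotalOrderPartitioner
// the three output files are globally ordered by key
import java.io.IOException;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

public class PassReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Context context)
            throws IOException, InterruptedException {
        int sum = 0;
        for (IntWritable v : values) {
            sum += v.get();
        }
        context.write(key, new IntWritable(sum));
    }
}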
3. Secondary sort
Building on MapReduce's sort by key, the values associated with each key are sorted as well.
Example: finding the yearly maximum temperature:
1901: 10 20 30 50 40
1902: 30 20 10 11 -8
After the years are sorted, the temperatures within each year are sorted too.
Implementation:
1. Define a composite key so that year_temperature becomes a single key: a custom CompKey class implements the WritableComparable interface, providing custom serialization and a custom comparator (the sort logic).
import org.apache.hadoop.io.WritableComparable;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

public class CompKey implements WritableComparable<CompKey> {
    private String year;
    private int temp;

    // Defines the sort order
    @Override
    public int compareTo(CompKey o) {
        String oyear = o.getYear();
        String tyear = this.getYear();
        int otemp = o.getTemp();
        int ttemp = this.getTemp();
        // If the years are equal, compare temperatures in descending order,
        // so the highest temperature comes first within a year
        if (tyear.equals(oyear)) {
            return otemp - ttemp;
        }
        // Otherwise, order by year
        return tyear.compareTo(oyear);
    }

    @Override
    public void write(DataOutput out) throws IOException {
        out.writeUTF(year);
        out.writeInt(temp);
    }

    @Override
    public void readFields(DataInput in) throws IOException {
        this.setYear(in.readUTF());
        this.setTemp(in.readInt());
    }

    @Override
    public String toString() {
        return "CompKey{" +
                "year='" + year + '\'' +
                ", temp=" + temp +
                '}';
    }

    public CompKey(String year, int temp) {
        this.year = year;
        this.temp = temp;
    }

    public CompKey() {
    }

    public String getYear() {
        return year;
    }

    public void setYear(String year) {
        this.year = year;
    }

    public int getTemp() {
        return temp;
    }

    public void setTemp(int temp) {
        this.temp = temp;
    }
}
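For completeness, the map task has to build CompKey instances from the raw records. A minimal mapper sketch, assuming whitespace-separated "year temperature" lines and NullWritable map values (MaxTempMapper is a hypothetical name; the driver would also need to register CompKey/NullWritable as the map output types):

import java.io.IOException;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

// Assumed mapper: turns a "year temperature" line into a CompKey so the
// shuffle sorts by year, then by temperature in descending order.
public class MaxTempMapper extends Mapper<LongWritable, Text, CompKey, NullWritable> {
    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        String[] fields = value.toString().trim().split("\\s+");
        CompKey ck = new CompKey(fields[0], Integer.parseInt(fields[1]));
        context.write(ck, NullWritable.get());
    }
}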
2. Define a grouping comparator that merges all keys with the same year into one group; that is, the two distinct keys 1920 30 and 1920 40 are recognized as the same key. The grouping comparator works on the reduce side: extend WritableComparator with a MyGroupComparator class and override its compare() method.
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;

/**
 * Reduce-side grouping comparator: keys such as "1902 20" and "1902 30"
 * are treated as the same key.
 */
public class MyGroupComparator extends WritableComparator {
    // The constructor is required; passing true tells the parent to create key instances
    protected MyGroupComparator() {
        super(CompKey.class, true);
    }

    // Comparison logic: two keys are equal as long as their years are equal
    @Override
    public int compare(WritableComparable a, WritableComparable b) {
        CompKey ck1 = (CompKey) a;
        CompKey ck2 = (CompKey) b;
        return ck1.getYear().compareTo(ck2.getYear());
    }
}
Register the grouping comparator in the main application:
job.setGroupingComparatorClass(MyGroupComparator.class);
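Because CompKey sorts temperatures within a year in descending order and the grouping comparator folds a whole year into one reduce call, the key seen at the start of each reduce() already carries that year's maximum temperature. A minimal reducer sketch under those assumptions (MaxTempReducer is a hypothetical name):

import java.io.IOException;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

// Assumed reducer: the incoming key already holds the year's maximum
// temperature, since keys within a group arrive sorted descending by temp.
public class MaxTempReducer extends Reducer<CompKey, NullWritable, Text, IntWritable> {
    @Override
    protected void reduce(CompKey key, Iterable<NullWritable> values, Context context)
            throws IOException, InterruptedException {
        context.write(new Text(key.getYear()), new IntWritable(key.getTemp()));
    }
}

Iterating the values instead would walk every temperature of the year in descending order, because Hadoop refills the same key instance as the value iterator advances.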