MapReduce特性：全排序

最新推荐文章于 2022-10-03 15:52:52 发布

柯南爱上指针

最新推荐文章于 2022-10-03 15:52:52 发布

阅读量466

点赞数

分类专栏： hadoop基础 文章标签： MapReduce特性：全排序

本文链接：https://blog.csdn.net/qq_35468937/article/details/80510432

版权

hadoop基础专栏收录该内容

4 篇文章 0 订阅

订阅专栏

现在在学习hadoop，尽量把自己学到的知识分享出来，监督自己的学习！

在hadoop权威指南中说到：排序是MapReduce的核心技术。因此可以知道排序的分量。

一、如何产生一个全排序文件？

1、只使用一个reduce（也就是只用一个分区：a single partition）,显然不靠谱，面对大文件的时候，失去了mapreduce的优势。

2、自己定义分区函数，对数据进行分区。

3、采用hadoop中的采样机制。

针对第二种和第三种，举一个例子讲一下。

二、自定义分区函数，进行全排序

举一个例子：要实现统计1920年-2020年100年每年温度的最高值，并且按年份从小到大排列。

1、思路如下图：

上图为了内容的简洁性，就没有将map中的combiner过程写出来。

2、附上partitioner上的代码

package cn.hbmy.hdfs.mr.maxtemperature;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.mapreduce.Partitioner;
/**
 *  自定义分区就是为了防止数据倾斜
 *  也就是说：防止所有的数据都进入了一个reducer里面去了
 *  解决数据倾斜的两种方式:
 *  1、自定义分区类，
 *  2、重新制定key值
 * */

//注意：map中出来的数据是IntWritable格式的数据，我们这儿的数据的格式也应该是IntWritable的格式的数据
//                                         数据的格式是IntWritable
public class TemPartitioner extends Partitioner<IntWritable,IntWritable>{
//                                                               这个参数就是第几部分的意思
    public int getPartition(IntWritable year, IntWritable intWritable, int parts) {
        //System.out.println("This is AllSortTemPartitioner");
        int y = year.get() - 1920;
        if (y<33){
            return 0;
        }else if(y>=33&&y<66){
            return 1;
        }else {
            return 2;
        }
    }
}

注意：上述代码把这100年划分成了三个区间：第一个区间存放1920-1952年的数据，第二个区间存放1953-1985年的数据，第三个区间存放1986-2020年的数据。但是如果数据量分布不均匀，比如2010-2015年的数据量占总数据量的80%，那么按照上面的划分，大量的数据会进入第三个区间，导致第三个reduce负载压力过大，产生数据倾斜。为此，推荐下面的方法。

三、采用hadoop中的采样机制

解决上述问题的关键点就在于如何均匀地划分各个区间。理想的情况下，各个区间所含的记录数应该大致相等，使得作业的总体执行时间不会受个别reducer的限制。上文的数据，大致分布如下：

只要对这些数据采样，了解了数据的大致上的分布情况，这样就可以均匀的划分数据了。hadoop提供了采样器，只要使用就好了。

APP代码：

package cn.hbmy.hdfs.mr.allsorttemperature;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.partition.InputSampler;
import org.apache.hadoop.mapreduce.lib.partition.TotalOrderPartitioner;

/**
 * Driver for the total-order temperature job: samples the input keys, writes a partition
 * file from the sample, and runs the job with {@link TotalOrderPartitioner} so that the
 * reducer outputs, read in partition order, are globally sorted by key.
 *
 * <p>The order of the configuration calls matters. The input format, input path, map output
 * key class, number of reduce tasks and partition file location are all set before
 * {@code InputSampler.writePartitionFile} is called, because that call is what samples the
 * input and writes the split points.
 */
public class AllSortTemWCApp {
    /** Number of reduce tasks; the partition file then holds one split point fewer. */
    private static final int NUM_REDUCERS = 3;
    /** Location of the partition file written by the sampler. */
    private static final String PARTITION_FILE = "/upload/wangwei/temperaturepartitionfile";

    /**
     * Configures and runs the job, then exits with 0 on success and 1 on job failure.
     *
     * @param args {@code args[0]} is the input path (sequence files), {@code args[1]} is the
     *             output path
     * @throws Exception if job setup, sampling or submission fails
     */
    public  static void main(String[] args) throws Exception {
        if (args.length < 2) {
            System.err.println("Usage: AllSortTemWCApp <input path> <output path>");
            System.exit(2);
        }

        Configuration conf = new Configuration();
        // For a local-filesystem run, fs.defaultFS can be set to "file:///" on conf here.
        Job job = Job.getInstance(conf);

        // Basic job properties.
        job.setJobName("AllSortTemWCApp");
        job.setJarByClass(AllSortTemWCApp.class);

        // The input is read as sequence files rather than as lines of text.
        job.setInputFormatClass(SequenceFileInputFormat.class);
        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        job.setMapperClass(AllSortTemWCMapper.class);
        job.setReducerClass(AllSortTemWCReduce.class);

        job.setMapOutputKeyClass(IntWritable.class);
        job.setMapOutputValueClass(IntWritable.class);
        job.setOutputKeyClass(IntWritable.class);
        job.setOutputValueClass(IntWritable.class);

        // Total-order partitioning replaces a hand-written range partitioner.
        job.setPartitionerClass(TotalOrderPartitioner.class);

        // Random sampler arguments: sampling frequency 0.1, at most 1000 samples,
        // at most 3 input splits sampled.
        InputSampler.Sampler<IntWritable,IntWritable> sampler =
                new InputSampler.RandomSampler<IntWritable, IntWritable>(0.1, 1000, 3);
        job.setNumReduceTasks(NUM_REDUCERS);

        /*
         * Write the sampled split points to the partition file. Example content of the
         * generated sequence file; the two keys are the boundaries chosen for three ranges:
         *
         * [root@mini1 ~]# hdfs dfs -text /upload/wangwei/temperaturepartitionfile
         *   18/05/29 17:20:01 INFO zlib.ZlibFactory: Successfully loaded & initialized native-zlib library
         *   18/05/29 17:20:01 INFO compress.CodecPool: Got brand-new decompressor [.deflate]
         *   2001    (null)
         *   2034    (null)
         */
        TotalOrderPartitioner.setPartitionFile(job.getConfiguration(), new Path(PARTITION_FILE));
        InputSampler.writePartitionFile(job, sampler);

        // Propagate the job outcome to the caller instead of always exiting with 0.
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}

map代码

package cn.hbmy.hdfs.mr.allsorttemperature;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;

/**
 * Identity mapper: forwards every input record unchanged.
 *
 * <p>The four type parameters are the input key/value types followed by the output
 * key/value types; all of them are {@code IntWritable}.
 */
public class AllSortTemWCMapper
        extends Mapper<IntWritable, IntWritable, IntWritable, IntWritable> {

    /**
     * Emits the incoming pair as-is.
     *
     * @param year        input key, written out unchanged as the output key
     * @param temperature input value, written out unchanged as the output value
     * @param context     task context used to emit the record
     */
    @Override
    protected void map(IntWritable year, IntWritable temperature, Context context)
            throws IOException, InterruptedException {
        context.write(year, temperature);
    }
}

reduce代码

package cn.hbmy.hdfs.mr.allsorttemperature;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;

/**
 * Reducer that emits, for each key (a year), the largest of the values grouped under it.
 */
// Type parameters: the first pair is the map output (reduce input) key/value,
// the second pair is the reduce output key/value.
public class AllSortTemWCReduce
        extends Reducer<IntWritable, IntWritable, IntWritable, IntWritable> {

    /**
     * Scans all values for one year and writes the maximum.
     *
     * @param year    the key shared by all values in this call
     * @param values  the values grouped under {@code year}
     * @param context task context used to emit the result
     */
    @Override
    protected void reduce(IntWritable year, Iterable<IntWritable> values, Context context)
            throws IOException, InterruptedException {
        // Start from the smallest int so that any actual reading replaces it.
        int highest = Integer.MIN_VALUE;
        for (IntWritable reading : values) {
            highest = Math.max(highest, reading.get());
        }
        context.write(year, new IntWritable(highest));
    }
}