需要二次排序的原因:mapreduce架构自动对映射器生成的键进行排序,即归约器启动之前,所有键是有序的,但是值是随机的,二次排序指的是对值进行排序。归约器输入形如:(key, [v1, v2, ..., vn]),即一个key对应多个值,这些值是无序的,排序后得到有序的值序列S,如下:(key, S)
其中,S按照升序或者降序排列
归约器对于二次排序的两种解决方案:
1.让归约器读取和缓存给定键的所有值,完成归约器中排序,特点是不可伸缩,依赖归约器内存
2.使用mapreduce框架对归约器值排序,方法是创建组合键,例如,A是键,B和C是值,选择B作为次键,这样A和B作为组合键,C作为值,将排序交给MapReduce框架完成,这样不用在内存中排序,是可伸缩的方案
定制插件:
1.分区器:根据映射器的输出键决定将哪个映射器的输出发送到哪个归约器,其本质是对键(自然键)的hash值按归约器个数取模
2.比较器:按照自然键对一个归约器中的数据分组,代码如下:
以year,month,temperature为例,MapReduce框架对于二次排序整体的处理流程是:
1.映射器创建(K, V)对,其中K是组合键(year-month, temperature),V是temperature的值,组合键中的year-month部分是自然键
2.通过分区器插件,将所有自然键发送给同一个归约器
3.排序阶段按组合键的compareTo使温度有序,再通过分组比较器按自然键分组,保证同一自然键的温度按顺序到达归约器
显然,MapReduce框架完成了排序,而不用在内存中操作
代码如下,除了mapper,reducer,主作业流程以外,还有其余3个文件,一个分区器、一个比较器、一个组合键(中间键)类:
DateTemperatureGroupingComparator.java分组比较器:
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;
public class DateTemperatureGroupingComparator extends WritableComparator {

    public DateTemperatureGroupingComparator() {
        /* Register the key class with the parent; "true" asks the framework
           to instantiate keys so compare() can work on deserialized objects. */
        super(DateTemperaturePair.class, true);
    }

    /* Grouping comparator: two composite keys belong to the same reduce()
       call iff their natural keys (yearMonth) are equal. The temperature
       part of the composite key is deliberately ignored here. */
    @Override
    public int compare(WritableComparable a, WritableComparable b) {
        final DateTemperaturePair left = (DateTemperaturePair) a;
        final DateTemperaturePair right = (DateTemperaturePair) b;
        return left.getYearMonth().compareTo(right.getYearMonth());
    }
}
DateTemperaturePartitioner.java分区器:
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Partitioner;
public class DateTemperaturePartitioner
        extends Partitioner<DateTemperaturePair, Text> {

    /* Route every record with the same natural key (yearMonth) to the same
       reducer: hash the natural key, reduce it modulo the partition count,
       and strip the sign so the result is a valid partition index. The
       temperature part of the key plays no role in partitioning. */
    @Override
    public int getPartition(DateTemperaturePair pair, Text value,
                            int numberOfPartitions) {
        return Math.abs(pair.getYearMonth().hashCode() % numberOfPartitions);
    }
}
DateTemperaturePair.java中间键类
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.io.WritableComparable;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.util.Objects;
public class DateTemperaturePair
/* java不支持多重继承,使用implements继承接口,不同接口之间用逗号隔开 */
implements Writable, WritableComparable<DateTemperaturePair> {
private String yearMonth; //自然键
private String day;
protected Integer temperature; //次键
/* 用这个方法指出如何对DateTemperaturePair排序 */
public int compareTo(DateTemperaturePair o) {
/* 调用String的字符串比较方法compareTo */
int compareValue = this.yearMonth.compareTo(o.getYearMonth());
if (compareValue == 0) {
compareValue = temperature.compareTo(o.getTemperature());
}
/* 这样实现降序排列 */
return -1 * compareValue;
}
/* DataOutput用于将java基本类型转换成二进制字符流 */
public void write(DataOutput dataOutput) throws IOException {
Text.writeString(dataOutput, yearMonth);
dataOutput.writeInt(temperature);
}
public void readFields(DataInput dataInput) throws IOException {
this.yearMonth = Text.readString(dataInput);
this.temperature = dataInput.readInt();
}
@Override
public String toString() {
return yearMonth.toString();
}
public String getYearMonth() {
return yearMonth;
}
public void setYearMonth(String text) {
this.yearMonth = text;
}
public void setDay(String day) {
this.day = day;
}
public Integer getTemperature() {
return temperature;
}
public void setTemperature(Integer temperature) {
this.temperature = temperature;
}
}
SecondarySortingMapper.java映射器:
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import java.io.IOException;
/* 注意输出键的类型是DateTemperaturePair,是自定义的组合键 */
/* Mapper: parses "year,month,day,temperature" CSV lines and emits
   (composite key, temperature). Note the output key type is the custom
   composite key DateTemperaturePair, so the framework sorts by it. */
public class SecondarySortingMapper extends
        Mapper<LongWritable, Text, DateTemperaturePair, IntWritable> {

    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        String[] tokens = value.toString().split(",");
        // BUG FIX: skip blank/short lines instead of failing the whole
        // task with ArrayIndexOutOfBoundsException.
        if (tokens.length < 4) {
            return;
        }
        String yearMonth = tokens[0].trim() + "-" + tokens[1].trim();
        String day = tokens[2].trim();
        final int temperature;
        try {
            temperature = Integer.parseInt(tokens[3].trim());
        } catch (NumberFormatException e) {
            return; // non-numeric temperature field: skip the record
        }
        DateTemperaturePair reduceKey = new DateTemperaturePair();
        reduceKey.setYearMonth(yearMonth);
        reduceKey.setDay(day);
        reduceKey.setTemperature(temperature);
        // The temperature also travels as the value so the reducer can
        // emit the ordered list without re-parsing the key.
        context.write(reduceKey, new IntWritable(temperature));
    }
}
SecondarySortingReducer.java归约器:
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
import java.io.IOException;
/* 输入键和输出键的类型都是自定义的中间键 */
/* Reducer: thanks to the sort order of the composite key and the grouping
   comparator, the temperatures for one yearMonth arrive already ordered;
   this method only joins them into a comma-separated line. */
public class SecondarySortingReducer extends
        Reducer<DateTemperaturePair, IntWritable, DateTemperaturePair, Text> {

    @Override
    protected void reduce(DateTemperaturePair key,
            Iterable<IntWritable> values, Context context)
            throws IOException, InterruptedException {
        // StringBuilder avoids O(n^2) string concatenation in the loop.
        StringBuilder sortedTemperatureList = new StringBuilder();
        String separator = "";
        for (IntWritable temperature : values) {
            // BUG FIX: prefix-separator join — no trailing comma to strip,
            // and no StringIndexOutOfBoundsException from deleteCharAt()
            // when the values iterable is empty.
            sortedTemperatureList.append(separator).append(temperature);
            separator = ",";
        }
        context.write(key, new Text(sortedTemperatureList.toString()));
    }
}
SecondarySort.java主作业流程:
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
public class SecondarySort extends Configured implements Tool {
public int run(String[] args) throws Exception {
/* 设置作业,指导hadoop获取jar包 */
Job job = new Job();
job.setJarByClass(SecondarySort.class);
job.setJobName("SecondarySort");
/* 获取input路径和output路径 */
Path inputPath = new Path(args[0]);
Path outputPath = new Path(args[1]);
FileInputFormat.setInputPaths(job, inputPath);
FileOutputFormat.setOutputPath(job, outputPath);
/* 设置mapper的输出键和输出值 */
job.setMapOutputKeyClass(DateTemperaturePair.class);
job.setMapOutputValueClass(IntWritable.class);
/* 设置reducer的输出键和输出值 */
job.setOutputKeyClass(DateTemperaturePair.class);
job.setOutputValueClass(IntWritable.class);
/* 指定要使用的mapper,reducer,分区器,比较器 */
job.setMapperClass(SecondarySortingMapper.class);
job.setReducerClass(SecondarySortingReducer.class);
job.setPartitionerClass(DateTemperaturePartitioner.class);
job.setGroupingComparatorClass(DateTemperatureGroupingComparator.class);
boolean status = job.waitForCompletion(true);
return status ? 0 : 1;
}
public static void main(String[] args) throws Exception {
if (args.length != 2) {
throw new IllegalArgumentException(
"!!!!!!!!!!!!!! Usage!!!!!!!!!!!!!!: SecondarySort"
+ "<input-path> <output-path>");
}
int returnStatus = ToolRunner.run(new SecondarySort(), args);
System.exit(returnStatus);
}
}
输入文件:
[root@master ~]# hdfs dfs -cat /sample_input.txt
2000,12,04,10
2000,11,01,20
2000,12,02,-20
2000,11,02,30
2000,11,24,-40
2012,12,21,30
2012,12,22,-20
2012,12,23,60
2012,12,24,70
2012,12,25,10
2013,01,22,80
2013,01,23,90
2013,01,24,70
2013,01,20,-10
运行作业命令:
hadoop jar SecondarySort.jar SecondarySort /sample_input.txt output
作业运行结果如下,可以看到已经按照temperature字段降序排序了:
[root@master ~]# hdfs dfs -cat output/*
2013-01 90,80,70,-10
2012-12 70,60,30,10,-20
2000-12 10,-20
2000-11 30,20,-40