[MR] Common Sorting Patterns in MapReduce

1. Hadoop's default sort orders records by key only, in lexicographic (byte) order.
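For intuition: Text compares raw bytes, so numeric strings order as strings rather than as numbers. A quick illustration with made-up values:

// Illustration only: "10" sorts before "2" because '1' < '2' byte-wise
Text a = new Text("10");
Text b = new Text("2");
System.out.println(a.compareTo(b) < 0); // prints true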
Straight to the code.
Map side

package Hadoop.MR.sort;

import java.io.IOException;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

/**
 * Text sort, map side: Hadoop's default sort orders records by key only,
 * so each input line is emitted as the key.
 * @author Young
 * created on 2017-6-30
 */
public class SortMapper extends Mapper<LongWritable, Text, Text, NullWritable> {

    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        String line = value.toString(); // the full text of the input line
        context.write(new Text(line), NullWritable.get()); // the line itself becomes the sort key
    }
}

Reduce side

package Hadoop.MR.sort;

import java.io.IOException;

import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

/**
 * Text sort, reduce side: keys arrive already sorted by the framework,
 * so the reducer simply writes them out.
 * @author Young
 * created on 2017-6-30
 */
public class SortReducer extends Reducer<Text, NullWritable, Text, NullWritable> {

    @Override
    protected void reduce(Text k2, Iterable<NullWritable> v2, Context context)
            throws IOException, InterruptedException {
        context.write(k2, NullWritable.get());
    }
}
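Note that because every line becomes a key, identical lines are grouped together and written only once, so this job also deduplicates as a side effect. If duplicates should survive, a small variant of the reduce body writes one record per grouped value:

for (NullWritable v : v2) {
    context.write(k2, v); // one output line per duplicate input line
}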

Driver

package Hadoop.MR.sort;

import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

/**
 * Driver for the default-sort job.
 * @author Young
 * @version created 2017-06-30 09:31:50
 */
public class SortDriver extends Configured implements Tool {
    public int run(String[] args) throws Exception {
        if (args.length != 2) {
            System.err.printf("Usage: %s [generic options] <input> <output>\n", getClass().getSimpleName());
            ToolRunner.printGenericCommandUsage(System.err);
            return -1;
        }
        Job job = Job.getInstance(getConf(), "Sort");
        job.setJarByClass(getClass());

        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        job.setMapperClass(SortMapper.class);
        job.setReducerClass(SortReducer.class);

        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(NullWritable.class);

        return job.waitForCompletion(true) ? 0 : 1;
    }

    public static void main(String[] args) throws Exception {
        int exitCode = ToolRunner.run(new SortDriver(), args);
        System.exit(exitCode);
    }
}
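One caveat: the output is globally sorted only if a single reduce task runs. With several reducers, each output file is sorted internally but the files are not ordered relative to each other. For small data, forcing one reducer is the simplest fix (TotalOrderPartitioner is the scalable alternative); a one-line addition to run():

// Assumed addition, before waitForCompletion: one reducer => one globally sorted file
job.setNumReduceTasks(1);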

Before and after sorting: (screenshot omitted)

2. Custom sort: order by the first column, and break ties with the second column.
The custom bean:

package Hadoop.MR.mysort;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

import org.apache.hadoop.io.WritableComparable;

/**
 * Implements WritableComparable and overrides compareTo to define the sort order.
 * @author Young
 * @version created 2017-06-30 09:54:14
 */
public class SortBean implements WritableComparable<SortBean> {
    private long firstNum;
    private long secondNum;

    public SortBean() {
    }

    public SortBean(long first, long second) {
        this.firstNum = first;
        this.secondNum = second;
    }

    public void readFields(DataInput in) throws IOException {
        // deserialize fields in the same order they were written
        this.firstNum = in.readLong();
        this.secondNum = in.readLong();
    }

    public void write(DataOutput out) throws IOException {
        out.writeLong(firstNum);
        out.writeLong(secondNum);
    }

    public int compareTo(SortBean o) {
        // sort by the first column; break ties with the second column.
        // Long.compare returns 0 when both fields match, so fully equal
        // beans correctly compare as equal.
        if (this.firstNum == o.firstNum) {
            return Long.compare(this.secondNum, o.secondNum);
        }
        return Long.compare(this.firstNum, o.firstNum);
    }

    @Override
    public String toString() {
        return this.firstNum + " " + this.secondNum;
    }

    public long getFirstNum() {
        return firstNum;
    }

    public void setFirstNum(long firstNum) {
        this.firstNum = firstNum;
    }

    public long getSecondNum() {
        return secondNum;
    }

    public void setSecondNum(long secondNum) {
        this.secondNum = secondNum;
    }
}
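A quick local sanity check of the comparator (hypothetical values, not part of the job):

SortBean a = new SortBean(3, 7);
SortBean b = new SortBean(3, 5);
System.out.println(a.compareTo(b) > 0);              // true: first columns tie, 7 > 5
System.out.println(a.compareTo(new SortBean(3, 7))); // 0: fully equal beans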

Map side

package Hadoop.MR.mysort;

import java.io.IOException;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

/**
 * Custom sort, map side: parses two tab-separated numbers per line
 * and emits them as a SortBean key.
 * @author Young
 * @version created 2017-06-30 10:20:12
 */
public class MySortMapper extends Mapper<LongWritable, Text, SortBean, NullWritable> {
    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        String line = value.toString();
        String[] num = line.split("\t");
        long firstNum = Long.parseLong(num[0]);
        long secondNum = Long.parseLong(num[1]);
        SortBean bean = new SortBean(firstNum, secondNum);
        context.write(bean, NullWritable.get());
    }
}

Reduce side

package Hadoop.MR.mysort;

import java.io.IOException;

import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.Reducer;

/**
 * Custom sort, reduce side: the framework has already ordered the SortBean
 * keys via compareTo, so the reducer just writes them out.
 * @author Young
 * @version created 2017-06-30 10:30:42
 */
public class MySortReducer extends Reducer<SortBean, NullWritable, SortBean, NullWritable> {
    @Override
    protected void reduce(SortBean k2, Iterable<NullWritable> v2, Context context)
            throws IOException, InterruptedException {
        context.write(k2, NullWritable.get());
    }
}

Driver

package Hadoop.MR.mysort;

import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

/**
 * Driver for the custom-sort job.
 * @author Young
 * @version created 2017-06-30 10:38:23
 */
public class MySortDriver extends Configured implements Tool {
    public int run(String[] args) throws Exception {
        if (args.length != 2) {
            System.err.printf("Usage: %s [generic options] <input> <output>\n", getClass().getSimpleName());
            ToolRunner.printGenericCommandUsage(System.err);
            return -1;
        }
        Job job = Job.getInstance(getConf(), "MySort");
        job.setJarByClass(getClass());

        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        job.setMapperClass(MySortMapper.class);
        job.setReducerClass(MySortReducer.class);

        job.setOutputKeyClass(SortBean.class);
        job.setOutputValueClass(NullWritable.class);

        return job.waitForCompletion(true) ? 0 : 1;
    }

    public static void main(String[] args) throws Exception {
        int exitCode = ToolRunner.run(new MySortDriver(), args);
        System.exit(exitCode);
    }
}

Before and after sorting: (screenshot omitted)

3. Finding the maximum
Map side

package Hadoop.MR.max;

import java.io.IOException;

import org.apache.hadoop.io.DoubleWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

/**
 * Max value, map side: emits (account, income) pairs parsed from
 * tab-separated input lines.
 * @author Young
 * @version created 2017-09-05 15:47:02
 */
public class MaxMapper extends Mapper<LongWritable, Text, Text, DoubleWritable> {
    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        String[] lines = value.toString().split("\t");
        String account = lines[0];
        double income = Double.parseDouble(lines[1]);
        context.write(new Text(account), new DoubleWritable(income));
    }
}

Reduce side

package Hadoop.MR.max;

import java.io.IOException;

import org.apache.hadoop.io.DoubleWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

/**
 * Max value, reduce side: keeps the largest income seen for each account.
 * @author Young
 * @version created 2017-09-05 16:15:29
 */
public class MaxReducer extends Reducer<Text, DoubleWritable, Text, DoubleWritable> {
    @Override
    protected void reduce(Text key, Iterable<DoubleWritable> value, Context context)
            throws IOException, InterruptedException {
        // Start at negative infinity: Double.MIN_VALUE is the smallest
        // *positive* double and would give wrong answers for negative incomes.
        double max = Double.NEGATIVE_INFINITY;
        for (DoubleWritable v : value) {
            max = Math.max(max, v.get());
        }
        context.write(key, new DoubleWritable(max));
    }
}

Driver

package Hadoop.MR.max;

import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.DoubleWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

/**
 * Driver for the max-value job.
 * @author Young
 * @version created 2017-09-05 16:17:32
 */
public class MaxDriver extends Configured implements Tool {

    public static void main(String[] args) throws Exception {
        int exitCode = ToolRunner.run(new MaxDriver(), args);
        System.exit(exitCode);
    }

    public int run(String[] args) throws Exception {
        if (args.length != 2) {
            System.err.printf("Usage: %s [generic options] <input> <output>\n", getClass().getSimpleName());
            ToolRunner.printGenericCommandUsage(System.err);
            return -1;
        }
        Job job = Job.getInstance(getConf(), "Max");
        job.setJarByClass(getClass());
        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        job.setMapperClass(MaxMapper.class);
        job.setReducerClass(MaxReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(DoubleWritable.class);
        return job.waitForCompletion(true) ? 0 : 1;
    }
}
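Because max is associative and commutative, the reducer can double as a combiner, shrinking the shuffle to at most one value per account per map task. A one-line addition to run() (not in the original code):

job.setCombinerClass(MaxReducer.class); // safe because max(max(a,b),c) == max(a,b,c)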

Input and output: (screenshot omitted)

4. Top N: the file contains several distinct keys, each appearing on multiple lines; take the three smallest values for each key.
Map side

package Hadoop.MR.topn;

import java.io.IOException;

import org.apache.hadoop.io.DoubleWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

/**
 * Top-n, map side: emits (account, income) pairs, reusing the output
 * key/value objects to avoid per-record allocation.
 * @author Young
 * @version created 2017-09-10 15:26:00
 */
public class TopNMapper extends Mapper<LongWritable, Text, Text, DoubleWritable> {
    private Text k = new Text();
    private DoubleWritable v = new DoubleWritable();

    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        String[] lines = value.toString().split("\t");
        String account = lines[0];
        double income = Double.parseDouble(lines[1]);
        v.set(income);
        k.set(account);
        context.write(k, v);
    }
}

Reduce side

package Hadoop.MR.topn;

import java.io.IOException;
import java.util.TreeSet;

import org.apache.hadoop.io.DoubleWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

/**
 * Top-n, reduce side: keeps the n smallest values per key in a bounded TreeSet.
 * @author Young
 * @version created 2017-09-10 15:49:01
 */
public class TopNReducer extends Reducer<Text, DoubleWritable, Text, DoubleWritable> {
    private DoubleWritable v = new DoubleWritable();
    private int n = 3;

    @Override
    protected void reduce(Text key, Iterable<DoubleWritable> value, Context context)
            throws IOException, InterruptedException {
        // Build a fresh set for each key; a shared instance field would
        // leak values from one key into the next.
        TreeSet<Double> incomeTreeSet = new TreeSet<Double>();
        for (DoubleWritable val : value) {
            incomeTreeSet.add(val.get());
            if (incomeTreeSet.size() > n) {
                incomeTreeSet.remove(incomeTreeSet.last()); // evict the largest, keep the n smallest
            }
        }
        // Note: TreeSet collapses duplicate values, so ties count only once.
        for (Double in : incomeTreeSet) {
            v.set(in);
            context.write(key, v);
        }
    }
}

Driver

package Hadoop.MR.topn;

import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.DoubleWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

/**
 * Driver for the top-n job.
 * @author Young
 * @version created 2017-09-10 15:59:49
 */
public class TopNDriver extends Configured implements Tool {

    public static void main(String[] args) throws Exception {
        int exitCode = ToolRunner.run(new TopNDriver(), args);
        System.exit(exitCode);
    }

    public int run(String[] args) throws Exception {
        if (args.length != 2) {
            System.err.printf("Usage: %s [generic options] <input> <output>\n", getClass().getSimpleName());
            ToolRunner.printGenericCommandUsage(System.err);
            return -1;
        }
        Job job = Job.getInstance(getConf(), "TopN");
        job.setJarByClass(getClass());

        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        job.setMapperClass(TopNMapper.class);
        job.setReducerClass(TopNReducer.class);

        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(DoubleWritable.class);

        return job.waitForCompletion(true) ? 0 : 1;
    }
}

Result: (screenshot omitted)
The result is correct, but this approach can bog the reducer down: the TreeSet's insertions and evictions burn memory and CPU on values that will never make the top n. Ideally the sorting would happen on the map side so the reducer only has to take the first n per key, but I had not worked that out. If you have a better algorithm, please comment.
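One possible direction, sketched under assumptions (the class and field names below are mine, not from the original, and I have not benchmarked it): each mapper buffers the n smallest values per account and emits them in cleanup(), so at most n values per key leave each map task; the TopNReducer above then merges them unchanged. The same TreeSet duplicate-collapsing caveat applies.

package Hadoop.MR.topn;

import java.io.IOException;
import java.util.HashMap;
import java.util.Map;
import java.util.TreeSet;

import org.apache.hadoop.io.DoubleWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

/**
 * Sketch: in-mapper top-n. Buffers the n smallest values per account and
 * emits them in cleanup(), cutting shuffle volume to n values per key per mapper.
 */
public class InMapperTopNMapper extends Mapper<LongWritable, Text, Text, DoubleWritable> {
    private static final int N = 3;
    private final Map<String, TreeSet<Double>> smallestPerKey = new HashMap<String, TreeSet<Double>>();

    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        String[] fields = value.toString().split("\t");
        TreeSet<Double> set = smallestPerKey.get(fields[0]);
        if (set == null) {
            set = new TreeSet<Double>();
            smallestPerKey.put(fields[0], set);
        }
        set.add(Double.parseDouble(fields[1]));
        if (set.size() > N) {
            set.remove(set.last()); // evict the largest, keeping the n smallest
        }
    }

    @Override
    protected void cleanup(Context context) throws IOException, InterruptedException {
        // Emit the buffered values once per map task instead of once per record.
        Text k = new Text();
        DoubleWritable v = new DoubleWritable();
        for (Map.Entry<String, TreeSet<Double>> e : smallestPerKey.entrySet()) {
            k.set(e.getKey());
            for (double d : e.getValue()) {
                v.set(d);
                context.write(k, v);
            }
        }
    }
}

In the driver this would simply replace the mapper via job.setMapperClass(InMapperTopNMapper.class); the reducer stays as-is.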
