MR: Implementing a Secondary Sort

PS:

1. String comparison with compareTo():

   1) Characters are compared one by one from the first position; at the first differing character, the method returns the difference between the two characters' code values (the ASCII difference, for ASCII text).

   2) If no differing character is found within the shorter string (one string is a prefix of the other), the difference in lengths is returned; identical strings return 0.

2. Integer comparison with compareTo():

   1) Returns 0 when the values are equal.

   2) Returns -1 when the receiver is smaller, and 1 when it is larger.
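A standalone sketch illustrating both rules (the demo class name is mine, not from the original project):

public class CompareToDemo {
    public static void main(String[] args) {
        //String.compareTo: difference of the first mismatched characters
        System.out.println("apple".compareTo("apric")); // 'p' - 'r' = -2
        //prefix case: no mismatch within the shorter string, so length difference
        System.out.println("app".compareTo("apple"));   // 3 - 5 = -2
        //lexicographic, not numeric: relevant when ratings are stored as Strings
        System.out.println("10".compareTo("9"));        // '1' - '9' = -8
        //Integer.compareTo: -1, 0 or 1 depending on numeric order
        System.out.println(Integer.valueOf(10).compareTo(9)); // 1
        System.out.println(Integer.valueOf(9).compareTo(10)); // -1
    }
}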

 

Goal: compute each user's top-N favorite movies by sorting records on uid and then on rating.

1. RateBean (implements WritableComparable&lt;RateBean&gt;)

 

The overridden compareTo() method is used by the framework when sorting keys during the shuffle: uid ascending, and rate descending within the same uid. For example, (uid=1, rate=5) sorts before (uid=1, rate=3), which sorts before (uid=2, rate=4).

package topn;

import org.apache.hadoop.io.WritableComparable;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

/**
 * @Description
 * @Author cqh <caoqingghai@1000phone.com>
 * @Version V1.0
 * @Since 1.0
 * @Date 2019/4/15 09:48
 */
public class RateBean implements WritableComparable<RateBean> {
    private String movie;
    private String rate;
    private String timeStamp;
    private String uid;
    public RateBean() {
    }

    public RateBean(String movie) {
        this.movie = movie;
    }

    public String getMovie() {
        return movie;
    }

    public void setMovie(String movie) {
        this.movie = movie;
    }

    public String getRate() {
        return rate;
    }

    public void setRate(String rate) {
        this.rate = rate;
    }

    public String getTimeStamp() {
        return timeStamp;
    }

    public void setTimeStamp(String timeStamp) {
        this.timeStamp = timeStamp;
    }

    public String getUid() {
        return uid;
    }

    public void setUid(String uid) {
        this.uid = uid;
    }

    @Override
    public int compareTo(RateBean o) {
        //Shuffle sort order: uid ascending; within the same uid, rate descending
        //(the leading minus flips the comparison). Since rate is a String, the
        //comparison is lexicographic, which is fine for single-digit ratings.
        if (this.uid.equals(o.uid)) {
            return -this.rate.compareTo(o.rate);
        } else {
            return this.uid.compareTo(o.uid);
        }
    }

    @Override
    public void write(DataOutput out) throws IOException {
        //the serialization order here must match the read order in readFields
        out.writeUTF(movie);
        out.writeUTF(rate);
        out.writeUTF(timeStamp);
        out.writeUTF(uid);
    }

    @Override
    public String toString() {
        return movie +"\t" + rate + "\t" + uid;
    }

    @Override
    public void readFields(DataInput in) throws IOException {
        this.movie = in.readUTF();
        this.rate = in.readUTF();
        this.timeStamp = in.readUTF();
        this.uid = in.readUTF();

    }
}
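To make the ordering concrete, here is a standalone sketch (my own demo class, not part of the job) that sorts a few beans with the compareTo above:

package topn;

import java.util.ArrayList;
import java.util.Collections;
import java.util.List;

public class SortOrderDemo {
    public static void main(String[] args) {
        List<RateBean> beans = new ArrayList<>();
        beans.add(bean("2", "4", "m3"));
        beans.add(bean("1", "3", "m2"));
        beans.add(bean("1", "5", "m1"));
        //WritableComparable extends Comparable, so plain Collections.sort works
        Collections.sort(beans);
        //prints m1 5 1, then m2 3 1, then m3 4 2:
        //uid ascending, rate descending within the same uid
        for (RateBean b : beans) {
            System.out.println(b);
        }
    }

    private static RateBean bean(String uid, String rate, String movie) {
        RateBean b = new RateBean(movie);
        b.setUid(uid);
        b.setRate(rate);
        return b;
    }
}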

2. RatePartitioner (extends Partitioner&lt;RateBean, NullWritable&gt;)

On the map side, each record's partition is computed from the hash of its uid modulo the number of partitions, so all records with the same uid land on the same reduce task.

package topn;

import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.Partitioner;

/**
 * @Description
 * @Author cqh <caoqingghai@1000phone.com>
 * @Version V1.0
 * @Since 1.0
 * @Date 2019/4/15 10:17
 */
public class RatePartitioner extends Partitioner<RateBean, NullWritable> {
    @Override
    public int getPartition(RateBean rateBean, NullWritable nullWritable, int numPartitions) {
        //Mask with Integer.MAX_VALUE to clear the sign bit: hashCode() may be
        //negative and a negative partition index is illegal. The mask is applied
        //first, then the modulo.
        return (rateBean.getUid().hashCode() & Integer.MAX_VALUE) % numPartitions;
    }
}
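Why the mask matters: String.hashCode() can be negative, and Java's % keeps the sign of the dividend, so an unmasked hash could yield a negative (illegal) partition index. A standalone sketch, with a hard-coded negative value standing in for such a hash:

public class PartitionMaskDemo {
    public static void main(String[] args) {
        int numPartitions = 4;
        int h = -123456789; //stand-in for a uid whose hashCode happens to be negative
        System.out.println(h % numPartitions);                       // -1: illegal partition index
        System.out.println((h & Integer.MAX_VALUE) % numPartitions); // 3: always in [0, numPartitions)
    }
}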

 

3. RateGroupingComparable (extends WritableComparator)

Ensures that when the reduce side groups records, all beans with the same uid fall into the same group, i.e. one reduce() call per uid.

Without it, grouping would fall back to the compareTo() defined in RateBean, which also distinguishes records by rate; every (uid, rate) combination would then form its own tiny group and the reducer's top-N counting would not see a whole user's records together.

 

package topn;

import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;

/**
 * @Description
 * @Author cqh <caoqingghai@1000phone.com>
 * @Version V1.0
 * @Since 1.0
 * @Date 2019/4/15 10:28
 */
public class RateGroupingComparable extends WritableComparator {
    public RateGroupingComparable() {
        //true: instantiate RateBean objects so compare() receives deserialized beans
        super(RateBean.class, true);
    }

    @Override
    public int compare(WritableComparable a, WritableComparable b) {
        //group solely by uid: beans with the same uid go to the same reduce() call
        RateBean bean1 = (RateBean) a;
        RateBean bean2 = (RateBean) b;
        return bean1.getUid().compareTo(bean2.getUid());
    }
}

 

4. RateMapper

 

package topn;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.codehaus.jackson.map.ObjectMapper;


import java.io.IOException;

/**
 * @Description
 * @Author cqh <caoqingghai@1000phone.com>
 * @Version V1.0
 * @Since 1.0
 * @Date 2019/4/15 10:05
 */
public class RateMapper extends Mapper<LongWritable, Text,RateBean, NullWritable> {
    ObjectMapper objectMapper;

    //setup() is called once per map task; typically used to initialize members
    @Override
    protected void setup(Context context) throws IOException, InterruptedException {
        objectMapper = new ObjectMapper();
    }

    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        //each input line is a JSON record; deserialize it straight into a RateBean
        //and emit the bean itself as the key (the value carries no information)
        RateBean rateBean = objectMapper.readValue(value.toString(), RateBean.class);
        context.write(rateBean, NullWritable.get());
    }

    //cleanup() is called once, after the map task finishes; used to release resources
    @Override
    protected void cleanup(Context context) throws IOException, InterruptedException {
        super.cleanup(context);
    }
}
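For reference, the mapper expects one JSON object per line with field names matching RateBean. A hypothetical sample line (the real dataset's values will differ):

{"movie":"2858","rate":"4","timeStamp":"978301619","uid":"1"}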

 

5. RateReducer

package topn;

import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;

/**
 * @Description
 * @Author cqh <caoqingghai@1000phone.com>
 * @Version V1.0
 * @Since 1.0
 * @Date 2019/4/15 10:23
 */
public class RateReducer extends Reducer<RateBean, NullWritable,RateBean, NullWritable> {
    @Override
    protected void setup(Context context) throws IOException, InterruptedException {
        super.setup(context);
    }

    @Override
    protected void reduce(RateBean key, Iterable<NullWritable> values, Context context) throws IOException, InterruptedException {
        //topN is read from the job Configuration and defaults to 4
        int topN = context.getConfiguration().getInt("topN", 4);
        int count = 0;
        //One call covers all records of a single uid (grouping comparator), already
        //sorted by rate descending (RateBean.compareTo). Hadoop reuses the key object:
        //advancing the values iterator also refreshes the key's fields, so each
        //iteration writes a different movie record.
        for (NullWritable value : values) {
            context.write(key, NullWritable.get());
            count++;
            if (count == topN) {
                return;
            }
        }
    }

    @Override
    protected void cleanup(Context context) throws IOException, InterruptedException {
        super.cleanup(context);
    }
}

6. RateRunner

package topn;


import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.log4j.Logger;

import java.io.IOException;

/**
 * @Description
 * @Author cqh <caoqingghai@1000phone.com>
 * @Version V1.0
 * @Since 1.0
 * @Date 2019/4/15 10:53
 */
public class RateRunner  {
    private static Logger logger = Logger.getLogger(RateRunner.class);
    public static void main(String[] args) {
        try {
            Configuration conf = new Configuration();
            //conf.set("topN","5"); //optionally override the reducer's default of 4
            Job job = Job.getInstance(conf, "topN");
            //lets the framework locate the jar containing these classes on a cluster
            job.setJarByClass(RateRunner.class);
            logger.info("job setup OK ================================");
            job.setMapperClass(RateMapper.class);
            job.setReducerClass(RateReducer.class);
            /**
             * Map output gets serialized across the shuffle, so the framework
             * must be told the map output key/value types
             */
            job.setMapOutputKeyClass(RateBean.class);
            job.setMapOutputValueClass(NullWritable.class);
            /**
             * The reduce output is written out as well, so its key/value types
             * must also be specified
             */
            job.setOutputKeyClass(RateBean.class);
            job.setOutputValueClass(NullWritable.class);
            /**
             * Tell the framework which component reads the input; for plain text
             * files that is TextInputFormat (use the one from the longer
             * mapreduce.lib.input package, i.e. the new API)
             */
            job.setInputFormatClass(TextInputFormat.class);
            /**
             * Tell that component where to read from.
             * TextInputFormat extends FileInputFormat, and the parent class is
             * used to set the input path. The path is a directory; if it contains
             * subdirectories, recursive traversal must be enabled or the job fails.
             */
            FileInputFormat.addInputPath(job,new Path(args[0]));
            job.setPartitionerClass(RatePartitioner.class);
            job.setGroupingComparatorClass(RateGroupingComparable.class);
            FileSystem fs = FileSystem.get(conf);
            Path out = new Path(args[1]);
            //MapReduce refuses to write into an existing output directory, so clear it
            if (fs.exists(out)) {
                logger.info("output directory exists, deleting ================================");
                fs.delete(out, true);
            }
            FileOutputFormat.setOutputPath(job,out);

            boolean res = job.waitForCompletion(true);
            System.exit(res?0:1);
        } catch (Exception e) {
            logger.error("job execution failed", e);
            e.printStackTrace();
        }
        }
    }


}
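Assuming the classes are packaged into a jar (the jar name and paths below are made up), the job would be launched along these lines:

hadoop jar topn.jar topn.RateRunner /data/rates /out/rate-topn

Each output line then follows RateBean.toString(), i.e. movie, rate and uid separated by tabs, with at most topN lines per uid.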

 
