手写MR之GroupComparator(Java)

数据源

orderid			 pid	cast

10000001	Pdt_01	222.8
10000002	Pdt_06	722.4
10000001	Pdt_02	222.8
10000001	Pdt_05	25.8
10000003	Pdt_01	232.8
10000003	Pdt_01	33.8
10000002	Pdt_04	122.4
10000002	Pdt_03	522.8

期望

将这些数据按照orderid分组,但所有分组的结果输出到同一个文件中。
对每个orderid分组,取出其中花费(cast列)最高的那一条数据。

10000001	Pdt_01	222.8
10000002	Pdt_06	722.4
10000003	Pdt_01	232.8

Bean

package com.zhengkw.groupingcomparator;

import org.apache.hadoop.io.WritableComparable;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

/**
 * @ClassName:OrderBean
 * @author: zhengkw
 * @description:
 * @date: 20/02/27下午 1:51
 * @version:1.0
 * @since: jdk 1.8
 */
/**
 * Composite MapReduce key for one order line: order id, product id and amount.
 *
 * <p>Natural ordering is order id ascending, then amount descending, so that the
 * most expensive line of every order is the first key of that order after sorting.
 *
 * <p>{@link #hashCode()} depends on the order id only. Hadoop's default
 * {@code HashPartitioner} routes a record by its key's hash code, so this keeps all
 * lines of one order in the same partition when more than one reducer is configured.
 *
 * <p>Note: {@link #compareTo(OrderBean)} ignores the product id, so two beans may
 * compare as 0 without being {@link #equals(Object) equal}.
 */
public class OrderBean implements WritableComparable<OrderBean> {
    private int orderId;
    private String product_id;
    private double money;

    /** No-arg constructor; instances are populated via setters or {@link #readFields}. */
    public OrderBean() {
    }

    public String getProduct_id() {
        return product_id;
    }

    public double getMoney() {
        return money;
    }

    public void setOrderId(int orderId) {
        this.orderId = orderId;
    }

    public int getOrderId() {
        return orderId;
    }

    public void setProduct_id(String product_id) {
        this.product_id = product_id;
    }

    public void setMoney(double money) {
        this.money = money;
    }

    /**
     * Serializes the fields in the order: orderId, product_id, money.
     *
     * @param out sink to write to
     * @throws IOException if writing fails
     */
    @Override
    public void write(DataOutput out) throws IOException {
        out.writeInt(orderId);
        out.writeUTF(product_id);
        out.writeDouble(money);
    }

    /**
     * Restores the fields in exactly the order used by {@link #write(DataOutput)}.
     *
     * @param in source to read from
     * @throws IOException if reading fails
     */
    @Override
    public void readFields(DataInput in) throws IOException {
        this.orderId = in.readInt();
        this.product_id = in.readUTF();
        this.money = in.readDouble();
    }

    /**
     * Secondary sort: order id ascending, then amount descending.
     *
     * @param o the bean to compare against
     * @return negative, zero or positive as this bean sorts before, equal to or after {@code o}
     */
    @Override
    public int compareTo(OrderBean o) {
        int byOrderId = Integer.compare(this.orderId, o.orderId);
        if (byOrderId != 0) {
            return byOrderId;
        }
        // Operands are swapped on purpose: a larger amount must sort first.
        // Equal amounts yield 0, as the Comparable contract requires.
        return Double.compare(o.money, this.money);
    }

    /** Two beans are equal when order id, product id and amount all match. */
    @Override
    public boolean equals(Object obj) {
        if (this == obj) {
            return true;
        }
        if (!(obj instanceof OrderBean)) {
            return false;
        }
        OrderBean other = (OrderBean) obj;
        return orderId == other.orderId
                && Double.compare(money, other.money) == 0
                && java.util.Objects.equals(product_id, other.product_id);
    }

    /** Hash of the order id only, so every line of one order hashes identically. */
    @Override
    public int hashCode() {
        return orderId;
    }

    /** Tab-separated {@code orderId, product_id, money}; this is the job's output line format. */
    @Override
    public String toString() {
        return orderId +
                "\t" + product_id +
                "\t" + money;
    }
}

Mapper

package com.zhengkw.groupingcomparator;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;

/**
 * @ClassName:OrderMapper
 * @author: zhengkw
 * @description: (1)利用“订单id和成交金额”作为key,
 * 可以将Map阶段读取到的所有订单数据按照id升序排序,
 * 如果id相同再按照金额降序排序,发送到Reduce。
 * (2)在Reduce端利用groupingComparator
 * 将订单id相同的kv聚合成组,
 * 然后取第一个即是该订单中最贵商品
 * @date: 20/02/27下午 1:48
 * @version:1.0
 * @since: jdk 1.8
 */
/**
 * Parses one tab-separated order line into an {@link OrderBean} key with an empty value.
 *
 * <p>Using the whole bean (order id plus amount) as the key lets the sort order the
 * records by order id ascending and, within one order, by amount descending.
 */
public class OrderMapper extends Mapper<LongWritable, Text, OrderBean, NullWritable> {
    NullWritable v = NullWritable.get();
    // Reused for every record; its fields are overwritten before each write.
    OrderBean k = new OrderBean();


    /**
     * Converts a line such as {@code 10000001<TAB>Pdt_02<TAB>222.8} into a key.
     *
     * @param key     byte offset of the line in the input
     * @param value   one line of input
     * @param context job context used to emit the key/value pair
     * @throws IOException          if emitting the record fails
     * @throws InterruptedException if the task is interrupted while emitting
     */
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        String line = value.toString().trim();
        // A blank line (e.g. a trailing newline in the input file) carries no record;
        // without this guard Integer.parseInt("") would fail the whole task.
        if (line.isEmpty()) {
            return;
        }
        String[] words = line.split("\t");

        // Column 0: order id
        k.setOrderId(Integer.parseInt(words[0]));
        // Column 1: product id
        k.setProduct_id(words[1]);
        // Column 2: amount
        k.setMoney(Double.parseDouble(words[2]));
        context.write(k, v);
    }
}

Reduce

package com.zhengkw.groupingcomparator;

import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;

/**
 * @ClassName:OrderReducer
 * @author: zhengkw
 * @description:
 * @date: 20/02/27下午 2:12
 * @version:1.0
 * @since: jdk 1.8
 */
/**
 * Emits exactly one record per reduce group: the key as it is handed to
 * {@code reduce}, paired with an empty output key.
 *
 * <p>The values are never iterated, so only the key passed in at the start of
 * the group is written.
 */
public class OrderReducer extends Reducer<OrderBean, NullWritable, NullWritable, OrderBean> {
    /** Placeholder output key; only the bean itself is of interest in the output. */
    private static final NullWritable NO_KEY = NullWritable.get();

    @Override
    protected void reduce(OrderBean key, Iterable<NullWritable> values, Context context)
            throws IOException, InterruptedException {
        context.write(NO_KEY, key);
    }
}

groupcomparator

package com.zhengkw.groupingcomparator;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;

/**
 * @ClassName: OrderGroupComparator
 * @author: zhengkw
 * @description: 在Reduce端利用groupingComparator
 * 将订单id相同的kv聚合成组,
 * 然后取第一个即是该订单中最贵商品
 * @date: 20/02/27下午 2:25
 * @version:1.0
 * @since: jdk 1.8
 */
/**
 * Grouping comparator that treats two {@link OrderBean} keys as the same group
 * whenever their order ids match, regardless of product id or amount.
 *
 * <p>The comparator holds no mutable state: the result is computed from the two
 * arguments alone.
 */
public class OrderGroupComparator extends WritableComparator {

    public OrderGroupComparator() {
        // Second argument asks WritableComparator to create key instances for comparison.
        super(OrderBean.class, true);
    }

    /**
     * Compares two keys by order id only.
     *
     * @param a first key; must be an {@link OrderBean}
     * @param b second key; must be an {@link OrderBean}
     * @return negative, zero or positive as {@code a}'s order id is less than,
     *         equal to or greater than {@code b}'s; zero means "same group"
     */
    @Override
    @SuppressWarnings("rawtypes") // signature is dictated by WritableComparator
    public int compare(WritableComparable a, WritableComparable b) {
        int leftOrderId = ((OrderBean) a).getOrderId();
        int rightOrderId = ((OrderBean) b).getOrderId();
        return Integer.compare(leftOrderId, rightOrderId);
    }
}

Driver

package com.zhengkw.groupingcomparator;


import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;

import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.log4j.BasicConfigurator;

import java.io.IOException;

/**
 * @ClassName:OrderDriver
 * @author: zhengkw
 * @description:
 * @date: 20/02/27下午 3:10
 * @version:1.0
 * @since: jdk 1.8
 */
/**
 * Configures and submits the grouping-comparator job.
 *
 * <p>Usage: {@code OrderDriver [inputPath [outputPath]]}. Missing arguments fall
 * back to the built-in default paths.
 */
public class OrderDriver {
    /** Input path used when no first argument is given. */
    private static final String DEFAULT_INPUT = "F:\\mrinput\\groupcomparator";
    /** Output path used when no second argument is given. */
    private static final String DEFAULT_OUTPUT = "f:/output4";

    /**
     * Runs the job and exits with status 0 on success, 1 on failure.
     *
     * <p>WARNING: an existing output directory is deleted recursively before the job starts.
     *
     * @param args optional input path and output path
     * @throws IOException            if filesystem access or job submission fails
     * @throws ClassNotFoundException if a configured job class cannot be loaded
     * @throws InterruptedException   if waiting for the job is interrupted
     */
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        BasicConfigurator.configure();

        Path inputPath = new Path(args.length > 0 ? args[0] : DEFAULT_INPUT);
        Path outputPath = new Path(args.length > 1 ? args[1] : DEFAULT_OUTPUT);

        Configuration conf = new Configuration();

        // Resolve the filesystem from the output path itself so that a path with an
        // explicit scheme is cleaned on the filesystem it actually lives on.
        FileSystem fs = outputPath.getFileSystem(conf);
        if (fs.exists(outputPath)) {
            fs.delete(outputPath, true);
        }

        Job job = Job.getInstance(conf);

        // Job classes
        job.setJarByClass(OrderDriver.class);
        job.setMapperClass(OrderMapper.class);
        job.setReducerClass(OrderReducer.class);
        job.setGroupingComparatorClass(OrderGroupComparator.class);

        // Map output key/value types
        job.setMapOutputKeyClass(OrderBean.class);
        job.setMapOutputValueClass(NullWritable.class);

        // Final output key/value types
        job.setOutputKeyClass(NullWritable.class);
        job.setOutputValueClass(OrderBean.class);

        // Input and output locations
        FileInputFormat.setInputPaths(job, inputPath);
        FileOutputFormat.setOutputPath(job, outputPath);

        // Submit, print progress, and turn the outcome into the process exit code.
        boolean succeeded = job.waitForCompletion(true);

        System.exit(succeeded ? 0 : 1);

    }
}

总结

  • GroupComparator是继承于WritableComparator,是在Reduce阶段调用!

  • 用于解决内容上按照某些字段进行分组、但结果仍输出在同一个文件里的需求。区别于Partitioner:Partitioner在Map端输出写入环形缓冲区时(溢写之前)调用,决定每条数据进入哪个分区,即最终由哪个ReduceTask处理并写入哪个输出文件。

  • 分组比较器本身不负责排序,只用于分组:在key已经排好序的基础上,判断相邻的key是否属于同一组,同一组的数据进入同一次reduce()调用。

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值