数据源
orderid pid cast
10000001 Pdt_01 222.8
10000002 Pdt_06 722.4
10000001 Pdt_02 222.8
10000001 Pdt_05 25.8
10000003 Pdt_01 232.8
10000003 Pdt_01 33.8
10000002 Pdt_04 122.4
10000002 Pdt_03 522.8
期望
将这些数据按照orderid分组,但是输出到一个文件里!!
将每一个相同orderid中的花费(cast)最高的那条数据取出!
10000001 Pdt_01 222.8
10000002 Pdt_06 722.4
10000003 Pdt_01 232.8
Bean
package com.zhengkw.groupingcomparator;
import org.apache.hadoop.io.WritableComparable;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
/**
* @ClassName:OrderBean
* @author: zhengkw
* @description:
* @date: 20/02/27下午 1:51
* @version:1.0
* @since: jdk 1.8
*/
public class OrderBean implements WritableComparable<OrderBean> {
    private int orderId;
    private String product_id;
    private double money;

    /** No-arg constructor required by Hadoop's reflection-based deserialization. */
    public OrderBean() {
    }

    public String getProduct_id() {
        return product_id;
    }

    public double getMoney() {
        return money;
    }

    public void setOrderId(int orderId) {
        this.orderId = orderId;
    }

    public int getOrderId() {
        return orderId;
    }

    public void setProduct_id(String product_id) {
        this.product_id = product_id;
    }

    public void setMoney(double money) {
        this.money = money;
    }

    /**
     * Serializes the bean. Field order must match {@link #readFields(DataInput)}.
     */
    @Override
    public void write(DataOutput out) throws IOException {
        out.writeInt(orderId);
        out.writeUTF(product_id);
        out.writeDouble(money);
    }

    /**
     * Deserializes the bean in the same field order as {@link #write(DataOutput)}.
     */
    @Override
    public void readFields(DataInput in) throws IOException {
        this.orderId = in.readInt();
        this.product_id = in.readUTF();
        this.money = in.readDouble();
    }

    /**
     * Secondary sort: primary key is orderId ascending; within the same order,
     * money descending so the most expensive item of each order comes first.
     * <p>
     * Uses {@code Integer.compare}/{@code Double.compare} instead of a mutable
     * instance field and a hand-rolled ternary: the previous version never
     * returned 0 for equal money (asymmetric, breaking the compareTo contract)
     * and mutated shared state, which is unsafe if the comparator is reused.
     *
     * @param o the bean to compare against
     * @return negative/zero/positive per the {@link Comparable} contract
     */
    @Override
    public int compareTo(OrderBean o) {
        int byId = Integer.compare(this.orderId, o.orderId);
        if (byId != 0) {
            return byId;
        }
        // Reversed operand order => descending by money.
        return Double.compare(o.money, this.money);
    }

    @Override
    public String toString() {
        return orderId +
                "\t" + product_id +
                "\t" + money;
    }
}
Mapper
package com.zhengkw.groupingcomparator;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import java.io.IOException;
/**
* @ClassName:OrderMapper
* @author: zhengkw
* @description: (1)利用“订单id和成交金额”作为key,
* 可以将Map阶段读取到的所有订单数据按照id升序排序,
* 如果id相同再按照金额降序排序,发送到Reduce。
* (2)在Reduce端利用groupingComparator
* 将订单id相同的kv聚合成组,
* 然后取第一个即是该订单中最贵商品
* @date: 20/02/27下午 1:48
* @version:1.0
* @since: jdk 1.8
*/
public class OrderMapper extends Mapper<LongWritable, Text, OrderBean, NullWritable> {
    // Reused across map() calls — Hadoop serializes the key immediately on write,
    // so mutating the same bean per record is safe and avoids per-record allocation.
    private final OrderBean outKey = new OrderBean();
    private final NullWritable outValue = NullWritable.get();

    /**
     * Parses one tab-separated input line ("orderId\tproductId\tmoney") into an
     * {@link OrderBean} key and emits it with a NullWritable value. The composite
     * key (orderId + money) drives the shuffle-phase secondary sort.
     *
     * @param key     byte offset of the line within the input split
     * @param value   one raw input line
     * @param context MapReduce task context used to emit the pair
     */
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        // e.g. 10000001 Pdt_02 222.8
        String[] fields = value.toString().trim().split("\t");
        outKey.setOrderId(Integer.parseInt(fields[0]));
        outKey.setProduct_id(fields[1]);
        outKey.setMoney(Double.parseDouble(fields[2]));
        context.write(outKey, outValue);
    }
}
Reduce
package com.zhengkw.groupingcomparator;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
import java.io.IOException;
/**
* @ClassName:OrderReducer
* @author: zhengkw
* @description:
* @date: 20/02/27下午 2:12
* @version:1.0
* @since: jdk 1.8
*/
public class OrderReducer extends Reducer<OrderBean, NullWritable, NullWritable, OrderBean> {
    /**
     * Emits only the first key of each group. Because keys arrive sorted by
     * orderId asc / money desc and the grouping comparator collapses equal
     * orderIds into one group, that first key is the most expensive item of
     * the order.
     *
     * @param key     first OrderBean of the group (max money for this orderId)
     * @param values  NullWritable placeholders, intentionally unused
     * @param context MapReduce task context used to emit the result
     */
    @Override
    protected void reduce(OrderBean key, Iterable<NullWritable> values, Context context) throws IOException, InterruptedException {
        NullWritable emptyKey = NullWritable.get();
        context.write(emptyKey, key);
    }
}
groupcomparator
package com.zhengkw.groupingcomparator;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;
/**
* @ClassName: OrderGroupComparator
* @author: zhengkw
* @description: 在Reduce端利用groupingComparator
* 将订单id相同的kv聚合成组,
* 然后取第一个即是该订单中最贵商品
* @date: 20/02/27下午 2:25
* @version:1.0
* @since: jdk 1.8
*/
public class OrderGroupComparator extends WritableComparator {

    /** Registers OrderBean and asks the parent to create instances for deserialization. */
    public OrderGroupComparator() {
        super(OrderBean.class, true);
    }

    /**
     * Groups reduce-side keys by orderId only: beans with equal orderId compare
     * as 0 and therefore land in the same reduce() call, regardless of money.
     * <p>
     * Rewritten to be stateless: the previous version stored the result in a
     * mutable instance field, which is a latent bug if the framework reuses the
     * comparator concurrently. {@code Integer.compare} also avoids the manual
     * three-way branching.
     *
     * @param a first key
     * @param b second key
     * @return 0 when both orderIds match (same group), otherwise their ordering
     */
    @Override
    public int compare(WritableComparable a, WritableComparable b) {
        OrderBean left = (OrderBean) a;
        OrderBean right = (OrderBean) b;
        return Integer.compare(left.getOrderId(), right.getOrderId());
    }
}
Driver
package com.zhengkw.groupingcomparator;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.log4j.BasicConfigurator;
import java.io.IOException;
/**
* @ClassName:OrderDriver
* @author: zhengkw
* @description:
* @date: 20/02/27下午 3:10
* @version:1.0
* @since: jdk 1.8
*/
public class OrderDriver {
    /**
     * Configures and submits the grouping-comparator job.
     * <p>
     * Paths may be supplied on the command line ({@code args[0]} = input,
     * {@code args[1]} = output); when absent, the original hard-coded defaults
     * are used, so existing invocations keep working.
     *
     * @param args optional [inputPath, outputPath]
     */
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        BasicConfigurator.configure();
        // Input/output paths: CLI args override the hard-coded local defaults.
        Path inputPath = new Path(args.length > 0 ? args[0] : "F:\\mrinput\\groupcomparator");
        Path outputPath = new Path(args.length > 1 ? args[1] : "f:/output4");
        Configuration conf = new Configuration();
        // MapReduce refuses to run if the output directory exists; remove it up front.
        FileSystem fs = FileSystem.get(conf);
        if (fs.exists(outputPath)) {
            fs.delete(outputPath, true);
        }
        // Create the job.
        Job job = Job.getInstance(conf);
        // Wire up the driver, mapper, reducer and the grouping comparator
        // that collapses keys with equal orderId into one reduce group.
        job.setJarByClass(OrderDriver.class);
        job.setMapperClass(OrderMapper.class);
        job.setReducerClass(OrderReducer.class);
        job.setGroupingComparatorClass(OrderGroupComparator.class);
        // Map-output key/value classes.
        job.setMapOutputKeyClass(OrderBean.class);
        job.setMapOutputValueClass(NullWritable.class);
        // Final output key/value classes.
        job.setOutputKeyClass(NullWritable.class);
        job.setOutputValueClass(OrderBean.class);
        // Input and output locations.
        FileInputFormat.setInputPaths(job, inputPath);
        FileOutputFormat.setOutputPath(job, outputPath);
        // Submit, print progress (true), and exit with the job's status code.
        boolean result = job.waitForCompletion(true);
        System.exit(result ? 0 : 1);
    }
}
总结
-
GroupComparator继承自WritableComparator,在Reduce阶段被调用,用于决定哪些key属于同一组。
-
用于按照某些字段对数据分组,同时让所有分组的结果输出到同一个文件中。注意区别于Partitioner:Partitioner在Map端溢写(spill)阶段调用,决定数据进入哪个Reduce分区。
-
分组比较器只负责分组,不负责排序:它在数据已经排序(二次排序)之后,辅助将相同key的数据归入同一个reduce调用。