1、首先要了解实现hadoop的排序需要用到的接口
a、WritableComparable 用于序列化和排序,主要作用于map()输出之后、缓冲区溢写排序阶段,需要重写:
compareTo()、write()、readFields()方法
b、WritableComparator 用于分组比较,在reduce()执行前运行,需要重写:
compare()方法,并在无参构造方法中调用super(Bean.class, true)注册要比较的bean类型
2、数据分析
各个字段(以制表符分隔)依次代表的含义:userId、dateTime、title、unitPrice、purchaseNums、orderId
13764633023 2014-12-01 02:20:42.000 全视目Allseelook 原宿风暴显色美瞳彩色隐形艺术眼镜1片 拍2包邮 33.6 2 18067781305
13377918580 2014-12-17 08:10:25.000 kilala可啦啦大美目大直径混血美瞳年抛彩色近视隐形眼镜2片包邮 19.8 2 17359010576
13532689063 2014-12-14 20:42:14.000 kilala可啦啦大美目大直径混血美瞳年抛彩色近视隐形眼镜2片包邮 19.8 2 17359010576
13856049592 2014-12-22 17:03:26.000 kilala可啦啦大美目大直径混血美瞳年抛彩色近视隐形眼镜2片包邮 19.6 2 17359010576
18056000601 2014-12-23 13:08:44.000 舒加美 甜甜圈糖果彩钻美瞳大直径年抛彩色隐形眼镜进口全国包邮 9.9 1 18224781070
13377918580 2014-12-16 08:30:06.000 EYEMAY艾魅美瞳大直径美瞳年抛彩色隐形近视眼镜包邮一片装送镜盒 19.6 2 17525034357
13377918580 2014-12-27 16:58:06.000 舒加美 大水凝天使冰蓝美瞳年抛大直径彩色隐形眼镜 一副填2包邮 20 2 19767079212
13856049592 2014-12-05 19:48:26.000 kilala可啦啦大美目大直径美瞳年抛彩色近视隐形眼镜2片包邮送盒 19.78 2 17359010576
13377918580 2014-12-12 14:05:12.000 爱漾美瞳钻石大梅花四叶草韩国进口彩色近视隐形眼镜混血2片包邮 9.9 1 40568171089
13856049592 2014-12-01 13:36:38.000 秀儿 验孕棒早早孕试纸20条+尿杯20 验孕试纸测试怀孕 测孕棒笔 9.9 1 35570005627
13764633023 2014-12-30 10:04:17.000 EYEMAY艾魅美瞳大直径美瞳年抛彩色隐形近视眼镜包邮一片装送镜盒 19.4 2 17525034357
18056000601 2014-12-06 21:51:41.000 卡乐芙蕾丝菠萝三色草莓大直径美瞳彩色隐形眼镜混血美瞳薄包邮 12 2 37961093174
13764633023 2014-12-21 15:37:50.000 卫康新视多功能近视隐形眼镜美瞳护理液 免搓洗除蛋白型355ml*3瓶 36 1 23209144069
13377918580 2014-12-28 08:23:59.000 EYEMAY艾魅美瞳大直径美瞳年抛彩色隐形近视眼镜包邮一片装送镜盒 19.4 2 17525034357
18056000601 2014-12-02 14:51:32.000 爱漾美瞳 菠萝三色 新品 彩色近视隐形眼镜自然年抛1片装2片包邮 18.8 1 42498389933
13856049592 2014-12-03 08:37:57.000 第六感避孕套92只 六合一冰火一体超薄安全套 成人情趣用品 包邮 48.9 1 24965336234
13377918580 2014-12-05 14:10:03.000 乐莎 菠萝三色 久美子日系混血美瞳年抛彩色隐形眼镜 拍2片包邮 19.98 2 17926831712
13377918580 2014-12-17 00:01:38.000 卡乐芙炫闪小布丁天使魅惑棒棒糖混血美瞳彩色隐形眼镜年抛薄包邮 19.6 2 38332450786
13532689063 2014-12-08 13:13:50.000 kilala可啦啦大美目大直径美瞳年抛彩色近视隐形眼镜2片包邮送盒 12.89 1 17359010576
13532689063 2014-12-17 19:45:00.000 kilala可啦啦大美目大直径混血美瞳年抛彩色近视隐形眼镜2片包邮 39.6 4 17359010576
13532689063 2014-11-14 20:42:14.000 kilala可啦啦大美目大直径混血美瞳年抛彩色近视隐形眼镜2片包邮 19.8 2 17359010576
13764633023 2014-11-11 22:11:25.000 秀儿 验孕棒早早孕试纸20条+尿杯20 验孕试纸测试怀孕 测孕棒笔 9.9 1 35570005627
13764633023 2014-11-12 21:28:42.000 自然美 美瞳年抛 天空之城混血小大直径彩色隐形眼镜1片装 包邮 59.8 2 18115271450
13764633023 2014-11-22 13:24:46.000 启恩 专利无烟 随身灸 纯铜艾灸盒 温灸器 5年陈艾艾柱 包邮 99 1 38644070489
13856049592 2014-11-23 01:56:53.000 瑞尔康芳姿黄红色写轮眼cosplay艺术片火影忍者美瞳彩色隐形眼镜 63 1 39814158438
13856049592 2014-10-03 08:37:57.000 第六感避孕套92只 六合一冰火一体超薄安全套 成人情趣用品 包邮 48.9 1 24965336234
3、bean对象
package com.xiaofei.mr.topN;
import org.apache.hadoop.io.WritableComparable;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
/**
 * Composite map-output key for the top-N order job.
 *
 * <p>Keys are ordered by {@code userId} ascending, then {@code dateTime}
 * ascending, then by the amount paid ({@code unitPrice * purchaseNums})
 * descending, so that within one user/date the most expensive orders come first.
 *
 * <p>Serialization order in {@link #write(DataOutput)} and
 * {@link #readFields(DataInput)} must stay identical.
 *
 * @author xxxx
 * @date 2021.07.13
 */
public class OrderBeanCop implements WritableComparable<OrderBeanCop> {
    private String userId;
    private String dateTime;
    private String title;
    private double unitPrice;
    private int purchaseNums;

    /**
     * Compares two keys by user, then date, then amount paid (largest first).
     *
     * @param o the key to compare against
     * @return negative, zero or positive as this key sorts before, equal to or after {@code o}
     */
    @Override
    public int compareTo(OrderBeanCop o) {
        int byUser = this.userId.compareTo(o.userId);
        if (byUser != 0) {
            return byUser;
        }
        int byDate = this.dateTime.compareTo(o.dateTime);
        if (byDate != 0) {
            return byDate;
        }
        // Amount paid is price times quantity for BOTH sides; the previous code
        // squared the unit price for the left-hand key.
        double thisPay = this.getUnitPrice() * this.getPurchaseNums();
        double otherPay = o.getUnitPrice() * o.getPurchaseNums();
        // Arguments swapped on purpose: larger payments sort first.
        return Double.compare(otherPay, thisPay);
    }

    /**
     * Writes all fields in a fixed order.
     *
     * @param out the sink to serialize into
     * @throws IOException if the underlying stream fails
     */
    @Override
    public void write(DataOutput out) throws IOException {
        out.writeUTF(userId);
        out.writeUTF(dateTime);
        out.writeUTF(title);
        out.writeDouble(unitPrice);
        out.writeInt(purchaseNums);
    }

    /**
     * Reads all fields in the same order {@link #write(DataOutput)} wrote them.
     *
     * @param in the source to deserialize from
     * @throws IOException if the underlying stream fails
     */
    @Override
    public void readFields(DataInput in) throws IOException {
        this.userId = in.readUTF();
        this.dateTime = in.readUTF();
        this.title = in.readUTF();
        this.unitPrice = in.readDouble();
        this.purchaseNums = in.readInt();
    }

    public String getuserId() {
        return userId;
    }

    public void setuserId(String userId) {
        this.userId = userId;
    }

    public String getDateTime() {
        return dateTime;
    }

    public void setDateTime(String dateTime) {
        this.dateTime = dateTime;
    }

    public String getTitle() {
        return title;
    }

    public void setTitle(String title) {
        this.title = title;
    }

    public double getUnitPrice() {
        return unitPrice;
    }

    public void setUnitPrice(double unitPrice) {
        this.unitPrice = unitPrice;
    }

    public int getPurchaseNums() {
        return purchaseNums;
    }

    public void setPurchaseNums(int purchaseNums) {
        this.purchaseNums = purchaseNums;
    }

    @Override
    public String toString() {
        return "OrderBeanCop{" +
                "userId='" + userId + '\'' +
                ", dateTime='" + dateTime + '\'' +
                ", title='" + title + '\'' +
                ", unitPrice=" + unitPrice +
                ", purchaseNums=" + purchaseNums +
                '}';
    }
}
4、时间处理工具类:将原始时间字符串统一格式化为 yyyy-MM-dd 形式的日期
package com.xiaofei.mr.topN;
import org.apache.commons.lang3.time.DateFormatUtils;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.Date;
/**
 * Date helper for the top-N order job.
 *
 * @author xxxx
 * @date 2021.07.15
 */
public class OrderDateUtils {
    /** Pattern used both to read the leading date and to render the result. */
    private static final String DAY_PATTERN = "yyyy-MM-dd";

    /**
     * Reduces a timestamp string to its calendar day.
     *
     * <p>Only the leading {@code yyyy-MM-dd} part is parsed; anything after it
     * (for example a time of day) is ignored.
     *
     * @param dataTime text starting with a {@code yyyy-MM-dd} date
     * @return the date rendered as {@code yyyy-MM-dd}
     * @throws IllegalArgumentException if {@code dataTime} does not start with a parseable date
     */
    public String yymmddFormat(String dataTime) {
        // SimpleDateFormat is not thread-safe, so a fresh instance is created per call.
        SimpleDateFormat dayFormat = new SimpleDateFormat(DAY_PATTERN);
        Date date;
        try {
            date = dayFormat.parse(dataTime);
        } catch (ParseException e) {
            // Fail here with the offending value instead of passing a null Date on
            // and failing later with an unrelated NullPointerException.
            throw new IllegalArgumentException("Unparseable date: " + dataTime, e);
        }
        return dayFormat.format(date);
    }
}
5、主代码编写
package com.xiaofei.mr.topN;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.*;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Partitioner;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import java.io.IOException;
/**
 * Driver for the top-N order job: for every user and day, emits the
 * {@value #TOP_N} largest payments.
 *
 * <p>The map output key ({@link OrderBeanCop}) sorts by user, day and payment
 * (largest first); {@link OrderGroup} then groups keys by user and day so that
 * one {@code reduce()} call sees that group's payments in descending order.
 *
 * <p>Usage: {@code OrderSort [<input> <output>]}. When fewer than two paths are
 * given, the built-in default paths are used.
 *
 * @author xxxx
 * @date 2021.07.15
 */
public class OrderSort extends Configured implements Tool {
    /** Input path used when none is supplied on the command line. */
    private static final String DEFAULT_INPUT = "D:\\数据资料\\hadoop数据资料\\topN\\input\\tmall-201412-test.csv";
    /** Output path used when none is supplied on the command line. */
    private static final String DEFAULT_OUTPUT = "D:\\数据资料\\hadoop数据资料\\topN\\output";
    /** Number of payments emitted per user/day group. */
    private static final int TOP_N = 2;
    /** Minimum number of tab-separated fields the mapper reads from a record. */
    private static final int MIN_FIELDS = 5;

    /**
     * Configures and runs the job.
     *
     * @param args optional {@code <input> <output>} paths
     * @return 0 on success, 1 on failure
     * @throws Exception if job setup or execution fails
     */
    @Override
    public int run(String[] args) throws Exception {
        Configuration conf = getConf();
        Job job = Job.getInstance(conf, OrderSort.class.getSimpleName());
        job.setJarByClass(OrderSort.class);

        boolean pathsGiven = args != null && args.length >= 2;
        Path inPath = new Path(pathsGiven ? args[0] : DEFAULT_INPUT);
        Path outPath = new Path(pathsGiven ? args[1] : DEFAULT_OUTPUT);

        // Resolve the filesystem from the path itself so a fully-qualified output
        // URI is handled by the right filesystem, not just the default one.
        FileSystem fileSystem = outPath.getFileSystem(conf);
        if (fileSystem.exists(outPath)) {
            // The job refuses to start if the output directory already exists.
            fileSystem.delete(outPath, true);
        }

        job.setInputFormatClass(TextInputFormat.class);
        TextInputFormat.addInputPath(job, inPath);

        job.setMapperClass(OrderMapper.class);
        job.setMapOutputKeyClass(OrderBeanCop.class);
        job.setMapOutputValueClass(DoubleWritable.class);

        job.setPartitionerClass(OrderPartition.class);
        job.setGroupingComparatorClass(OrderGroup.class);

        job.setReducerClass(OrderReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(DoubleWritable.class);

        job.setOutputFormatClass(TextOutputFormat.class);
        TextOutputFormat.setOutputPath(job, outPath);

        return job.waitForCompletion(true) ? 0 : 1;
    }

    public static void main(String[] args) throws Exception {
        int exitCode = ToolRunner.run(new Configuration(), new OrderSort(), args);
        System.exit(exitCode);
    }

    /**
     * Parses one tab-separated order record into an {@link OrderBeanCop} key and
     * the amount paid ({@code unitPrice * purchaseNums}) as the value.
     */
    public static class OrderMapper extends Mapper<LongWritable, Text, OrderBeanCop, DoubleWritable> {
        // Reused across map() calls; context.write() serializes them immediately.
        private OrderBeanCop orderBean;
        private DoubleWritable outValue;
        private OrderDateUtils dateUtils;

        @Override
        protected void setup(Context context) {
            orderBean = new OrderBeanCop();
            outValue = new DoubleWritable();
            dateUtils = new OrderDateUtils();
        }

        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            String[] splits = value.toString().split("\t");
            if (splits.length < MIN_FIELDS) {
                // Too few columns: count and skip instead of failing the task
                // with ArrayIndexOutOfBoundsException.
                context.getCounter("OrderSort", "MALFORMED_RECORDS").increment(1);
                return;
            }

            double unitPrice;
            int purchaseNums;
            try {
                unitPrice = Double.parseDouble(splits[3]);
                purchaseNums = Integer.parseInt(splits[4]);
            } catch (NumberFormatException e) {
                // Non-numeric price or quantity (e.g. a header row).
                context.getCounter("OrderSort", "MALFORMED_RECORDS").increment(1);
                return;
            }

            orderBean.setuserId(splits[0]);
            orderBean.setDateTime(dateUtils.yymmddFormat(splits[1]));
            orderBean.setTitle(splits[2]);
            orderBean.setUnitPrice(unitPrice);
            orderBean.setPurchaseNums(purchaseNums);

            outValue.set(unitPrice * purchaseNums);
            context.write(orderBean, outValue);
        }
    }

    /**
     * Partitions by user id so all records of one user reach the same reducer.
     *
     * <p>The value type must match the map output value class
     * ({@link DoubleWritable}); a mismatched type parameter makes the
     * compiler-generated bridge method throw {@code ClassCastException} as soon
     * as the partitioner is actually invoked.
     */
    public static class OrderPartition extends Partitioner<OrderBeanCop, DoubleWritable> {
        @Override
        public int getPartition(OrderBeanCop orderBeanCop, DoubleWritable value, int numPartitions) {
            String userId = orderBeanCop.getuserId();
            // Mask the sign bit so the modulo result is never negative.
            return (userId.hashCode() & Integer.MAX_VALUE) % numPartitions;
        }
    }

    /**
     * Grouping comparator: keys with the same user id and date belong to the
     * same {@code reduce()} call, regardless of the amount paid.
     */
    public static class OrderGroup extends WritableComparator {
        public OrderGroup() {
            // true: let the comparator create key instances for deserialization.
            super(OrderBeanCop.class, true);
        }

        @Override
        public int compare(WritableComparable a, WritableComparable b) {
            OrderBeanCop left = (OrderBeanCop) a;
            OrderBeanCop right = (OrderBeanCop) b;
            int byUser = left.getuserId().compareTo(right.getuserId());
            if (byUser != 0) {
                return byUser;
            }
            return left.getDateTime().compareTo(right.getDateTime());
        }
    }

    /**
     * Emits the first {@value #TOP_N} values of each user/date group, keyed by
     * {@code "<userId> <date>"}.
     */
    public static class OrderReducer extends Reducer<OrderBeanCop, DoubleWritable, Text, DoubleWritable> {
        private final Text outKey = new Text();

        @Override
        protected void reduce(OrderBeanCop key, Iterable<DoubleWritable> values, Context context) throws IOException, InterruptedException {
            int emitted = 0;
            for (DoubleWritable value : values) {
                if (emitted >= TOP_N) {
                    break;
                }
                outKey.set(key.getuserId() + " " + key.getDateTime());
                context.write(outKey, value);
                emitted++;
            }
        }
    }
}
6、运行的结果