MapReduce Programming Mini-Case 6: An Efficient Implementation of Grouped Top-N
Requirement: given the following order detail records, in the format orderId, userId, product name, unit price, quantity:
order001,u001,小米6,1999.9,2
order001,u001,雀巢咖啡,99.0,2
order001,u001,安慕希,250.0,2
order001,u001,经典红双喜,200.0,4
order001,u001,防水电脑包,400.0,2
order002,u002,小米手环,199.0,3
order002,u002,榴莲,15.0,10
order002,u002,苹果,4.5,20
order002,u002,肥皂,10.0,40
order003,u001,小米6,1999.9,2
order003,u001,雀巢咖啡,99.0,2
order003,u001,安慕希,250.0,2
order003,u001,经典红双喜,200.0,4
order003,u001,防水电脑包,400.0,2
The job should produce the following output, where the appended last column is the line total (unit price × quantity):
order001,u001,小米6,1999.9,2,3999.8
order001,u001,防水电脑包,400.0,2,800.0
order001,u001,经典红双喜,200.0,4,800.0
order003,u001,小米6,1999.9,2,3999.8
order003,u001,经典红双喜,200.0,4,800.0
order003,u001,防水电脑包,400.0,2,800.0
order002,u002,小米手环,199.0,3,597.0
order002,u002,肥皂,10.0,40,400.0
order002,u002,榴莲,15.0,10,150.0
That is: group the rows that share an orderId, and within each group output the three rows with the highest line total.
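Before the MapReduce version, it helps to pin the semantics down. The following single-process Java sketch (an illustrative sketch, not part of the original code; it reads the input file passed as args[0]) performs exactly the grouping, descending sort by line total, and truncation to three rows described above:

import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.Comparator;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;

public class LocalTopn {
    public static void main(String[] args) throws Exception {
        // Group the CSV lines by orderId (field 0).
        Map<String, List<String[]>> byOrder = Files.lines(Paths.get(args[0]))
                .map(line -> line.split(","))
                .collect(Collectors.groupingBy(f -> f[0]));
        // Within each group, sort by line total (price * number) descending and keep 3.
        for (List<String[]> group : byOrder.values()) {
            group.stream()
                 .sorted(Comparator.comparingDouble(
                         (String[] f) -> Float.parseFloat(f[3]) * Integer.parseInt(f[4])).reversed())
                 .limit(3)
                 .forEach(f -> System.out.println(String.join(",", f) + ","
                         + Float.parseFloat(f[3]) * Integer.parseInt(f[4])));
        }
    }
}

For realistic data volumes this single-machine version does not scale, which is what the MapReduce implementation below addresses.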
The efficient MapReduce implementation is a secondary sort. OrderBean serves as the map output key and sorts by orderId first, then by descending amountFee; OrderIdPartitioner routes every record of one orderId to the same reduce task; and OrderIdGroupingComparator makes all keys that share an orderId arrive in a single reduce() call, already sorted by spend. The reducer then only has to emit the first N values it is handed. The code is as follows:
1. The OrderBean class
package cn.edu360.mr.order.topn.grouping;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

import org.apache.hadoop.io.WritableComparable;

public class OrderBean implements WritableComparable<OrderBean> {

    private String orderId;
    private String userId;
    private String pdtName;
    private float price;
    private int number;
    private float amountFee;

    public void set(String orderId, String userId, String pdtName, float price, int number) {
        this.orderId = orderId;
        this.userId = userId;
        this.pdtName = pdtName;
        this.price = price;
        this.number = number;
        this.amountFee = price * number;
    }

    public String getOrderId() {
        return orderId;
    }

    public void setOrderId(String orderId) {
        this.orderId = orderId;
    }

    public String getUserId() {
        return userId;
    }

    public void setUserId(String userId) {
        this.userId = userId;
    }

    public String getPdtName() {
        return pdtName;
    }

    public void setPdtName(String pdtName) {
        this.pdtName = pdtName;
    }

    public float getPrice() {
        return price;
    }

    public void setPrice(float price) {
        this.price = price;
    }

    public int getNumber() {
        return number;
    }

    public void setNumber(int number) {
        this.number = number;
    }

    public float getAmountFee() {
        return amountFee;
    }

    public void setAmountFee(float amountFee) {
        this.amountFee = amountFee;
    }

    @Override
    public String toString() {
        return this.orderId + "," + this.userId + "," + this.pdtName + "," + this.price + "," + this.number + "," + this.amountFee;
    }

    // amountFee is derived from price * number, so write() does not serialize it;
    // readFields() recomputes it after the stored fields have been deserialized.
    public void readFields(DataInput in) throws IOException {
        this.orderId = in.readUTF();
        this.userId = in.readUTF();
        this.pdtName = in.readUTF();
        this.price = in.readFloat();
        this.number = in.readInt();
        this.amountFee = this.price * this.number;
    }

    public void write(DataOutput out) throws IOException {
        out.writeUTF(this.orderId);
        out.writeUTF(this.userId);
        out.writeUTF(this.pdtName);
        out.writeFloat(this.price);
        out.writeInt(this.number);
    }

    // Sort by orderId first; within the same order, by descending amountFee,
    // so the most expensive lines of an order come out of the shuffle first.
    public int compareTo(OrderBean o) {
        int byOrder = this.orderId.compareTo(o.getOrderId());
        return byOrder == 0 ? Float.compare(o.getAmountFee(), this.getAmountFee()) : byOrder;
    }
}
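As a quick sanity check on the sort order (a hypothetical snippet, assuming the OrderBean class above is on the classpath), compareTo ranks lines of the same order by descending amountFee, and different orders lexicographically by orderId:

package cn.edu360.mr.order.topn.grouping;

public class OrderBeanSortDemo {
    public static void main(String[] args) {
        OrderBean a = new OrderBean();
        a.set("order001", "u001", "小米6", 1999.9f, 2);  // amountFee = 3999.8
        OrderBean b = new OrderBean();
        b.set("order001", "u001", "雀巢咖啡", 99.0f, 2); // amountFee = 198.0
        OrderBean c = new OrderBean();
        c.set("order002", "u002", "苹果", 4.5f, 20);

        System.out.println(a.compareTo(b)); // negative: same order, larger amountFee sorts first
        System.out.println(a.compareTo(c)); // negative: "order001" < "order002"
    }
}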
2. The OrderIdPartitioner class
package cn.edu360.mr.order.topn.grouping;

import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.Partitioner;

public class OrderIdPartitioner extends Partitioner<OrderBean, NullWritable> {

    @Override
    public int getPartition(OrderBean key, NullWritable value, int numPartitions) {
        // Partition by orderId alone (not the whole bean) so that all records of
        // one order reach the same reduce task; the & Integer.MAX_VALUE mask
        // clears the sign bit so a negative hashCode cannot yield a negative index.
        return (key.getOrderId().hashCode() & Integer.MAX_VALUE) % numPartitions;
    }
}
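The partition arithmetic can be tried out in isolation (a hypothetical demo class; the concrete partition numbers simply follow from String.hashCode):

public class PartitionDemo {
    public static void main(String[] args) {
        int numPartitions = 2;
        for (String orderId : new String[] { "order001", "order002", "order003" }) {
            // Same formula as OrderIdPartitioner.getPartition
            int p = (orderId.hashCode() & Integer.MAX_VALUE) % numPartitions;
            System.out.println(orderId + " -> partition " + p);
        }
    }
}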
3. The OrderIdGroupingComparator class
package cn.edu360.mr.order.topn.grouping;

import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;

public class OrderIdGroupingComparator extends WritableComparator {

    public OrderIdGroupingComparator() {
        // true: let WritableComparator create OrderBean instances for
        // deserializing the keys it is asked to compare.
        super(OrderBean.class, true);
    }

    @Override
    public int compare(WritableComparable a, WritableComparable b) {
        // Compare by orderId only: keys that differ in amountFee but share an
        // orderId are treated as equal, so they fall into the same reduce() call.
        OrderBean o1 = (OrderBean) a;
        OrderBean o2 = (OrderBean) b;
        return o1.getOrderId().compareTo(o2.getOrderId());
    }
}
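The grouping contract can also be exercised directly (a hypothetical demo, assuming the classes above are compiled with the Hadoop libraries on the classpath); a return value of 0 means "same reduce group":

package cn.edu360.mr.order.topn.grouping;

public class GroupingDemo {
    public static void main(String[] args) {
        OrderIdGroupingComparator cmp = new OrderIdGroupingComparator();
        OrderBean a = new OrderBean();
        a.set("order001", "u001", "小米6", 1999.9f, 2);
        OrderBean b = new OrderBean();
        b.set("order001", "u001", "防水电脑包", 400.0f, 2);
        OrderBean c = new OrderBean();
        c.set("order002", "u002", "榴莲", 15.0f, 10);

        System.out.println(cmp.compare(a, b)); // 0: same orderId, one reduce group
        System.out.println(cmp.compare(a, c)); // non-zero: different groups
    }
}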
4. Finally, the MapReduce driver class:
package cn.edu360.mr.order.topn.grouping;
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
public class OrderTopn {

    public static class OrderTopnMapper extends Mapper<LongWritable, Text, OrderBean, NullWritable> {

        OrderBean orderBean = new OrderBean();
        NullWritable v = NullWritable.get();

        @Override
        protected void map(LongWritable key, Text value,
                Mapper<LongWritable, Text, OrderBean, NullWritable>.Context context)
                throws IOException, InterruptedException {
            // Input line: orderId,userId,pdtName,price,number
            String[] fields = value.toString().split(",");
            orderBean.set(fields[0], fields[1], fields[2],
                    Float.parseFloat(fields[3]), Integer.parseInt(fields[4]));
            context.write(orderBean, v);
        }
    }
    public static class OrderTopnReducer extends Reducer<OrderBean, NullWritable, OrderBean, NullWritable> {

        private int topn;

        @Override
        protected void setup(Reducer<OrderBean, NullWritable, OrderBean, NullWritable>.Context context)
                throws IOException, InterruptedException {
            // Read the N of "top N" from the job configuration (default 3).
            topn = context.getConfiguration().getInt("order.top.n", 3);
        }

        /*
         * Although reduce() receives a single key object, its field values change on every
         * iteration of the values iterator: the framework deserializes the next record into
         * the same OrderBean instance. Because of the grouping comparator, one reduce() call
         * sees every record of one orderId, already sorted by descending amountFee, so the
         * top N are simply the first N iterations.
         */
        @Override
        protected void reduce(OrderBean key, Iterable<NullWritable> values,
                Reducer<OrderBean, NullWritable, OrderBean, NullWritable>.Context context)
                throws IOException, InterruptedException {
            int i = 0;
            for (NullWritable v : values) {
                context.write(key, v);
                if (++i == topn) return;
            }
        }
    }
    public static void main(String[] args) throws Exception {

        Configuration conf = new Configuration(); // by default only core-default.xml and core-site.xml are loaded
        conf.setInt("order.top.n", 3); // the N read in the reducer's setup(); 3 matches the expected output above

        Job job = Job.getInstance(conf);

        job.setJarByClass(OrderTopn.class);
        job.setMapperClass(OrderTopnMapper.class);
        job.setReducerClass(OrderTopnReducer.class);

        job.setPartitionerClass(OrderIdPartitioner.class);
        job.setGroupingComparatorClass(OrderIdGroupingComparator.class);
        job.setNumReduceTasks(2);

        job.setMapOutputKeyClass(OrderBean.class);
        job.setMapOutputValueClass(NullWritable.class);
        job.setOutputKeyClass(OrderBean.class);
        job.setOutputValueClass(NullWritable.class);

        FileInputFormat.setInputPaths(job, new Path("F:\\mrdata\\order\\input"));
        FileOutputFormat.setOutputPath(job, new Path("F:\\mrdata\\order\\out2"));

        job.waitForCompletion(true);
    }
}
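If the top-N value and the paths should come from the command line instead of being hard-coded, main() could be adapted along these lines (a sketch, assuming a launch such as hadoop jar order-topn.jar cn.edu360.mr.order.topn.grouping.OrderTopn -D order.top.n=3 <input> <output>; it additionally needs import org.apache.hadoop.util.GenericOptionsParser):

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        // GenericOptionsParser moves -D key=value pairs into conf and
        // returns the remaining positional arguments (input and output paths).
        String[] rest = new GenericOptionsParser(conf, args).getRemainingArgs();

        Job job = Job.getInstance(conf);
        job.setJarByClass(OrderTopn.class);
        job.setMapperClass(OrderTopnMapper.class);
        job.setReducerClass(OrderTopnReducer.class);
        job.setPartitionerClass(OrderIdPartitioner.class);
        job.setGroupingComparatorClass(OrderIdGroupingComparator.class);
        job.setNumReduceTasks(2);
        job.setMapOutputKeyClass(OrderBean.class);
        job.setMapOutputValueClass(NullWritable.class);
        job.setOutputKeyClass(OrderBean.class);
        job.setOutputValueClass(NullWritable.class);
        FileInputFormat.setInputPaths(job, new Path(rest[0]));
        FileOutputFormat.setOutputPath(job, new Path(rest[1]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }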