MapReduce-自定义Key-二次排序

最新推荐文章于 2023-03-07 23:40:20 发布

doegoo

最新推荐文章于 2023-03-07 23:40:20 发布

阅读量3.7k

点赞数 1

分类专栏： mapreduce MapReduce 文章标签： mapreduce hadoop

本文链接：https://blog.csdn.net/doegoo/article/details/50370103

版权

mapreduce 同时被 2 个专栏收录

14 篇文章 0 订阅

订阅专栏

MapReduce

13 篇文章 5 订阅

订阅专栏

这个实例紧接上一个TopK实例，解决其最后留下的一个问题，并说明一个新的技术点：如何自定义输入输出的数据类型。这里也会大致引出mapreduce中二次排序的基本思想，但不着重说明二次排序，只是大致说明自定义输入类型的基本步骤。因为我刚接触二次排序的时候陷入了一个思想上的误区，为了把这个过程记录下来，会在下一篇博客中着重说明二次排序；为了说明问题，我把它说成是“三次排序”，可参见《 MapReduce-三次排序-曾经想不通的二次排序》。

自定义Key的基本步骤：
所有自定义的key都应该实现接口WritableComparable，因为key必须是可序列化的并且可比较的。需要实现以下方法：
//反序列化，从流中的二进制转换成自定义Key
public void readFields(DataInput in) throws IOException
//序列化，将自定义Key转化成使用流传送的二进制
public void write(DataOutput out) throws IOException
//key的比较，用于map阶段和reduce阶段的排序以及用于reduce阶段的grouping分组
public int compareTo(SecondSortClass o)
另外，新定义的类还应该重写以下两个方法：
//The hashCode() method is used by the HashPartitioner (the default partitioner in MapReduce)
public int hashCode()
public boolean equals(Object right)

根据以上步骤下面是实现代码：

自定义Key：

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import org.apache.hadoop.io.WritableComparable;

/**
 * Composite MapReduce key holding an {@code int} and a {@code String}.
 *
 * <p>Both fields take part in ordering, equality and hashing. The natural
 * order is <em>descending</em>: keys with a larger {@code first} sort earlier,
 * and ties are broken by {@code second} in descending string order.
 *
 * <p>The class is mutable and has a public no-arg constructor so that an
 * instance can be created empty and then filled in by
 * {@link #readFields(DataInput)}.
 */
public class SecondSortClass implements WritableComparable<SecondSortClass> {
	/** Primary sort field; compared first. */
	private int first;
	/** Secondary sort field; only consulted when {@code first} is equal. */
	private String second;

	/** Creates an empty key whose fields are populated later via {@link #readFields(DataInput)} or the setters. */
	public SecondSortClass() {}

	/**
	 * Creates a fully initialised key.
	 *
	 * @param first  primary sort field
	 * @param second secondary sort field
	 */
	public SecondSortClass(int first, String second) {
		this.first = first;
		this.second = second;
	}

	/**
	 * Deserialises this key from the binary stream, reading the fields in the
	 * same order in which {@link #write(DataOutput)} emits them.
	 *
	 * @param input stream to read from
	 * @throws IOException if the stream cannot be read
	 */
	@Override
	public void readFields(DataInput input) throws IOException {
		this.first = input.readInt();
		this.second = input.readUTF();
	}

	/**
	 * Serialises this key to the binary stream: the int first, then the string
	 * in modified UTF-8.
	 *
	 * <p>{@code second} is passed straight to {@code writeUTF}, so it is
	 * expected to be non-null when the key is serialised.
	 *
	 * @param output stream to write to
	 * @throws IOException if the stream cannot be written
	 */
	@Override
	public void write(DataOutput output) throws IOException {
		output.writeInt(first);
		output.writeUTF(second);
	}

	/**
	 * Hash over both fields, consistent with {@link #equals(Object)}.
	 */
	@Override
	public int hashCode() {
		final int prime = 31;
		int result = 1;
		result = prime * result + first;
		result = prime * result + ((second == null) ? 0 : second.hashCode());
		return result;
	}

	/**
	 * Two keys are equal when they are of the same class and both fields match;
	 * a null {@code second} only equals another null {@code second}.
	 */
	@Override
	public boolean equals(Object obj) {
		if (this == obj) {
			return true;
		}
		if (obj == null || getClass() != obj.getClass()) {
			return false;
		}
		SecondSortClass other = (SecondSortClass) obj;
		if (first != other.first) {
			return false;
		}
		return (second == null) ? other.second == null : second.equals(other.second);
	}

	/**
	 * Orders keys descending by {@code first}, then descending by
	 * {@code second}. A null {@code second} is treated as the smallest value
	 * and therefore sorts last among keys with the same {@code first}.
	 *
	 * @param o the key to compare against
	 * @return negative, zero or positive when this key sorts before, equal to
	 *         or after {@code o}
	 */
	@Override
	public int compareTo(SecondSortClass o) {
		// Swapping the operands gives descending order. Integer.compare is used
		// instead of negating a subtraction, which overflows (and flips sign)
		// when the two values are far apart.
		int byFirst = Integer.compare(o.first, this.first);
		if (byFirst != 0) {
			return byFirst;
		}
		// Null-safe tie-break so compareTo agrees with equals(), which accepts null.
		if (this.second == null) {
			return (o.second == null) ? 0 : 1;
		}
		if (o.second == null) {
			return -1;
		}
		return o.second.compareTo(this.second);
	}

	/** @return the primary sort field */
	public int getFirst() {
		return first;
	}

	/** @param first new value of the primary sort field */
	public void setFirst(int first) {
		this.first = first;
	}

	/** @return the secondary sort field */
	public String getSecond() {
		return second;
	}

	/** @param second new value of the secondary sort field */
	public void setSecond(String second) {
		this.second = second;
	}
}

map阶段：

import java.io.IOException;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

/**
 * Parses comma-separated input lines of three fields and emits, for each valid
 * line, a {@link SecondSortClass} key built from the third field (as an int)
 * and the second field, with the text {@code "<second>,<third>"} as the value.
 *
 * <p>Blank lines, lines that do not split into exactly three fields, and lines
 * whose third field is not an integer (for example a header row such as
 * {@code uid,name,cost}) are skipped instead of failing the task.
 */
public class SecondMapper extends Mapper<LongWritable, Text, SecondSortClass, Text> {

	/**
	 * Handles one input line.
	 *
	 * @param key     input key supplied by the input format (not used here)
	 * @param value   the raw input line
	 * @param context used to emit the output pair and to count skipped records
	 * @throws IOException          if writing the output fails
	 * @throws InterruptedException if the task is interrupted while writing
	 */
	@Override
	protected void map(LongWritable key, Text value, Context context)
			throws IOException, InterruptedException {
		String line = value.toString().trim();
		if (line.isEmpty()) {
			return;
		}
		String[] fields = line.split(",");
		if (fields.length != 3) {
			return;
		}
		String name = fields[1];
		String costText = fields[2];
		int cost;
		try {
			cost = Integer.parseInt(costText);
		} catch (NumberFormatException e) {
			// A non-numeric third field (e.g. the header row) must not kill the
			// task; skip the record and make the skip visible via a counter.
			context.getCounter("SecondMapper", "NON_NUMERIC_COST").increment(1);
			return;
		}
		context.write(new SecondSortClass(cost, name), new Text(name + "," + costText));
	}
}

reduce阶段：

import java.io.IOException;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;


/**
 * Emits the first value of each of the first K key groups it receives, where K
 * is read from the job configuration entry {@code "K"} (default 10).
 *
 * <p>The remaining budget is tracked per reducer instance, so the limit
 * applies to the groups seen by this reducer only.
 */
public class SecondReducer extends Reducer<SecondSortClass, Text, NullWritable, Text> {
	/** Number of records this reducer may still emit. */
	int len;

	/**
	 * Called once when the reduce task starts; loads K from the configuration.
	 *
	 * @param context gives access to the job configuration
	 */
	@Override
	protected void setup(Context context)
			throws IOException, InterruptedException {
		len = context.getConfiguration().getInt("K", 10);
	}

	/**
	 * Writes the first value of the group while the budget lasts; later groups
	 * are ignored.
	 *
	 * @param key     the composite key of this group (not written to the output)
	 * @param values  the values grouped under {@code key}
	 * @param context used to emit the output record
	 */
	@Override
	protected void reduce(SecondSortClass key, Iterable<Text> values, Context context)
			throws IOException, InterruptedException {
		if (len > 0) {
			// Emit the NullWritable singleton rather than a raw null so the key
			// matches the declared output key type.
			context.write(NullWritable.get(), values.iterator().next());
			len--;
		}
	}
}

启动函数：

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;


/**
 * Driver that configures and submits the top-K job.
 *
 * <p>Usage: {@code JobMain <input path> <output path> <K>}
 */
public class JobMain {
	/**
	 * Validates the arguments, configures the job and waits for it to finish.
	 * Exits with 0 on success, 1 if the job fails and 2 on invalid arguments.
	 *
	 * @param args {@code args[0]} input path, {@code args[1]} output path
	 *             (deleted first if it already exists), {@code args[2]} the K value
	 * @throws Exception if job setup or submission fails
	 */
	public static void main(String[] args) throws Exception {
		if (args.length < 3) {
			System.err.println("Usage: JobMain <input path> <output path> <K>");
			System.exit(2);
			return;
		}
		// Fail fast on a malformed K instead of letting the reducer hit it later.
		final int k;
		try {
			k = Integer.parseInt(args[2].trim());
		} catch (NumberFormatException e) {
			System.err.println("K must be an integer, got: " + args[2]);
			System.exit(2);
			return;
		}

		Configuration configuration = new Configuration();
		// Values placed in the Configuration are readable from the map and
		// reduce tasks; this is one way to hand parameters to a job.
		configuration.setInt("K", k);

		Job job = Job.getInstance(configuration, "third-sort-job");
		job.setJarByClass(JobMain.class);
		job.setMapperClass(SecondMapper.class);
		job.setMapOutputKeyClass(SecondSortClass.class);
		job.setMapOutputValueClass(Text.class);
		job.setReducerClass(SecondReducer.class);
		// The reducer counts down K per task, so a global top-K needs one reducer.
		job.setNumReduceTasks(1);
		// Matches the reducer's declared output types.
		job.setOutputKeyClass(NullWritable.class);
		job.setOutputValueClass(Text.class);

		FileInputFormat.addInputPath(job, new Path(args[0]));
		Path outputDir = new Path(args[1]);
		// Resolve the filesystem from the path itself so a fully qualified
		// output URI on a non-default filesystem is handled too.
		FileSystem fs = outputDir.getFileSystem(configuration);
		if (fs.exists(outputDir)) {
			fs.delete(outputDir, true);
		}
		FileOutputFormat.setOutputPath(job, outputDir);
		System.exit(job.waitForCompletion(true) ? 0 : 1);
	}
}

运行命令：

./hadoop jar mr.jar com.seven.mapreduce.test1.JobMain /input/two /output/two14 3

运行数据：

uid,name,cost
1,mr1,3234
2,mr2,123
3,mr3,9877
4,mr4,348
5,mr5,12345
6,mr6,6646
7,mr7,98
8,mr8,12345

运行结果：