Mapreduce之二次排序
二次排序问题的解决方案
归约器值排序至少有两种解决方案:
- 第一种方案是让归约器读取和缓存给定键的所有值,然后对这些值完成一个归约器中排序,这种方法不具有可伸缩性,因为归约器要接收一个给定键的所有值,这种方法可能导致归约器耗尽内存,如果值的数量很少,那么这个方法是适用的
- 第二种方案是使用MapReduce框架对归约器值排序,这种方法“会为自然键增加部分或整个值来创建一个组合键以实现排序目标”,这种方法是可伸缩的,不会产生内存溢出错误
二次排序小结
- 使用键值转换的设计模式
- 让MapReduce执行框架完成排序
- 保留多个键-值对的状态来完成处理,可以利用映射器输出分区器来实现这一点
实现细节
文章以天气数据集为例,按照月份对温度进行排序
天气数据集网址: 天气数据集
运行结果
中间键的排序顺序阶段
要实现二次排序,需要控制中间键的排序顺序,以及归约器处理键的顺序。
在以天气数据集为案例的分析中,先定义类 TemperaturePair,用 compareTo() 方法指出如何对 TemperaturePair 对象排序。在Hadoop中,如果需要持久存储定制数据类型,则必须实现 Writable 接口;如果要比较定制数据类型,还必须实现另外一个接口 WritableComparable
中间键的排序顺序阶段代码如下
public static class TemperaturePair implements Writable,WritableComparable<TemperaturePair>{
private String yearMonth;
private Integer temperature;
public String getYearMonth() {
return yearMonth;
}
public void setYearMonth(String yearMonth) {
this.yearMonth = yearMonth;
}
public Integer getTemperature() {
return temperature;
}
public void setTemperature(Integer temperature) {
this.temperature = temperature;
}
public int compareTo(TemperaturePair o) {
int compareValue=this.yearMonth.compareTo(o.getYearMonth());
if(compareValue==0){
compareValue=temperature.compareTo(o.getTemperature());
}
return compareValue;
}
public void write(DataOutput dataOutput) throws IOException {
Text.writeString(dataOutput,yearMonth);
dataOutput.writeInt(temperature);
}
public void readFields(DataInput dataInput) throws IOException {
this.yearMonth=Text.readString(dataInput);
this.temperature=dataInput.readInt();
}
}
Partitioner阶段任务
在这个阶段,分区器会根据映射器的输出键来决定哪个映射器输出发送到哪个归约器
Partitioner阶段编码
/**
 * Routes map output to reducers by the natural key ({@code yearMonth}) only, so
 * every temperature of one year-month reaches the same reducer.
 *
 * <p>The value type parameter is {@link IntWritable} because that is what
 * {@code TemperatureMapper} emits. Declaring a different value type makes the
 * compiler-generated bridge method cast the value and throw
 * {@link ClassCastException} when the partitioner is invoked.
 */
public static class TemperaturePartition extends Partitioner<TemperaturePair, IntWritable> {
    /**
     * @param pair               composite map output key
     * @param temperature        map output value (not used for partitioning)
     * @param numberOfPartitions number of partitions to spread keys over
     * @return partition index derived from the hash of {@code yearMonth}
     */
    @Override
    public int getPartition(TemperaturePair pair, IntWritable temperature, int numberOfPartitions) {
        // The remainder is taken before Math.abs, so its magnitude is always
        // below numberOfPartitions and Math.abs never sees Integer.MIN_VALUE.
        return Math.abs(pair.getYearMonth().hashCode() % numberOfPartitions);
    }
}
GroupingComparator阶段任务
该阶段主要控制哪些键分组到一个Reducer.reduce()函数调用
GroupingComparator阶段编码
/**
 * Grouping comparator that treats two {@code TemperaturePair} keys as equal
 * whenever their {@code yearMonth} parts are equal, ignoring the temperature.
 */
public static class TemperatureGroupingComparator extends WritableComparator {

    /** Registers {@code TemperaturePair} as the key class handled by this comparator. */
    public TemperatureGroupingComparator() {
        super(TemperaturePair.class, true);
    }

    /**
     * Compares only the {@code yearMonth} component of the two keys.
     *
     * @param wc1 first key; must be a {@code TemperaturePair}
     * @param wc2 second key; must be a {@code TemperaturePair}
     * @return result of comparing the two {@code yearMonth} strings
     */
    @Override
    public int compare(WritableComparable wc1, WritableComparable wc2) {
        final TemperaturePair left = (TemperaturePair) wc1;
        final TemperaturePair right = (TemperaturePair) wc2;
        final String leftYearMonth = left.getYearMonth();
        final String rightYearMonth = right.getYearMonth();
        return leftYearMonth.compareTo(rightYearMonth);
    }
}
Mapper阶段任务
该阶段主要是对数据进行分割,然后将值注入到归约器的键中
Mapper阶段编码如下
public static class TemperatureMapper extends Mapper<LongWritable,Text,TemperaturePair,IntWritable>{
private static final int MISSING=9999;
public void map(LongWritable key,Text value,Context context) throws IOException,InterruptedException{
String line=value.toString();
String yearMonth=line.substring(5,19)+'-'+line.substring(19,21);
String day=line.substring(21,23);
int airTemperature;
if(line.charAt(87)=='+'){
airTemperature=Integer.parseInt(line.substring(88,92));
}else{
airTemperature=Integer.parseInt(line.substring(87,92));
}
TemperaturePair reduceKey=new TemperaturePair();
reduceKey.setYearMonth(yearMonth);
reduceKey.setTemperature(airTemperature);
String quality=line.substring(92,93);
if(airTemperature!=MISSING&&quality.matches("[01459]")){
context.write(reduceKey,new IntWritable(airTemperature));
}
}
}
reduce阶段任务
归约器的主函数将值连接在一起,然后输出
reduce阶段编码如下
public static class TemperatureReducer extends Reducer<TemperaturePair,IntWritable,Text,Text>{
public void reduce(TemperaturePair key,Iterable<IntWritable> values,Context context) throws IOException,InterruptedException{
StringBuilder sortedTemperatureList=new StringBuilder();
for(IntWritable temperature:values){
sortedTemperatureList.append(temperature);
sortedTemperatureList.append(",");
}
context.write(new Text(key.getYearMonth()),new Text(sortedTemperatureList.toString()));
}
}
完整代码如下
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.*;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Partitioner;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.lang.InterruptedException;
/**
 * MapReduce job that lists, per year-month, the recorded air temperatures in
 * sorted order by letting the framework sort a composite key (secondary sort).
 */
public class SecondarySort {
    /** Input file read by the job. */
    private static final String INPUT_PATH = "input/file.txt";
    /** Directory the job writes its result to. */
    private static final String OUTPUT_PATH = "output";

    /**
     * Composite key: the natural key ({@code yearMonth}) plus the value to be
     * sorted ({@code temperature}). Ordering is by {@code yearMonth} first and
     * by {@code temperature} second; {@code equals}/{@code hashCode} cover the
     * same two fields so they stay consistent with {@code compareTo}.
     */
    public static class TemperaturePair implements Writable, WritableComparable<TemperaturePair> {
        private String yearMonth;
        private Integer temperature;

        public String getYearMonth() {
            return yearMonth;
        }

        public void setYearMonth(String yearMonth) {
            this.yearMonth = yearMonth;
        }

        public Integer getTemperature() {
            return temperature;
        }

        public void setTemperature(Integer temperature) {
            this.temperature = temperature;
        }

        /** Compares by {@code yearMonth}, breaking ties by {@code temperature}. */
        @Override
        public int compareTo(TemperaturePair o) {
            int compareValue = this.yearMonth.compareTo(o.getYearMonth());
            if (compareValue == 0) {
                compareValue = temperature.compareTo(o.getTemperature());
            }
            return compareValue;
        }

        /** Serializes {@code yearMonth} followed by {@code temperature}. */
        @Override
        public void write(DataOutput dataOutput) throws IOException {
            Text.writeString(dataOutput, yearMonth);
            dataOutput.writeInt(temperature);
        }

        /** Restores the fields in the same order {@link #write(DataOutput)} emitted them. */
        @Override
        public void readFields(DataInput dataInput) throws IOException {
            this.yearMonth = Text.readString(dataInput);
            this.temperature = dataInput.readInt();
        }

        @Override
        public boolean equals(Object obj) {
            if (this == obj) {
                return true;
            }
            if (!(obj instanceof TemperaturePair)) {
                return false;
            }
            TemperaturePair other = (TemperaturePair) obj;
            return java.util.Objects.equals(yearMonth, other.yearMonth)
                    && java.util.Objects.equals(temperature, other.temperature);
        }

        @Override
        public int hashCode() {
            return java.util.Objects.hash(yearMonth, temperature);
        }
    }

    /**
     * Routes map output by {@code yearMonth} only, so all temperatures of one
     * year-month reach the same reducer. The value type is {@link IntWritable}
     * to match what {@link TemperatureMapper} emits; a mismatched type would
     * fail with {@link ClassCastException} in the generated bridge method.
     */
    public static class TemperaturePartition extends Partitioner<TemperaturePair, IntWritable> {
        @Override
        public int getPartition(TemperaturePair pair, IntWritable temperature, int numberOfPartitions) {
            // Remainder first: its magnitude is below numberOfPartitions, so
            // Math.abs never sees Integer.MIN_VALUE.
            return Math.abs(pair.getYearMonth().hashCode() % numberOfPartitions);
        }
    }

    /**
     * Groups keys by {@code yearMonth} alone, so one {@code reduce()} call
     * receives every temperature of that year-month.
     */
    public static class TemperatureGroupingComparator extends WritableComparator {
        public TemperatureGroupingComparator() {
            super(TemperaturePair.class, true);
        }

        @Override
        public int compare(WritableComparable wc1, WritableComparable wc2) {
            TemperaturePair pair = (TemperaturePair) wc1;
            TemperaturePair pair2 = (TemperaturePair) wc2;
            return pair.getYearMonth().compareTo(pair2.getYearMonth());
        }
    }

    /**
     * Parses one fixed-width weather record per line and emits
     * {@code (yearMonth + temperature, temperature)}.
     */
    public static class TemperatureMapper extends Mapper<LongWritable, Text, TemperaturePair, IntWritable> {
        /** Sentinel temperature value that marks a missing reading. */
        private static final int MISSING = 9999;
        /** Quality flags for which a reading is emitted. */
        private static final String VALID_QUALITY_CODES = "01459";
        /** Shortest line that still contains the quality flag at index 92. */
        private static final int MIN_RECORD_LENGTH = 93;

        @Override
        public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            String line = value.toString();
            if (line.length() < MIN_RECORD_LENGTH) {
                // Blank or truncated line: count it and skip instead of failing
                // the task with StringIndexOutOfBoundsException.
                context.getCounter("TemperatureMapper", "SHORT_RECORDS").increment(1);
                return;
            }

            // Year occupies columns 15-19, directly before the month at 19-21
            // (assumes the NCDC fixed-width layout — TODO confirm against the data set).
            String yearMonth = line.substring(15, 19) + '-' + line.substring(19, 21);

            int airTemperature;
            if (line.charAt(87) == '+') {
                // Skip the explicit plus sign before parsing.
                airTemperature = Integer.parseInt(line.substring(88, 92));
            } else {
                airTemperature = Integer.parseInt(line.substring(87, 92));
            }

            char quality = line.charAt(92);
            if (airTemperature != MISSING && VALID_QUALITY_CODES.indexOf(quality) >= 0) {
                TemperaturePair reduceKey = new TemperaturePair();
                reduceKey.setYearMonth(yearMonth);
                reduceKey.setTemperature(airTemperature);
                context.write(reduceKey, new IntWritable(airTemperature));
            }
        }
    }

    /**
     * Joins the temperatures of one group into a comma-terminated list and
     * writes it under the group's {@code yearMonth}.
     */
    public static class TemperatureReducer extends Reducer<TemperaturePair, IntWritable, Text, Text> {
        @Override
        public void reduce(TemperaturePair key, Iterable<IntWritable> values, Context context)
                throws IOException, InterruptedException {
            StringBuilder sortedTemperatureList = new StringBuilder();
            for (IntWritable temperature : values) {
                sortedTemperatureList.append(temperature);
                sortedTemperatureList.append(",");
            }
            context.write(new Text(key.getYearMonth()), new Text(sortedTemperatureList.toString()));
        }
    }

    /**
     * Configures and runs the job against the fixed input and output paths.
     * Exits with status 0 when the job succeeds and 1 otherwise.
     *
     * @param args command-line arguments (not used)
     */
    public static void main(String[] args) throws Exception {
        // NOTE(review): FileUtil is not among this file's imports; presumably a
        // project-local helper that removes a previous output directory — confirm.
        FileUtil.deleteDir(OUTPUT_PATH);

        Configuration conf = new Configuration();
        // Job.getInstance replaces the deprecated Job(Configuration, String) constructor.
        Job job = Job.getInstance(conf, "Ncdsc");
        FileInputFormat.addInputPath(job, new Path(INPUT_PATH));
        FileOutputFormat.setOutputPath(job, new Path(OUTPUT_PATH));

        job.setJarByClass(SecondarySort.class);
        job.setMapperClass(TemperatureMapper.class);
        job.setPartitionerClass(TemperaturePartition.class);
        job.setGroupingComparatorClass(TemperatureGroupingComparator.class);
        job.setReducerClass(TemperatureReducer.class);

        job.setMapOutputKeyClass(TemperaturePair.class);
        job.setMapOutputValueClass(IntWritable.class);
        // The reducer is declared as Reducer<..., Text, Text>, so the final
        // output types are Text/Text rather than the map output types.
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);

        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
写在最后
二次排序的过程并不难,对数据进行分割后注入到归约器的键中,TemperaturePair.compareTo()控制键的排序顺序,TemperatureGroupingComparator.compare()控制哪些键分组到一个reduce()方法调用,这样就完成了二次排序