package example;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Partitioner;
import org.apache.hadoop.mapreduce.lib.partition.HashPartitioner;
public class SecondSort {
    // Secondary sort over temperature records (key: year, value: temperature):
    // sort by year ascending and, within a year, temperature descending.
    // Hadoop only sorts by key and only guarantees per-reducer key order, so:
    // 1. Combine year + temperature into a composite key with a custom sort order,
    //    which keeps each partition internally ordered. Because the composite key
    //    differs per temperature, records for the same year are NOT guaranteed to
    //    reach the same reducer unless a custom partitioner groups them by year.
    // 2. For a fully globally-sorted output, set the number of reducers to 1.

    // Composite key. Must be a *static* nested class with a no-arg constructor:
    // Hadoop instantiates key types reflectively during deserialization and
    // cannot construct a non-static inner class.
    public static class CombineKey implements WritableComparable<CombineKey> {
        private Text year;
        private IntWritable temperature;

        // Required by Hadoop's reflection-based deserialization; fields must be
        // non-null before readFields() is invoked on them.
        public CombineKey() {
            this.year = new Text();
            this.temperature = new IntWritable();
        }

        public CombineKey(Text year, IntWritable temperature) {
            this.year = year;
            this.temperature = temperature;
        }

        @Override
        public void readFields(DataInput in) throws IOException {
            year.readFields(in);
            temperature.readFields(in);
        }

        @Override
        public void write(DataOutput out) throws IOException {
            year.write(out);
            temperature.write(out);
        }

        // Year ascending, then temperature descending, matching the requirement
        // stated above. Integer.compare avoids the overflow risk of subtracting
        // raw ints; the descending order comes from swapping the operands.
        @Override
        public int compareTo(CombineKey other) {
            int byYear = year.compareTo(other.year);
            if (byYear != 0) {
                return byYear;
            }
            return Integer.compare(other.temperature.get(), temperature.get());
        }

        // equals/hashCode kept consistent with compareTo so grouping and hash
        // partitioning behave predictably for key types.
        @Override
        public boolean equals(Object o) {
            if (this == o) {
                return true;
            }
            if (!(o instanceof CombineKey)) {
                return false;
            }
            CombineKey that = (CombineKey) o;
            return year.equals(that.year) && temperature.equals(that.temperature);
        }

        @Override
        public int hashCode() {
            return 31 * year.hashCode() + temperature.hashCode();
        }

        public Text getYear() {
            return year;
        }
    }

    // Partitions by year only, so every record of a given year reaches the same
    // reducer despite the composite key.
    public static class KeyPartioner extends Partitioner<CombineKey, IntWritable> {
        // Parameterized and hoisted: avoids a raw type and a new allocation per call.
        private final HashPartitioner<Text, IntWritable> delegate =
                new HashPartitioner<Text, IntWritable>();

        @Override
        public int getPartition(CombineKey key, IntWritable value, int numPartitions) {
            return delegate.getPartition(key.getYear(), value, numPartitions);
        }
    }

    public static void main(String[] args)
            throws IOException, ClassNotFoundException, InterruptedException {
        // Mapper/reducer classes, input/output formats, and paths are omitted here.
        Job job = Job.getInstance(); // new Job() is deprecated in the mapreduce API
        job.setJarByClass(SecondSort.class);
        job.setMapOutputKeyClass(CombineKey.class);
        job.setPartitionerClass(KeyPartioner.class);
        job.waitForCompletion(true);
    }
}
// Hadoop secondary sort (二次排序) example — originally published 2020-09-29.