package example;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Partitioner;
import org.apache.hadoop.mapreduce.lib.partition.HashPartitioner;
public class SecondSort {
    // Secondary sort over temperature records (key: year, value: temperature):
    // sort by year ascending and, within a year, temperature descending.
    // Hadoop only sorts by key and only guarantees per-reducer key order, so:
    // 1. Combine year + temperature into a composite key with a custom sort order,
    //    which keeps each partition internally ordered. Because the composite key
    //    differs per temperature, records for the same year are NOT guaranteed to
    //    reach the same reducer unless a custom partitioner groups them by year.
    // 2. For a fully globally-sorted output, set the number of reducers to 1.

    // Composite key. Must be a *static* nested class with a no-arg constructor:
    // Hadoop instantiates key types reflectively during deserialization and
    // cannot construct a non-static inner class.
    public static class CombineKey implements WritableComparable<CombineKey> {
        private Text year;
        private IntWritable temperature;

        // Required by Hadoop's reflection-based deserialization; fields must be
        // non-null before readFields() is invoked on them.
        public CombineKey() {
            this.year = new Text();
            this.temperature = new IntWritable();
        }

        public CombineKey(Text year, IntWritable temperature) {
            this.year = year;
            this.temperature = temperature;
        }

        @Override
        public void readFields(DataInput in) throws IOException {
            year.readFields(in);
            temperature.readFields(in);
        }

        @Override
        public void write(DataOutput out) throws IOException {
            year.write(out);
            temperature.write(out);
        }

        // Year ascending, then temperature descending, matching the requirement
        // stated above. Integer.compare avoids the overflow risk of subtracting
        // raw ints; the descending order comes from swapping the operands.
        @Override
        public int compareTo(CombineKey other) {
            int byYear = year.compareTo(other.year);
            if (byYear != 0) {
                return byYear;
            }
            return Integer.compare(other.temperature.get(), temperature.get());
        }

        // equals/hashCode kept consistent with compareTo so grouping and hash
        // partitioning behave predictably for key types.
        @Override
        public boolean equals(Object o) {
            if (this == o) {
                return true;
            }
            if (!(o instanceof CombineKey)) {
                return false;
            }
            CombineKey that = (CombineKey) o;
            return year.equals(that.year) && temperature.equals(that.temperature);
        }

        @Override
        public int hashCode() {
            return 31 * year.hashCode() + temperature.hashCode();
        }

        public Text getYear() {
            return year;
        }
    }

    // Partitions by year only, so every record of a given year reaches the same
    // reducer despite the composite key.
    public static class KeyPartioner extends Partitioner<CombineKey, IntWritable> {
        // Parameterized and hoisted: avoids a raw type and a new allocation per call.
        private final HashPartitioner<Text, IntWritable> delegate =
                new HashPartitioner<Text, IntWritable>();

        @Override
        public int getPartition(CombineKey key, IntWritable value, int numPartitions) {
            return delegate.getPartition(key.getYear(), value, numPartitions);
        }
    }

    public static void main(String[] args)
            throws IOException, ClassNotFoundException, InterruptedException {
        // Mapper/reducer classes, input/output formats, and paths are omitted here.
        Job job = Job.getInstance(); // new Job() is deprecated in the mapreduce API
        job.setJarByClass(SecondSort.class);
        job.setMapOutputKeyClass(CombineKey.class);
        job.setPartitionerClass(KeyPartioner.class);
        job.waitForCompletion(true);
    }
}
// Hadoop secondary sort (二次排序) example — originally published 2020-09-29.