MapReduce SecondarySort

最新推荐文章于 2024-10-09 15:12:25 发布
王俊杰MSE
最新推荐文章于 2024-10-09 15:12:25 发布
阅读量537
点赞数
分类专栏： hadoop 文章标签： mapreduce hadoop
本文链接：https://blog.csdn.net/dang_wang/article/details/16349105
版权
hadoop 专栏收录该内容
2 篇文章 0 订阅
订阅专栏
package wjj;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.util.StringTokenizer;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
 
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Partitioner;
import org.apache.hadoop.mapreduce.Reducer;

public class SecondarySort {
	
	
	/**
	 * Composite map-output key holding two ints: {@code first} is the primary
	 * sort field and {@code second} is the secondary sort field.
	 *
	 * <p>Hadoop keys must be serializable and comparable, which is what
	 * {@code WritableComparable} provides. {@link #compareTo(IntPair)} orders
	 * ascending by {@code first} and breaks ties ascending by {@code second}.
	 *
	 * <p>The class deliberately keeps its implicit public no-argument
	 * constructor and is mutable through {@link #set(int, int)} so that a single
	 * instance can be reused and refilled via {@link #readFields(DataInput)}.
	 */
	public static class IntPair implements WritableComparable<IntPair> {

		int first;
		int second;

		/** Sets both fields at once. */
		public void set(int left, int right) {
			first = left;
			second = right;
		}

		public int getFirst() {
			return first;
		}

		public void setFirst(int first) {
			this.first = first;
		}

		public int getSecond() {
			return second;
		}

		public void setSecond(int second) {
			this.second = second;
		}

		/** Deserializes this pair: reads {@code first} then {@code second} as 4-byte ints. */
		@Override
		public void readFields(DataInput in) throws IOException {
			first = in.readInt();
			second = in.readInt();
		}

		/** Serializes this pair: writes {@code first} then {@code second} as 4-byte ints. */
		@Override
		public void write(DataOutput out) throws IOException {
			// Must be writeInt, not write(int): DataOutput.write(int) emits only the
			// low-order byte, which readInt() in readFields cannot read back.
			out.writeInt(first);
			out.writeInt(second);
		}

		/** Orders by {@code first} ascending, then by {@code second} ascending. */
		@Override
		public int compareTo(IntPair o) {
			int byFirst = Integer.compare(first, o.first);
			return byFirst != 0 ? byFirst : Integer.compare(second, o.second);
		}

		/** Two pairs are equal when both fields match; consistent with {@link #compareTo(IntPair)}. */
		@Override
		public boolean equals(Object obj) {
			if (this == obj) {
				return true;
			}
			if (!(obj instanceof IntPair)) {
				return false;
			}
			IntPair other = (IntPair) obj;
			return first == other.first && second == other.second;
		}

		@Override
		public int hashCode() {
			return 31 * first + second;
		}

		@Override
		public String toString() {
			return first + "\t" + second;
		}
	}
	/**
	 * Partitions map output by the {@code first} field of the key only.
	 *
	 * <p>Defining the composite key is not enough on its own: a partitioner that
	 * looks at the whole key could send keys that share {@code first} but differ
	 * in {@code second} to different reducers. This partitioner ignores
	 * {@code second}, so every key with the same {@code first} lands on the same
	 * reducer.
	 */
	public static class FirstPartitioner extends Partitioner<IntPair, IntWritable> {

		/**
		 * @param key   composite key; only {@code key.getFirst()} is used
		 * @param value ignored
		 * @param num   number of partitions
		 * @return a partition index in {@code [0, num)}
		 */
		@Override
		public int getPartition(IntPair key, IntWritable value, int num) {
			// Clear the sign bit instead of calling Math.abs: the multiplication can
			// overflow to Integer.MIN_VALUE (e.g. first == Integer.MIN_VALUE), and
			// Math.abs(Integer.MIN_VALUE) is still negative, which would produce a
			// negative partition index.
			return ((key.getFirst() * 127) & Integer.MAX_VALUE) % num;
		}
	}
	/*
	 * 而我们希望first相同的key中，只获取第一个的second即可，
	 * 其他数据可以忽略。这就需要数据执行reduce前按照key的first字段
	 * 进行归并，即grouping。first相同的key归为一个group，
	 * 将第一个key和所有的value传给reduce()方法。
	 * 然后reduce将key输出即可实现目的。
	 * 为了实现这样的grouping操纵，需要自定义归并比较器
	 * （ValueGroupingComparator）， 
	 * */
	
	/*//第一种方法，实现接口RawComparator 
    public static class GroupingComparator implements RawComparator<IntPair> { 
        @Override 
        public int compare(IntPair o1, IntPair o2) { 
            int l = o1.getFirst(); 
            int r = o2.getFirst(); 
            return l == r ? 0 : (l < r ? -1 : 1); 
        } 
        @Override 
        //一个字节一个字节的比，直到找到一个不相同的字节，然后比这个字节的大小作为两个字节流的大小比较结果。 
        public int compare(byte[] b1, int s1, int l1, byte[] b2, int s2, int l2){ 
            // TODO Auto-generated method stub 
             return WritableComparator.compareBytes(b1, s1, Integer.SIZE/8,  
                     b2, s2, Integer.SIZE/8); 
        } 
    }*/  
	 /**
	  * Grouping comparator that treats two {@code IntPair} keys as equal when
	  * their {@code first} fields are equal, ignoring {@code second}.
	  */
	 public static class GroupingComparator extends WritableComparator {

		protected GroupingComparator() {
			// Second argument (createInstances) asks the base class to instantiate
			// IntPair keys so the object-based compare below can be used.
			super(IntPair.class, true);
		}

		/**
		 * Compares two keys by {@code first} only.
		 *
		 * @return -1, 0 or 1 when the first key's {@code first} is less than,
		 *         equal to or greater than the second key's
		 */
		@Override
		public int compare(WritableComparable w1, WritableComparable w2) {
			final IntPair lhs = (IntPair) w1;
			final IntPair rhs = (IntPair) w2;
			final int a = lhs.getFirst();
			final int b = rhs.getFirst();
			if (a < b) {
				return -1;
			}
			if (a > b) {
				return 1;
			}
			return 0;
		}
	 }

	
	
	public static class Map extends Mapper<LongWritable,Text,IntPair,IntWritable>
	{
		private final IntPair intkey=new IntPair();
		private final IntWritable intvalue=new IntWritable();
		
		public void map(LongWritable key,Text value,Context context) throws IOException, InterruptedException
		{
			String line=value.toString();
			StringTokenizer tokenizer=new StringTokenizer(line);
			int left=0;
			int right=0;
			if(tokenizer.hasMoreTokens())
			{
				left=Integer.parseInt(tokenizer.nextToken());
				if(tokenizer.hasMoreTokens())
				{
					right=Integer.parseInt(tokenizer.nextToken());
				}
				intkey.set(left, right);
				intvalue.set(right);
				context.write(intkey, intvalue);
			}
		}
	}
	
	
	public static class Reduce extends Reducer<IntPair,IntWritable,Text,IntWritable>
	{
		private final Text left=new Text();
		private static  final Text SEPARATOR=new Text("--------------------");
		public void reduce(IntPair key,Iterable<IntWritable>values,Context context) throws IOException, InterruptedException
		{
			context.write(SEPARATOR, null);
			left.set(Integer.toString(key.getFirst()));
			for(IntWritable val:values)
			{
				context.write(left, val);
			}
		}
	}
	
	/**
	 * Configures and runs the secondary-sort job.
	 *
	 * @param args {@code args[0]} is the input path, {@code args[1]} the output
	 *             path; any further arguments are ignored
	 * @throws Exception if job setup or execution fails
	 */
	public static void main(String[] args) throws Exception {
		if (args.length < 2) {
			System.err.println("Usage: SecondarySort <input path> <output path>");
			System.exit(2);
		}

		Configuration conf = new Configuration();
		// Job.getInstance replaces the deprecated Job(Configuration, String) constructor.
		Job job = Job.getInstance(conf, "secondarysort");
		job.setJarByClass(SecondarySort.class);

		job.setMapperClass(Map.class);
		// No combiner: Reduce emits <Text, IntWritable>, which does not match the
		// <IntPair, IntWritable> records a combiner would have to produce.
		job.setReducerClass(Reduce.class);

		// Partition and group on IntPair.first only.
		job.setPartitionerClass(FirstPartitioner.class);
		job.setGroupingComparatorClass(GroupingComparator.class);

		// Map output types differ from the final output key type, so state them explicitly.
		job.setMapOutputKeyClass(IntPair.class);
		job.setMapOutputValueClass(IntWritable.class);
		job.setOutputKeyClass(Text.class);
		job.setOutputValueClass(IntWritable.class);

		job.setInputFormatClass(TextInputFormat.class);
		job.setOutputFormatClass(TextOutputFormat.class);

		FileInputFormat.setInputPaths(job, new Path(args[0]));
		FileOutputFormat.setOutputPath(job, new Path(args[1]));

		System.exit(job.waitForCompletion(true) ? 0 : 1);
	}
	
}