mapreduce 二次排序的两种方法

最新推荐文章于 2023-11-09 21:48:15 发布

zzd0007

最新推荐文章于 2023-11-09 21:48:15 发布

阅读量833

点赞数

分类专栏： hadoop 文章标签： mapreduce

本文链接：https://blog.csdn.net/qq_32563713/article/details/75632447

版权

hadoop 专栏收录该内容

10 篇文章 0 订阅

订阅专栏

什么是二次排序？

现在有一个文件secondrysort_in.txt,

20 21
50 51
50 52
50 53
60 51
60 53
60 52
70 55
7 8

现在要对该文件进行排序：先按照第一个数从小到大排列；如果第一个数相同，则按照第二个数从小到大排列。

输出文件secondrysort_out

7   8
20   21
50   51
50   52
50   53
60   51
60   52
60   53
70   55

方法一：

这是网上比较常见的方法：自定义map的输出类型，在输出类型的类中指定好排序规则。

package SecondrySort_examples;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.util.StringTokenizer;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.RawComparator;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;

import SecondrySort_examples.SecondrySort.SecondrySortMapper;
import SecondrySort_examples.SecondrySort.SecondrySortReducer;

//二次排序
public class SecondrySort1 {
	/**
	 * Configures and submits the secondary-sort job (method 1: composite key).
	 *
	 * <p>Hadoop generic options (for example {@code -D key=value}) are stripped from
	 * {@code args} first. The remaining arguments are the input and output paths; when
	 * none are given, {@code /secondrysort_in} and {@code /secondrysort1_out} are used.
	 * An existing output directory is deleted recursively before the job starts.
	 *
	 * @param args optional generic Hadoop options followed by {@code <in> <out>}
	 * @throws IOException if option parsing, file-system access or job submission fails
	 * @throws ClassNotFoundException if a job class cannot be loaded
	 * @throws InterruptedException if waiting for the job is interrupted
	 */
	public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
		Configuration conf=new Configuration();
		// NOTE(review): this key looks like a typo for "mapred.job.tracker"; as spelled it is
		// presumably ignored. Left untouched because correcting it would point the job at
		// localhost:9001 instead of whatever runner is used today -- confirm before changing.
		conf.set("mapred.jpb.tracker", "localhost:9001");
		// Parse the real command line so that generic options and explicit paths take effect.
		String[] remaining=new GenericOptionsParser(conf, args).getRemainingArgs();
		String[] arg=remaining.length==0
				?new String[]{"/secondrysort_in","/secondrysort1_out"}
				:remaining;
		if(arg.length!=2){
			System.err.println("in out file error!");
			System.exit(1);
		}
		// Remove a previous run's output; the job refuses to start if the directory exists.
		FileSystem fs=FileSystem.get(conf);
		fs.delete(new Path(arg[1]), true);
		
		Job job=new Job(conf,"SecondrySort1_job");
		job.setJarByClass(SecondrySort1.class);
		job.setMapperClass(MyMapper.class);
		job.setReducerClass(MyReducer.class);
		
		// No grouping comparator is installed, so reduce-side grouping uses the same ordering
		// as sorting: every distinct (first, second) key forms its own reduce group.
		// Uncomment to group all keys sharing the same first column into one reduce call:
		//job.setGroupingComparatorClass(GroupingComparator.class);
		
		job.setMapOutputKeyClass(IntPair.class);
		job.setMapOutputValueClass(IntWritable.class);
		
		job.setOutputKeyClass(Text.class);
		job.setOutputValueClass(IntWritable.class);
		
		// No sort comparator is set via job.setSortComparatorClass(), so map output keys are
		// ordered by IntPair.compareTo() within each partition.
		
		FileInputFormat.addInputPath(job, new Path(arg[0]));
		FileOutputFormat.setOutputPath(job, new Path(arg[1]));
		System.exit(job.waitForCompletion(true)?0:1);
	}
	
	
	/**
	 * 创建新主键类（也就是自定义一种输入输出类型），把第一列整数和第二列整数作为类的属性，并实现WritableComparable接口
	 * 如Text封装了String类型，IntWritable封装了int类型，LongWritable封装了long类型
	 * @author zzd
	 *MR中所有的key和value都要实现Writable接口，用于读（反序列化）和写（序列化）、传输、排序等
	 *在MapReduce的过程中，需要对key进行排序，而key也需要在网络流中传输，因此需要实现WritableComparable，因此key实现了Writable, Comparable两个接口。
	 */
	public static class IntPair implements WritableComparable<IntPair>{
		private int first=0;
		private int second=0;
		
		//get,set方法
		public void set(int left,int right){
			first=left;
			second=right;
		}
		public int getFirst(){
			return first;
		}
		public int getSecond(){
			return second;
		}
		
		//@Override实现接口中方法不用加，重写父类方法时可以加，jdk1.6之后无论是实现接口还是重写方法都可以加
		public void readFields(DataInput in) throws IOException {//读
			first=in.readInt();
			second=in.readInt();
		}
		
		public void write(DataOutput out) throws IOException {//写
			out.writeInt(first);
			out.writeInt(second);
		}

		public int compareTo(IntPair arg) {//如果用户没有通过job.setSortComparatorClass()来设置排序类，则默认调用这个方法来对key排序
			if(first!=arg.first){//先比较第一个数，然后比较第二个数 如(20,21)排在（20,22）前面
				return first-arg.first;//正数表示大，负数表示小，零表示相等，MR一定会根据这个差值来排序的。。。
			}
			else if(second!=arg.second){
				return second-arg.second;
			}
			else return 0;
		}
		
	}
	
	/**
	 * 分组类
	 * 在reduce中，一个key对应一个value迭代器，在这之前，要用到分组，也相当于归并。
	 * 如果比较的两个key相同，那么他们就属于一组没他们的value就在同一个迭代器里
	 * 通过job.setGroupingComparatorClass()来指定分组类
	 * 可以不用指定分组类，因为MR有默认的分组操作，也就是文件归并，归并时会将具有相同的key的多个value归并成一个迭代器 <"a",1>,<"a",2>---><"a",<1,2>>
	 * @author hadoop
	 *
	 */
	public static class GroupingComparator implements RawComparator<IntPair>{

		public int compare(IntPair arg0, IntPair arg1) {
			int first=arg0.getFirst();//分组时只比较key
			int second=arg1.getSecond();
			return second-first;
		}

		public int compare(byte[] arg0, int arg1, int arg2, byte[] arg3,
				int arg4, int arg5) {
			return WritableComparator.compareBytes(arg0, arg1, Integer.SIZE/8, arg3, arg4, Integer.SIZE/8);//不知道啥意思
		}
		
	}
	
	
	/**
	 * Parses each input line into a composite {@code IntPair} key and emits the second
	 * column as the value.
	 *
	 * <p>Lines are split on whitespace. A line with no tokens produces no output; a line
	 * with a single token is emitted with {@code 0} as its second column.
	 */
	public static class MyMapper extends Mapper<LongWritable, Text, IntPair, IntWritable>{
		// Output holders reused for every record.
		private final IntPair outKey=new IntPair();
		private final IntWritable outValue=new IntWritable();

		/**
		 * Emits {@code <(first, second), second>} for one line of text.
		 *
		 * @param inKey position of the line as supplied by the input format (unused)
		 * @param inValue the line of text
		 * @param context sink for the map output
		 */
		@Override
		public void map(LongWritable inKey,Text inValue,Context context) throws IOException, InterruptedException{
			StringTokenizer tokens=new StringTokenizer(inValue.toString());
			if(!tokens.hasMoreTokens()){
				return;
			}
			int left=Integer.parseInt(tokens.nextToken());
			// A missing second column defaults to 0.
			int right=tokens.hasMoreTokens()?Integer.parseInt(tokens.nextToken()):0;
			outKey.set(left, right);
			outValue.set(right);
			context.write(outKey, outValue);
			System.out.println("map 输出："+outKey+"\t"+outValue);
		}
	}
	
	public static class MyReducer extends Reducer<IntPair,IntWritable,Text,IntWritable>{
		private static final Text Hr=new Text("-----------------");
		private final Text first=new Text();
		
		public void reduce(IntPair key,Iterable< IntWritable> values,Context context) throws IOException, InterruptedException{
			//reduce的输入都是对key排好序的
			context.write(Hr, null);//输出分割线
			first.set(Integer.toString(key.getFirst()));
			for(IntWritable iw:values){
				context.write(first, iw);//reduce输出： <第一个数，第二个数>
				System.out.println("reduce 输出："+first+"\t"+iw);
			}
		}
	}
}

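注：个人觉得这里的分组类可以不写。不指定分组类时，框架按key的排序规则分组，也就是每个不同的（第一个数，第二个数）组合各自成为一组、各调用一次reduce；由于key在shuffle阶段已经排好序，最终输出的顺序不受影响。如果希望第一个数相同的记录进入同一次reduce调用，才需要指定分组类。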

方法二：

这是自己想出的一种办法：将reduce端的输入数据全部存储在一个LinkedHashMap中，在最后一次调用reduce函数时，再将map中的数据取出来。因为reduce输入的key是第一个数，框架已经对key做了默认排序，所以这时只要对value排序，然后输出到文件中即可。

package SecondrySort_examples;


import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.URI;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.Map;
import java.util.StringTokenizer;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;

import com.sun.xml.bind.v2.runtime.unmarshaller.XsiNilLoader.Array;


//二次排序
public class SecondrySort {
	// Line count of the input file, computed once by SecondrySortMapper's static
	// initializer; the reducer compares it with count to detect its final call.
	private static int flag=0;
	// Number of values SecondrySortReducer.reduce has consumed so far.
	private static int count=0;
	/**
	 * Configures and submits the secondary-sort job (method 2: buffer in the reducer).
	 *
	 * <p>The command-line {@code args} are not consulted: the input and output paths are
	 * fixed to {@code /secondrysort_in} and {@code /secondrysort_out}. An existing output
	 * directory is deleted recursively before the job starts.
	 *
	 * @param args ignored
	 * @throws IOException if file-system access or job submission fails
	 * @throws ClassNotFoundException if a job class cannot be loaded
	 * @throws InterruptedException if waiting for the job is interrupted
	 */
	public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
		Configuration conf=new Configuration();
		// NOTE(review): key looks like a typo for "mapred.job.tracker" -- confirm intent.
		conf.set("mapred.jpb.tracker", "localhost:9001");
		GenericOptionsParser parser=new GenericOptionsParser(conf, new String[]{"/secondrysort_in","/secondrysort_out"});
		String[] paths=parser.getRemainingArgs();
		if(paths.length!=2){
			System.err.println("in out file error!");
			System.exit(1);
		}
		Path inputDir=new Path(paths[0]);
		Path outputDir=new Path(paths[1]);

		// Clear the output of a previous run.
		FileSystem.get(conf).delete(outputDir, true);

		Job job=new Job(conf,"SecondrySort_job");
		job.setJarByClass(SecondrySort.class);
		job.setMapperClass(SecondrySortMapper.class);
		job.setReducerClass(SecondrySortReducer.class);
		job.setOutputKeyClass(IntWritable.class);
		job.setOutputValueClass(IntWritable.class);
		FileInputFormat.addInputPath(job, inputDir);
		FileOutputFormat.setOutputPath(job, outputDir);

		boolean succeeded=job.waitForCompletion(true);
		System.exit(succeeded?0:1);
	}
	
	public static class SecondrySortMapper extends Mapper<Object, Text, IntWritable, IntWritable>{
		static{//读取文件中有几行
			String filePath="hdfs://localhost:9000/secondrysort_in/secondrysort.txt";
			try {
				FileSystem fs=FileSystem.get(URI.create("hdfs://localhost:9000"),new Configuration());
				FSDataInputStream read = fs.open(new Path(filePath));
				InputStreamReader in=new InputStreamReader(read);//转换成字符流
				BufferedReader br=new BufferedReader(in);//包装
				while(br.readLine()!=null){
					flag++;
				}
			} catch (IOException e) {
				// TODO Auto-generated catch block
				e.printStackTrace();
			}
		}
		private static IntWritable outKey=new IntWritable();
		private static IntWritable outValue=new IntWritable();
		public void map(Object key,Text value,Context context) throws IOException, InterruptedException{
					String val=value.toString();
					StringTokenizer st=new StringTokenizer(val);
					
					while(st.hasMoreTokens()){
						int first=Integer.parseInt(st.nextToken());
						int second=Integer.parseInt(st.nextToken());
						outKey.set(first);
						outValue.set(first+second);
					}
					context.write(outKey,outValue);//输出为<first,first+second>
					System.out.println("map 输出："+outKey+"\t"+outValue);
		}
	}	
	public static class SecondrySortReducer extends Reducer<IntWritable, IntWritable, IntWritable, IntWritable>{
		private static Map<Integer,String> map=new LinkedHashMap<Integer,String>();//存放第一个数+第二个数 不能用hashmap，它的存放顺序并不是按照你存放的顺序来的，是乱序的，所以遍历的时候不能按照插入的顺序来遍历
		private static IntWritable outValue=new IntWritable();
		public void reduce(IntWritable key,Iterable<IntWritable> values,Context context) throws IOException, InterruptedException{
					int one=key.get();//第一个数
					String two="";//因为map的key不能重复，所以将第二个数字进行拼接
					for(IntWritable iw:values){
						count++;
						int t=iw.get()-one;//第二个数
						two+=t+":";
					}
					two=two.substring(0, two.length()-1);//去掉最后一个多出来的:
					System.out.println("map中存入："+one+"\t"+two);
					map.put(one, two);
					if(count==flag){//最后一次的时候进行输出 注意reduce处理次数要小于等于map，因为map后的结果做了归并
						//遍历map
						for(Map.Entry<Integer, String> e:map.entrySet()){//获取key-value映射set
							int a=e.getKey();//第一个数
							String[] bs = e.getValue().split(":");//第二个数的数组
							Arrays.sort(bs);//从小到大排序
							for(String i:bs){
								System.out.println("遍历map："+a+"\t"+i);
								context.write(new IntWritable(a), new IntWritable(Integer.parseInt(i)));//写入文件
							}
						}
					}
		}
	}
}

注意好map、reduce的输入输出，MR程序还是蛮复杂的。

zzd0007

关注

0
点赞
踩
2

收藏

觉得还不错? 一键收藏
0
评论
mapreduce 二次排序的两种方法

什么是二次排序？现在有一个文件secondrysort_in.txt,20 2150 5150 5250 5360 5160 5360 5270 557 8现在你对文件进行排序，按照第一个数从小到大排列，若果第一个数相同，则按照第二个数从小到大排列。输出文件secondrysort_out7 820 2150 515
复制链接

扫一扫

专栏目录