1. Introduction
By default, the MapReduce framework sorts records by key during the shuffle phase, but sometimes the values belonging to each key must be ordered as well. Sorting by key and then by value is known as a secondary sort.
2. Requirements
Suppose we have the following data: two columns per line, separated by a tab ("\t"). The output must be sorted by the first field in ascending order; when two lines have the same first field, they are further sorted by the second field in ascending order.
For example:
3 1
3 5
1 3
1 2
2 1
5 1
1 1
3 3
Given the data above, the secondary-sorted result should be:
1 1
1 2
1 3
------
2 1
------
3 1
3 3
3 5
------
5 1
3. How Secondary Sort Works
- The Mapper task receives its input split and calls the map function repeatedly, processing one record at a time. Each record is transformed into a new key/value pair, where the key is a composite of both fields and the value is the second field, e.g. key = <1, 1>, value = 1.
- The partition function is applied to every key/value pair emitted by map, assigning it to a partition. Different partitions are sent to different Reducer tasks. For a secondary sort with more than one reducer, the partitioner must look only at the first field, so that all records sharing a first field reach the same reducer (see the sketch after this list).
- Within each partition, the data is sorted by key. The key class must implement the WritableComparable interface, which extends Comparable, so keys can be compared and therefore sorted.
- The sorted <key, value> pairs are then grouped by key: pairs whose keys compare as equal are placed in the same group, and the reduce function is called once per group.
- The sorted, grouped data is delivered to the Reducer node.
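The job below runs fine without a custom partitioner as long as a single reducer is used, since all records land in one partition anyway. For the multi-reducer case, here is a minimal sketch of a partitioner keyed on the first field; the class name FirstPartitioner is hypothetical, and MyKey is the key class defined later in this post:

package com.mapreduce.secondarysort;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.mapreduce.Partitioner;

// Sketch: route each record by its first field only, so that all keys
// sharing a first field are processed by the same reducer.
public class FirstPartitioner extends Partitioner<MyKey, IntWritable> {
    @Override
    public int getPartition(MyKey key, IntWritable value, int numPartitions) {
        // Mask the sign bit so the partition index stays non-negative
        // even if the first field is negative.
        return (key.getFirst() & Integer.MAX_VALUE) % numPartitions;
    }
}

If used, it would be registered in the driver with job.setPartitionerClass(FirstPartitioner.class).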
4. Upload the Input File
Type the sample data into a local file named secondsort, then upload it to HDFS:
hadoop fs -put secondsort /secondsort
5. Implementation
The MyKey class:
package com.mapreduce.secondarysort;

import org.apache.hadoop.io.WritableComparable;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

public class MyKey implements WritableComparable<MyKey> {
    private int first = 0;
    private int second = 0;

    public void set(int first, int second) {
        this.first = first;
        this.second = second;
    }

    public int getFirst() {
        return first;
    }

    public int getSecond() {
        return second;
    }

    // This is the heart of the sort: the framework calls compareTo() when
    // ordering keys. Compare by the first field, then by the second.
    // Integer.compare avoids the overflow risk of plain subtraction.
    @Override
    public int compareTo(MyKey o) {
        if (first != o.first) {
            return Integer.compare(first, o.first);
        }
        return Integer.compare(second, o.second);
    }

    @Override
    public void write(DataOutput out) throws IOException {
        out.writeInt(first);
        out.writeInt(second);
    }

    @Override
    public void readFields(DataInput in) throws IOException {
        first = in.readInt();
        second = in.readInt();
    }

    @Override
    public String toString() {
        return "<" + first + ", " + second + ">";
    }

    /*
    // Only needed if the key is partitioned with the default HashPartitioner.
    @Override
    public int hashCode() {
        return first * 157 + second;
    }

    @Override
    public boolean equals(Object right) {
        if (right instanceof MyKey) {
            MyKey myKey = (MyKey) right;
            return myKey.first == first && myKey.second == second;
        }
        return false;
    }
    */
}
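As a quick sanity check of the composite ordering, here is a throwaway snippet that is not part of the job; the class name MyKeyDemo is made up for illustration:

package com.mapreduce.secondarysort;

// Throwaway demo of MyKey's ordering; not part of the MapReduce job.
public class MyKeyDemo {
    public static void main(String[] args) {
        MyKey a = new MyKey();
        MyKey b = new MyKey();
        a.set(1, 3);
        b.set(1, 2);
        System.out.println(a.compareTo(b) > 0); // true: first fields tie, 3 > 2
        b.set(2, 1);
        System.out.println(a.compareTo(b) < 0); // true: 1 < 2 on the first field
    }
}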
The MyMapper class:
package com.mapreduce.secondarysort;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;

public class MyMapper extends Mapper<LongWritable, Text, MyKey, IntWritable> {
    private final MyKey key = new MyKey();
    private final IntWritable value = new IntWritable();

    @Override
    public void map(LongWritable inkey, Text invalue, Context context)
            throws IOException, InterruptedException {
        String[] strs = invalue.toString().split("\t");
        int first = Integer.parseInt(strs[0]);
        int second = Integer.parseInt(strs[1]);
        key.set(first, second); // composite key: both fields
        value.set(second);      // value: the second field only
        System.out.println("MyMapper: key = " + key + ", value = " + value);
        context.write(key, value);
    }
}
The GroupingComparator class:
package com.mapreduce.secondarysort;

import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;

public class GroupingComparator extends WritableComparator {
    public GroupingComparator() {
        super(MyKey.class, true); // true: create key instances for comparison
    }

    // Keys that compare as equal here fall into the same group, and reduce()
    // is called once per group, so grouping considers only the first field.
    @Override
    public int compare(WritableComparable a, WritableComparable b) {
        MyKey left = (MyKey) a;
        MyKey right = (MyKey) b;
        return Integer.compare(left.getFirst(), right.getFirst());
    }
}
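Deserializing both keys just to compare one int works, but the comparison can also be done directly on the serialized bytes, since write() stores the first field in the first four bytes of the key. A sketch of that variant, with the hypothetical class name FirstGroupingComparator (readInt is a helper inherited from WritableComparator):

package com.mapreduce.secondarysort;

import org.apache.hadoop.io.WritableComparator;

// Sketch: compare the first field on the raw bytes, avoiding deserialization.
public class FirstGroupingComparator extends WritableComparator {
    public FirstGroupingComparator() {
        super(MyKey.class); // no key instances needed
    }

    @Override
    public int compare(byte[] b1, int s1, int l1, byte[] b2, int s2, int l2) {
        // write() emits the first field as the first 4 bytes of the key.
        return Integer.compare(readInt(b1, s1), readInt(b2, s2));
    }
}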
The MyReducer class:
package com.mapreduce.secondarysort;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;

public class MyReducer extends Reducer<MyKey, IntWritable, Text, IntWritable> {
    private final Text SIGN = new Text("********************");
    private final Text first = new Text();

    @Override
    public void reduce(MyKey key, Iterable<IntWritable> values, Context context)
            throws IOException, InterruptedException {
        System.out.println("Reducer: ");
        System.out.print("MyKey = " + key.toString() + ", values = ");
        context.write(SIGN, null); // separator line before each group
        first.set(Integer.toString(key.getFirst())); // output key: the first field
        // The values arrive already ordered by the second field: sorting used the
        // full composite key, while grouping used only the first field.
        for (IntWritable value : values) {
            System.out.print(value + " ");
            context.write(first, value);
        }
        System.out.println();
    }
}
The SecondarySortApp class:
package com.mapreduce.secondarysort;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;

import java.net.URI;

public class SecondarySortApp {
    private static final String INPUT_PATH = "hdfs://master002:9000/secondsort";
    private static final String OUTPUT_PATH = "hdfs://master002:9000/outputsecondsort";

    public static void main(String[] args) throws Exception {
        System.setProperty("HADOOP_USER_NAME", "hadoop");
        Configuration conf = new Configuration();
        // Make reruns robust: delete the output directory if it already exists.
        final FileSystem fileSystem = FileSystem.get(URI.create(INPUT_PATH), conf);
        if (fileSystem.exists(new Path(OUTPUT_PATH))) {
            fileSystem.delete(new Path(OUTPUT_PATH), true);
        }

        Job job = Job.getInstance(conf, "SecondarySortApp");
        // The jar containing the main class.
        job.setJarByClass(SecondarySortApp.class);
        // Map side.
        job.setMapperClass(MyMapper.class);
        job.setMapOutputKeyClass(MyKey.class);
        job.setMapOutputValueClass(IntWritable.class);
        // Reduce side.
        job.setReducerClass(MyReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        // Grouping comparator.
        job.setGroupingComparatorClass(GroupingComparator.class);
        // Input format.
        job.setInputFormatClass(TextInputFormat.class);
        FileInputFormat.addInputPath(job, new Path(INPUT_PATH));
        // Output format.
        job.setOutputFormatClass(TextOutputFormat.class);
        FileOutputFormat.setOutputPath(job, new Path(OUTPUT_PATH));
        // Submit the job and wait for it to finish.
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
6. Checking the Result
View the job output:
hadoop fs -text /outputsecondsort/part-r-00000
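Given the sample data and the separator written before each group (TextOutputFormat prints only the key when the value is null), the output should look like this:

********************
1	1
1	2
1	3
********************
2	1
********************
3	1
3	3
3	5
********************
5	1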