需要二次排序的原因:mapreduce架构自动对映射器生成的键进行排序,即归约器启动之前,所有键是有序的,但是值是随机的,二次排序指的是对值进行排序。归约器输入形如:(key, [v1, v2, ..., vn]),即一个key对应多个值,这些值是无序的,排序后得到有序的值序列S,如下:(key, S)
其中,S按照升序或者降序排列
归约器对于二次排序的两种解决方案:
1.让归约器读取和缓存给定键的所有值,完成归约器中排序,特点是不可伸缩,依赖归约器内存
2.使用mapreduce框架对归约器值排序,方法是创建组合键,例如,A是键,B和C是值,选择B作为次键,这样A和B作为组合键,C作为值,将排序交给MapReduce框架完成,这样不用在内存中排序,是可伸缩的方案
定制插件:
1.分区器:根据映射器的输出键决定将哪个映射器的输出发送到哪个归约器,其本质是对键(自然键)的hash值按归约器个数取模
2.比较器:按照自然键对一个归约器中的数据分组,代码如下:
以year,month,temperature为例,MapReduce框架对于二次排序整体的处理流程是:
1.映射器创建(K, V)对,其中K是组合键(year-month, temperature),V是temperature的值,组合键中的year-month部分是自然键
2.通过分区器插件,将所有自然键发送给同一个归约器
3.排序阶段按组合键的compareTo使温度有序,再通过分组比较器按自然键分组,保证同一自然键的温度按顺序到达归约器
显然,MapReduce框架完成了排序,而不用在内存中操作
代码如下,除了mapper,reducer,主作业流程以外,还有其余3个文件,一个分区器、一个比较器、一个组合键(中间键)类:
DateTemperatureGroupingComparator.java分组比较器:
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;
public class DateTemperatureGroupingComparator extends WritableComparator {

    public DateTemperatureGroupingComparator() {
        /* Register the key class with the parent; "true" asks the framework
           to instantiate keys so compare() can work on deserialized objects. */
        super(DateTemperaturePair.class, true);
    }

    /* Grouping comparator: two composite keys belong to the same reduce()
       call iff their natural keys (yearMonth) are equal. The temperature
       part of the composite key is deliberately ignored here. */
    @Override
    public int compare(WritableComparable a, WritableComparable b) {
        final DateTemperaturePair left = (DateTemperaturePair) a;
        final DateTemperaturePair right = (DateTemperaturePair) b;
        return left.getYearMonth().compareTo(right.getYearMonth());
    }
}
DateTemperaturePartitioner.java分区器:
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Partitioner;
public class DateTemperaturePartitioner
        extends Partitioner<DateTemperaturePair, Text> {

    /* Route every record with the same natural key (yearMonth) to the same
       reducer: hash the natural key, reduce it modulo the partition count,
       and strip the sign so the result is a valid partition index. The
       temperature part of the key plays no role in partitioning. */
    @Override
    public int getPartition(DateTemperaturePair pair, Text value,
                            int numberOfPartitions) {
        return Math.abs(pair.getYearMonth().hashCode() % numberOfPartitions);
    }
}
DateTemperaturePair.java中间键类
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.io.WritableComparable;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.util.Objects;
public class DateTemperaturePair
/* java不支持多重继承,使用implements继承接口,不同接口之间用逗号隔开 */
implements Writable, WritableComparable<DateTemperaturePair> {
private String yearMonth; //自然键
private String day;
protected Integer temperature; //次键
/* 用这个方法指出如何对DateTemperaturePair排序 */
public int compareTo(DateTemperaturePair o) {
/* 调用String的字符串比较方法compareTo */
int compareValue = this.yearMonth.compareTo(o.getYearMonth());
if (compareValue == 0) {
compareValue = temperature.compareTo(o.getTemperature());
}
/* 这样实现降序排列 */
return -1 * compareValue;
}
/* DataOutput用于将java基本类型转换成二进制字符流 */
public void write(DataOutput dataOutput) throws IOException {
Text.writeString(dataOutput, yearMonth);
dataOutput.writeInt(temperature);
}
public void readFields(DataInput dataInput) throws IOException {
this.yearMonth = Text.readString(dataInput);
this.temperature = dataInput.readInt();
}
@Override
public String toString() {
return yearMonth.toString();
}
public String getYearMonth() {
return yearMonth;
}
public void setYearMonth(String text) {
this.yearMonth = text;
}
public void setDay(String day) {
this.day = day;
}
public Integer getTemperature() {
return temperature;
}
public void setTemperature(Integer temperature) {
this.temperature = temperature;
}
}
SecondarySortingMapper.java映射器:
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import java.io.IOException;
/* 注意输出键的类型是DateTemperaturePair,是自定义的组合键 */
/* Mapper: parses "year,month,day,temperature" CSV lines and emits
   (composite key, temperature). Note the output key type is the custom
   composite key DateTemperaturePair, so the framework sorts by it. */
public class SecondarySortingMapper extends
        Mapper<LongWritable, Text, DateTemperaturePair, IntWritable> {

    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        String[] tokens = value.toString().split(",");
        // BUG FIX: skip blank/short lines instead of failing the whole
        // task with ArrayIndexOutOfBoundsException.
        if (tokens.length < 4) {
            return;
        }
        String yearMonth = tokens[0].trim() + "-" + tokens[1].trim();
        String day = tokens[2].trim();
        final int temperature;
        try {
            temperature = Integer.parseInt(tokens[3].trim());
        } catch (NumberFormatException e) {
            return; // non-numeric temperature field: skip the record
        }
        DateTemperaturePair reduceKey = new DateTemperaturePair();
        reduceKey.setYearMonth(yearMonth);
        reduceKey.setDay(day);
        reduceKey.setTemperature(temperature);
        // The temperature also travels as the value so the reducer can
        // emit the ordered list without re-parsing the key.
        context.write(reduceKey, new IntWritable(temperature));
    }
}
SecondarySortingReducer.java归约器:
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
import java.io.IOException;
/* 输入键和输出键的类型都是自定义的中间键 */
/* Reducer: thanks to the sort order of the composite key and the grouping
   comparator, the temperatures for one yearMonth arrive already ordered;
   this method only joins them into a comma-separated line. */
public class SecondarySortingReducer extends
        Reducer<DateTemperaturePair, IntWritable, DateTemperaturePair, Text> {

    @Override
    protected void reduce(DateTemperaturePair key,
            Iterable<IntWritable> values, Context context)
            throws IOException, InterruptedException {
        // StringBuilder avoids O(n^2) string concatenation in the loop.
        StringBuilder sortedTemperatureList = new StringBuilder();
        String separator = "";
        for (IntWritable temperature : values) {
            // BUG FIX: prefix-separator join — no trailing comma to strip,
            // and no StringIndexOutOfBoundsException from deleteCharAt()
            // when the values iterable is empty.
            sortedTemperatureList.append(separator).append(temperature);
            separator = ",";
        }
        context.write(key, new Text(sortedTemperatureList.toString()));
    }
}
SecondarySort.java主作业流程:
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
public class SecondarySort extends Configured implements Tool {
public int run(String[] args) throws Exception {
/* 设置作业,指导hadoop获取jar包 */
Job job = new Job();
job.setJarByClass(SecondarySort.class);
job.setJobName("SecondarySort");
/* 获取input路径和output路径 */
Path inputPath = new Path(args[0]);
Path outputPath = new Path(args[1]);
FileInputFormat.setInputPaths(job, inputPath);
FileOutputFormat.setOutputPath(job, outputPath);
/* 设置mapper的输出键和输出值 */
job.setMapOutputKeyClass(DateTemperaturePair.class);
job.setMapOutputValueClass(IntWritable.class);
/* 设置reducer的输出键和输出值 */
job.setOutputKeyClass(DateTemperaturePair.class);
job.setOutputValueClass(IntWritable.class);
/* 指定要使用的mapper,reducer,分区器,比较器 */
job.setMapperClass(SecondarySortingMapper.class);
job.setReducerClass(SecondarySortingReducer.class);
job.setPartitionerClass(DateTemperaturePartitioner.class);
job.setGroupingComparatorClass(DateTemperatureGroupingComparator.class);
boolean status = job.waitForCompletion(true);
return status ? 0 : 1;
}
public static void main(String[] args) throws Exception {
if (args.length != 2) {
throw new IllegalArgumentException(
"!!!!!!!!!!!!!! Usage!!!!!!!!!!!!!!: SecondarySort"
+ "<input-path> <output-path>");
}
int returnStatus = ToolRunner.run(new SecondarySort(), args);
System.exit(returnStatus);
}
}
输入文件:
[root@master ~]# hdfs dfs -cat /sample_input.txt
2000,12,04,10
2000,11,01,20
2000,12,02,-20
2000,11,02,30
2000,11,24,-40
2012,12,21,30
2012,12,22,-20
2012,12,23,60
2012,12,24,70
2012,12,25,10
2013,01,22,80
2013,01,23,90
2013,01,24,70
2013,01,20,-10
运行作业命令:
hadoop jar SecondarySort.jar SecondarySort /sample_input.txt output
作业运行结果如下,可以看到已经按照temperature字段降序排序了:
[root@master ~]# hdfs dfs -cat output/*
2013-01 90,80,70,-10
2012-12 70,60,30,10,-20
2000-12 10,-20
2000-11 30,20,-40