To understand secondary sort in Hadoop, you first need a clear picture of Hadoop's shuffle process.
(Note: setSortComparatorClass controls how keys are ordered during the sort phase of the shuffle, while setGroupingComparatorClass controls which keys are treated as equal when records are grouped for a reduce() call. Sorting the composite key by both fields and then grouping by the first field alone is what makes each group's values arrive already sorted.)
Below is a Hadoop secondary sort example.
Write a MapReduce program that turns the input on the left into the sorted form on the right.
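The sample data itself is not reproduced here; an input consistent with the final output shown at the end of this article would look like this (the line order of the input is arbitrary):

Input (left), unsorted "word number" pairs:
hive 12
hadoop 342
spark 16
hive 2345
hadoop 23
spark 3
hive 42
hadoop 32
spark 349
hive 204

Expected output (right), values grouped by word and sorted ascending:
hadoop 23,32,342
hive 12,42,204,2345
spark 3,16,349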
The MapReduce program is as follows:
Step 1: define a composite-key bean
package com.bigdata.demo15_two_class_paixu;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

import org.apache.hadoop.io.WritableComparable;

/**
 * Custom composite key, used in the sort sub-phase on the map side.
 * @author Administrator
 * 2018-05-31 08:16:38
 */
public class CombinationKey implements WritableComparable<CombinationKey> {

    private String firstKey;
    private Integer secondKey;

    public String getFirstKey() {
        return firstKey;
    }

    public void setFirstKey(String firstKey) {
        this.firstKey = firstKey;
    }

    public Integer getSecondKey() {
        return secondKey;
    }

    public void setSecondKey(Integer secondKey) {
        this.secondKey = secondKey;
    }

    // Serialization: write the fields in a fixed order; readFields must mirror it.
    public void write(DataOutput out) throws IOException {
        out.writeUTF(this.firstKey);
        out.writeInt(this.secondKey);
    }

    public void readFields(DataInput in) throws IOException {
        this.firstKey = in.readUTF();
        this.secondKey = in.readInt();
    }

    // Natural order: first key ascending, then second key ascending. Kept
    // consistent with the sort comparator below, so the job still sorts
    // correctly even if setSortComparatorClass were not called.
    public int compareTo(CombinationKey o) {
        int cmp = this.firstKey.compareTo(o.getFirstKey());
        if (cmp != 0) {
            return cmp;
        }
        return Integer.compare(this.secondKey, o.getSecondKey());
    }
}
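Since write() and readFields() must mirror each other exactly, it is worth checking the round trip in isolation. A minimal sketch using Hadoop's in-memory DataOutputBuffer/DataInputBuffer (the class name RoundTripCheck is made up for illustration; it is not part of the original program):

package com.bigdata.demo15_two_class_paixu;

import org.apache.hadoop.io.DataInputBuffer;
import org.apache.hadoop.io.DataOutputBuffer;

public class RoundTripCheck {
    public static void main(String[] args) throws Exception {
        CombinationKey k1 = new CombinationKey();
        k1.setFirstKey("hadoop");
        k1.setSecondKey(32);

        // Serialize into an in-memory buffer...
        DataOutputBuffer out = new DataOutputBuffer();
        k1.write(out);

        // ...then read it back into a fresh key.
        DataInputBuffer in = new DataInputBuffer();
        in.reset(out.getData(), out.getLength());
        CombinationKey k2 = new CombinationKey();
        k2.readFields(in);

        System.out.println(k2.getFirstKey() + " " + k2.getSecondKey()); // hadoop 32
    }
}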
Step 2: define a custom comparator, used when sorting keys
package com.bigdata.demo15_two_class_paixu;

import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;

/**
 * Custom sort comparator
 * @author Administrator
 * 2018-05-31 08:40:58
 */
public class DefineCompparator extends WritableComparator {

    protected DefineCompparator() {
        // true: create key instances so compare() receives deserialized objects
        super(CombinationKey.class, true);
    }

    @Override
    public int compare(WritableComparable a, WritableComparable b) {
        CombinationKey ck1 = (CombinationKey) a;
        CombinationKey ck2 = (CombinationKey) b;
        int cp1 = ck1.getFirstKey().compareTo(ck2.getFirstKey());
        if (cp1 != 0) {
            // First keys differ: their order decides.
            return cp1;
        } else {
            // Same first key: order by the second key ascending.
            // Integer.compare avoids the overflow risk of plain subtraction.
            return Integer.compare(ck1.getSecondKey(), ck2.getSecondKey());
        }
    }
}
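The comparator can also be exercised outside a job to confirm the intended ordering. A throwaway sketch (the class name OrderCheck and the sample keys are made up; it must live in the same package because the constructor is protected):

package com.bigdata.demo15_two_class_paixu;

import java.util.ArrayList;
import java.util.List;

public class OrderCheck {
    static CombinationKey key(String first, int second) {
        CombinationKey k = new CombinationKey();
        k.setFirstKey(first);
        k.setSecondKey(second);
        return k;
    }

    public static void main(String[] args) {
        List<CombinationKey> keys = new ArrayList<>();
        keys.add(key("hive", 42));
        keys.add(key("hadoop", 342));
        keys.add(key("hadoop", 23));

        DefineCompparator cmp = new DefineCompparator();
        keys.sort((a, b) -> cmp.compare(a, b));

        for (CombinationKey k : keys) {
            System.out.println(k.getFirstKey() + " " + k.getSecondKey());
        }
        // Prints: hadoop 23, hadoop 342, hive 42
    }
}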
Step 3: define a custom partitioner, used during the shuffle phase
package com.bigdata.demo15_two_class_paixu;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.mapreduce.Partitioner;

/**
 * Custom partitioner
 * @author Administrator
 * 2018-05-31 08:20:58
 */
public class DefinedPartition extends Partitioner<CombinationKey, IntWritable> {

    /**
     * @param key the map output key; records are partitioned by the first field of the composite key
     * @param value the map output value
     * @param numPartitions total number of partitions, i.e. the number of reducers
     */
    @Override
    public int getPartition(CombinationKey key, IntWritable value, int numPartitions) {
        // Mask off the sign bit so a negative hashCode cannot yield a negative partition.
        return (key.getFirstKey().hashCode() & Integer.MAX_VALUE) % numPartitions;
    }
}
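Why the & Integer.MAX_VALUE? String.hashCode() can be negative, and a negative result from % would be an illegal partition number, so masking the sign bit keeps the result in [0, numPartitions). A tiny standalone illustration with a made-up hash value:

public class PartitionDemo {
    public static void main(String[] args) {
        int h = -1234567;                                 // a hashCode may be negative
        System.out.println(h % 3);                        // -1: illegal partition number
        System.out.println((h & Integer.MAX_VALUE) % 3);  // 1: valid partition in [0, 3)
    }
}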
Step 4: define a grouping comparator, used when grouping records for the reduce phase. The driver in step 5 registers a class named DefinedGroupSort; a minimal implementation consistent with that registration groups records by the first key only:
package com.bigdata.demo15_two_class_paixu;

import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;

/**
 * Custom grouping comparator
 * @author Administrator
 */
public class DefinedGroupSort extends WritableComparator {

    protected DefinedGroupSort() {
        super(CombinationKey.class, true);
    }

    @Override
    public int compare(WritableComparable a, WritableComparable b) {
        CombinationKey ck1 = (CombinationKey) a;
        CombinationKey ck2 = (CombinationKey) b;
        // Compare only the first key: all records sharing a first key are
        // grouped into one reduce() call, whatever their second keys are.
        return ck1.getFirstKey().compareTo(ck2.getFirstKey());
    }
}
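The effect is easiest to see on the sample data: the sort comparator has already ordered the map output by (firstKey, secondKey), and the grouping comparator then merges adjacent keys with equal first fields, so each reduce() call sees one word with all of its values in ascending order (a sketch assuming one reducer):

reduce("hadoop", [23, 32, 342])
reduce("hive",   [12, 42, 204, 2345])
reduce("spark",  [3, 16, 349])

Without the grouping comparator, the full composite key would define the groups, and reduce() would be called once per (word, number) pair with a single value each.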
Step 5: write the MapReduce program
package com.bigdata.demo15_two_class_paixu;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class SecondSortMapReduce {

    /**
     * The mapper, defined as an inner class.
     * @author Administrator
     * 2018-05-31 11:06:30
     */
    static class SecondSortMapper extends Mapper<LongWritable, Text, CombinationKey, IntWritable> {

        // Reusing these objects across map() calls is safe:
        // context.write() serializes the key/value immediately.
        String[] split = null;
        CombinationKey kv = new CombinationKey();
        IntWritable v = new IntWritable();

        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            // Each input line is "word number", e.g. "hadoop 32".
            split = value.toString().split(" ");
            kv.setFirstKey(split[0]);
            int vv = Integer.parseInt(split[1]);
            v.set(vv);
            kv.setSecondKey(vv);
            context.write(kv, v);
        }
    }

    /**
     * The reducer, defined as an inner class.
     * @author Administrator
     * 2018-05-31 11:06:51
     */
    static class SecondSortReducer extends Reducer<CombinationKey, IntWritable, Text, Text> {

        Text k = new Text();
        Text v = new Text();

        @Override
        protected void reduce(CombinationKey first_second, Iterable<IntWritable> seconds, Context context)
                throws IOException, InterruptedException {
            // Thanks to the grouping comparator, all values with the same first
            // key arrive here together, already sorted by the second key.
            StringBuilder sb = new StringBuilder();
            for (IntWritable second : seconds) {
                sb.append(second.get()).append(",");
            }
            k.set(first_second.getFirstKey());
            v.set(sb.substring(0, sb.length() - 1)); // drop the trailing comma
            context.write(k, v);
        }
    }

    /**
     * Main entry point.
     * @param args input path, output path
     * @throws IOException
     * @throws ClassNotFoundException
     * @throws InterruptedException
     */
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);
        job.setJarByClass(SecondSortMapReduce.class);
        job.setMapperClass(SecondSortMapper.class);
        job.setReducerClass(SecondSortReducer.class);
        // Set the partitioner and the number of reducers
        job.setPartitionerClass(DefinedPartition.class);
        job.setNumReduceTasks(1);
        // Set the custom grouping strategy
        job.setGroupingComparatorClass(DefinedGroupSort.class);
        // Set the custom sort strategy
        job.setSortComparatorClass(DefineCompparator.class);
        job.setMapOutputKeyClass(CombinationKey.class);
        job.setMapOutputValueClass(IntWritable.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);
        // Set the input and output paths
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        boolean res = job.waitForCompletion(true);
        System.exit(res ? 0 : 1);
    }
}
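Before packaging for the cluster, the job can be smoke-tested in Hadoop's local mode by pointing it at the local filesystem. A hypothetical tweak to the first lines of main() (the two property keys are standard Hadoop 2.x configuration; this snippet is not part of the original program):

Configuration conf = new Configuration();
conf.set("mapreduce.framework.name", "local"); // run the MR job in a single local JVM
conf.set("fs.defaultFS", "file:///");          // read input and write output on the local filesystem
Job job = Job.getInstance(conf);
// ...the rest of the driver is unchanged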
Step 6: run on the Hadoop cluster
Package the program, copy the jar to the Hadoop cluster, and run it:
[hadoop@mini02 ~]$ hadoop jar \
> ./jars/Review06_hdfs-0.0.1-SNAPSHOT.jar \
> com.bigdata.demo15_two_class_paixu.SecondSortMapReduce \
> /paixu/input01 \
> /paixu/output03
View the sorted result:
[hadoop@mini03 ~]$ hdfs dfs -cat /paixu/output03/part-r-00000
hadoop 23,32,342
hive 12,42,204,2345
spark 3,16,349