1. Input data:
[hadoop@hadoop ~]$ hdfs dfs -text /user/hadoop/secondarysort.txt
3 5
5 89
7 63
5 56
3 9
3 1
7 26
7 45
7 4
5 18
5 23
7 63
3 24
[hadoop@hadoop ~]$
The goal is a secondary sort: records are grouped by the first column, and within each group the second column is sorted in ascending order.
2. Code:
package secondarySort;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Partitioner;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
public class SecondarySort_Demo {
    // A custom key class must implement the WritableComparable interface
    public static class IntPair implements WritableComparable<IntPair> {
        int first;
        int second;
        public void set(int left, int right) {
            first = left;
            second = right;
        }
        public int getFirst() {
            return first;
        }
        public int getSecond() {
            return second;
        }
        // Serialization: write the IntPair to a binary stream
        public void write(DataOutput out) throws IOException {
            out.writeInt(first);
            out.writeInt(second);
        }
        // Deserialization: read the IntPair back from a binary stream
        public void readFields(DataInput in) throws IOException {
            first = in.readInt();
            second = in.readInt();
        }
        // Key comparison: first ascending, then second ascending
        public int compareTo(IntPair o) {
            if (first != o.first) {
                return first < o.first ? -1 : 1;
            } else if (second != o.second) {
                return second < o.second ? -1 : 1;
            } else {
                return 0;
            }
        }
        // Two methods every new key class should override
        public int hashCode() {
            return first * 157 + second;
        }
        public boolean equals(Object right) {
            if (right == null)
                return false;
            if (this == right)
                return true;
            if (right instanceof IntPair) {
                IntPair r = (IntPair) right;
                return r.first == first && r.second == second;
            } else {
                return false;
            }
        }
    }
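    // Example ordering under compareTo, using pairs from the input above:
    // (3,1) < (3,5) < (3,9) < (3,24) < (5,18) < (7,4)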
    /**
     * Partitioner class: determines the partition from first only.
     */
    public static class FirstPartitioner extends Partitioner<IntPair, IntWritable> {
        @Override
        public int getPartition(IntPair key, IntWritable value,
                int numPartitions) {
            return Math.abs(key.getFirst() * 127) % numPartitions;
        }
    }
    /**
     * Grouping comparator: keys with the same first value belong to the same group.
     */
    /* Option 1: implement the RawComparator interface
    public static class GroupingComparator implements RawComparator<IntPair> {
        public int compare(IntPair o1, IntPair o2) {
            int l = o1.getFirst();
            int r = o2.getFirst();
            return l == r ? 0 : (l < r ? -1 : 1);
        }
        // Compare byte by byte until a differing byte is found; that byte decides the ordering of the two streams.
        public int compare(byte[] b1, int s1, int l1, byte[] b2, int s2, int l2) {
            return WritableComparator.compareBytes(b1, s1, Integer.SIZE / 8, b2, s2, Integer.SIZE / 8);
        }
    }*/
    // Option 2: extend WritableComparator
    public static class GroupingComparator extends WritableComparator {
        protected GroupingComparator() {
            super(IntPair.class, true);
        }
        // Compare two WritableComparables, looking at first only.
        @SuppressWarnings("rawtypes")
        public int compare(WritableComparable w1, WritableComparable w2) {
            IntPair ip1 = (IntPair) w1;
            IntPair ip2 = (IntPair) w2;
            int l = ip1.getFirst();
            int r = ip2.getFirst();
            return l == r ? 0 : (l < r ? -1 : 1);  // must return 0 for equal first values, or grouping never happens
        }
    }
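    // With this comparator, (3,1), (3,5), (3,9) and (3,24) all compare equal,
    // so every value with first=3 is delivered to a single reduce() call,
    // already sorted by second via IntPair.compareTo.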
    // Custom Mapper
    public static class MyMap extends Mapper<LongWritable, Text, IntPair, IntWritable> {
        private final IntPair intkey = new IntPair();
        private final IntWritable intvalue = new IntWritable();
        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            // Each input line is a tab-separated pair: first<TAB>second
            String line = value.toString();
            String[] splited = line.split("\t");
            intkey.set(Integer.parseInt(splited[0]), Integer.parseInt(splited[1]));
            intvalue.set(Integer.parseInt(splited[1]));
            context.write(intkey, intvalue);
        }
    }
    // Custom Reducer
    public static class MyReduce extends Reducer<IntPair, IntWritable, Text, IntWritable> {
        private final Text left = new Text();
        // private static final Text SEPARATOR = new Text("========================");
        @Override
        protected void reduce(IntPair k2, Iterable<IntWritable> v2s, Context context)
                throws IOException, InterruptedException {
            // context.write(SEPARATOR, null);
            left.set(Integer.toString(k2.getFirst()));
            for (IntWritable val : v2s) {
                context.write(left, val);
            }
        }
    }
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        // Create the job instance
        Job job = Job.getInstance(conf, SecondarySort_Demo.class.getSimpleName());
        job.setJarByClass(SecondarySort_Demo.class);
        // Mapper class
        job.setMapperClass(MyMap.class);
        // No Combiner here: a Combiner's output type <Text, IntWritable> would not match the Reducer's input type <IntPair, IntWritable>
        //job.setCombinerClass(Reduce.class);
        // Reducer class
        job.setReducerClass(MyReduce.class);
        // Partitioner class
        job.setPartitionerClass(FirstPartitioner.class);
        // Grouping comparator class
        job.setGroupingComparatorClass(GroupingComparator.class);
        // Key type of the map output
        job.setMapOutputKeyClass(IntPair.class);
        // Value type of the map output
        job.setMapOutputValueClass(IntWritable.class);
        // Key type of the reduce output: Text, because the OutputFormat is TextOutputFormat
        job.setOutputKeyClass(Text.class);
        // Value type of the reduce output
        job.setOutputValueClass(IntWritable.class);
        // Split the input into splits and provide a RecordReader implementation.
        job.setInputFormatClass(TextInputFormat.class);
        // Provide a RecordWriter implementation to write the output.
        job.setOutputFormatClass(TextOutputFormat.class);
        // Input HDFS path
        FileInputFormat.setInputPaths(job, args[0]);
        // Output HDFS path
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        // Submit the job and wait for it to finish
        // System.exit(job.waitForCompletion(true) ? 0 : 1);
        job.waitForCompletion(true);
    }
}
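As a quick local sanity check of the key's sort order, the following minimal sketch sorts a few sample pairs the same way the shuffle would (assumes the class above is compiled on the classpath; IntPairSortCheck is a hypothetical helper, not part of the job):
package secondarySort;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import secondarySort.SecondarySort_Demo.IntPair;
public class IntPairSortCheck {
    public static void main(String[] args) {
        // Build IntPair keys from the sample input
        int[][] data = {{3, 5}, {5, 89}, {7, 63}, {5, 56}, {3, 9}, {3, 1}};
        List<IntPair> keys = new ArrayList<IntPair>();
        for (int[] d : data) {
            IntPair p = new IntPair();
            p.set(d[0], d[1]);
            keys.add(p);
        }
        Collections.sort(keys);  // uses IntPair.compareTo: first ascending, then second ascending
        for (IntPair p : keys) {
            System.out.println(p.getFirst() + "\t" + p.getSecond());
        }
    }
}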
3. Package and run:
hadoop jar secondarysort.jar /user/hadoop/secondarysort.txt /user/hadoop/output
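This invocation assumes the jar's manifest declares the driver as its Main-Class; if the jar was built without one, pass the fully qualified class name explicitly:
hadoop jar secondarysort.jar secondarySort.SecondarySort_Demo /user/hadoop/secondarysort.txt /user/hadoop/output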
4. Output directory listing:
[hadoop@hadoop ~]$ hdfs dfs -ls /user/hadoop/output
Found 77 items
-rw-r--r-- 3 hadoop supergroup 0 2015-08-30 17:22 /user/hadoop/output/_SUCCESS
-rw-r--r-- 3 hadoop supergroup 0 2015-08-30 17:22 /user/hadoop/output/part-r-00000
-rw-r--r-- 3 hadoop supergroup 17 2015-08-30 17:22 /user/hadoop/output/part-r-00001
-rw-r--r-- 3 hadoop supergroup 0 2015-08-30 17:22 /user/hadoop/output/part-r-00002
-rw-r--r-- 3 hadoop supergroup 0 2015-08-30 17:22 /user/hadoop/output/part-r-00003
-rw-r--r-- 3 hadoop supergroup 0 2015-08-30 17:22 /user/hadoop/output/part-r-00004
-rw-r--r-- 3 hadoop supergroup 0 2015-08-30 17:22 /user/hadoop/output/part-r-00005
-rw-r--r-- 3 hadoop supergroup 0 2015-08-30 17:22 /user/hadoop/output/part-r-00006
-rw-r--r-- 3 hadoop supergroup 0 2015-08-30 17:22 /user/hadoop/output/part-r-00007
-rw-r--r-- 3 hadoop supergroup 0 2015-08-30 17:22 /user/hadoop/output/part-r-00008
-rw-r--r-- 3 hadoop supergroup 0 2015-08-30 17:22 /user/hadoop/output/part-r-00009
-rw-r--r-- 3 hadoop supergroup 0 2015-08-30 17:22 /user/hadoop/output/part-r-00010
-rw-r--r-- 3 hadoop supergroup 0 2015-08-30 17:22 /user/hadoop/output/part-r-00011
-rw-r--r-- 3 hadoop supergroup 0 2015-08-30 17:22 /user/hadoop/output/part-r-00012
-rw-r--r-- 3 hadoop supergroup 0 2015-08-30 17:22 /user/hadoop/output/part-r-00013
-rw-r--r-- 3 hadoop supergroup 0 2015-08-30 17:22 /user/hadoop/output/part-r-00014
-rw-r--r-- 3 hadoop supergroup 0 2015-08-30 17:22 /user/hadoop/output/part-r-00015
-rw-r--r-- 3 hadoop supergroup 0 2015-08-30 17:22 /user/hadoop/output/part-r-00016
-rw-r--r-- 3 hadoop supergroup 0 2015-08-30 17:22 /user/hadoop/output/part-r-00017
-rw-r--r-- 3 hadoop supergroup 0 2015-08-30 17:22 /user/hadoop/output/part-r-00018
-rw-r--r-- 3 hadoop supergroup 0 2015-08-30 17:22 /user/hadoop/output/part-r-00019
-rw-r--r-- 3 hadoop supergroup 0 2015-08-30 17:22 /user/hadoop/output/part-r-00020
-rw-r--r-- 3 hadoop supergroup 0 2015-08-30 17:22 /user/hadoop/output/part-r-00021
-rw-r--r-- 3 hadoop supergroup 0 2015-08-30 17:22 /user/hadoop/output/part-r-00022
-rw-r--r-- 3 hadoop supergroup 0 2015-08-30 17:22 /user/hadoop/output/part-r-00023
-rw-r--r-- 3 hadoop supergroup 0 2015-08-30 17:22 /user/hadoop/output/part-r-00024
-rw-r--r-- 3 hadoop supergroup 0 2015-08-30 17:22 /user/hadoop/output/part-r-00025
-rw-r--r-- 3 hadoop supergroup 0 2015-08-30 17:22 /user/hadoop/output/part-r-00026
-rw-r--r-- 3 hadoop supergroup 20 2015-08-30 17:22 /user/hadoop/output/part-r-00027
-rw-r--r-- 3 hadoop supergroup 0 2015-08-30 17:22 /user/hadoop/output/part-r-00028
-rw-r--r-- 3 hadoop supergroup 0 2015-08-30 17:22 /user/hadoop/output/part-r-00029
-rw-r--r-- 3 hadoop supergroup 0 2015-08-30 17:22 /user/hadoop/output/part-r-00030
-rw-r--r-- 3 hadoop supergroup 0 2015-08-30 17:22 /user/hadoop/output/part-r-00031
-rw-r--r-- 3 hadoop supergroup 0 2015-08-30 17:22 /user/hadoop/output/part-r-00032
-rw-r--r-- 3 hadoop supergroup 0 2015-08-30 17:22 /user/hadoop/output/part-r-00033
-rw-r--r-- 3 hadoop supergroup 0 2015-08-30 17:22 /user/hadoop/output/part-r-00034
-rw-r--r-- 3 hadoop supergroup 0 2015-08-30 17:22 /user/hadoop/output/part-r-00035
-rw-r--r-- 3 hadoop supergroup 0 2015-08-30 17:22 /user/hadoop/output/part-r-00036
-rw-r--r-- 3 hadoop supergroup 0 2015-08-30 17:22 /user/hadoop/output/part-r-00037
-rw-r--r-- 3 hadoop supergroup 0 2015-08-30 17:22 /user/hadoop/output/part-r-00038
-rw-r--r-- 3 hadoop supergroup 0 2015-08-30 17:22 /user/hadoop/output/part-r-00039
-rw-r--r-- 3 hadoop supergroup 0 2015-08-30 17:22 /user/hadoop/output/part-r-00040
-rw-r--r-- 3 hadoop supergroup 0 2015-08-30 17:22 /user/hadoop/output/part-r-00041
-rw-r--r-- 3 hadoop supergroup 0 2015-08-30 17:22 /user/hadoop/output/part-r-00042
-rw-r--r-- 3 hadoop supergroup 0 2015-08-30 17:22 /user/hadoop/output/part-r-00043
-rw-r--r-- 3 hadoop supergroup 0 2015-08-30 17:22 /user/hadoop/output/part-r-00044
-rw-r--r-- 3 hadoop supergroup 0 2015-08-30 17:22 /user/hadoop/output/part-r-00045
-rw-r--r-- 3 hadoop supergroup 0 2015-08-30 17:22 /user/hadoop/output/part-r-00046
-rw-r--r-- 3 hadoop supergroup 0 2015-08-30 17:22 /user/hadoop/output/part-r-00047
-rw-r--r-- 3 hadoop supergroup 0 2015-08-30 17:22 /user/hadoop/output/part-r-00048
-rw-r--r-- 3 hadoop supergroup 0 2015-08-30 17:22 /user/hadoop/output/part-r-00049
-rw-r--r-- 3 hadoop supergroup 0 2015-08-30 17:22 /user/hadoop/output/part-r-00050
-rw-r--r-- 3 hadoop supergroup 0 2015-08-30 17:22 /user/hadoop/output/part-r-00051
-rw-r--r-- 3 hadoop supergroup 0 2015-08-30 17:22 /user/hadoop/output/part-r-00052
-rw-r--r-- 3 hadoop supergroup 24 2015-08-30 17:22 /user/hadoop/output/part-r-00053
-rw-r--r-- 3 hadoop supergroup 0 2015-08-30 17:22 /user/hadoop/output/part-r-00054
-rw-r--r-- 3 hadoop supergroup 0 2015-08-30 17:22 /user/hadoop/output/part-r-00055
-rw-r--r-- 3 hadoop supergroup 0 2015-08-30 17:22 /user/hadoop/output/part-r-00056
-rw-r--r-- 3 hadoop supergroup 0 2015-08-30 17:22 /user/hadoop/output/part-r-00057
-rw-r--r-- 3 hadoop supergroup 0 2015-08-30 17:22 /user/hadoop/output/part-r-00058
-rw-r--r-- 3 hadoop supergroup 0 2015-08-30 17:22 /user/hadoop/output/part-r-00059
-rw-r--r-- 3 hadoop supergroup 0 2015-08-30 17:22 /user/hadoop/output/part-r-00060
-rw-r--r-- 3 hadoop supergroup 0 2015-08-30 17:22 /user/hadoop/output/part-r-00061
-rw-r--r-- 3 hadoop supergroup 0 2015-08-30 17:22 /user/hadoop/output/part-r-00062
-rw-r--r-- 3 hadoop supergroup 0 2015-08-30 17:22 /user/hadoop/output/part-r-00063
-rw-r--r-- 3 hadoop supergroup 0 2015-08-30 17:22 /user/hadoop/output/part-r-00064
-rw-r--r-- 3 hadoop supergroup 0 2015-08-30 17:22 /user/hadoop/output/part-r-00065
-rw-r--r-- 3 hadoop supergroup 0 2015-08-30 17:22 /user/hadoop/output/part-r-00066
-rw-r--r-- 3 hadoop supergroup 0 2015-08-30 17:22 /user/hadoop/output/part-r-00067
-rw-r--r-- 3 hadoop supergroup 0 2015-08-30 17:22 /user/hadoop/output/part-r-00068
-rw-r--r-- 3 hadoop supergroup 0 2015-08-30 17:22 /user/hadoop/output/part-r-00069
-rw-r--r-- 3 hadoop supergroup 0 2015-08-30 17:22 /user/hadoop/output/part-r-00070
-rw-r--r-- 3 hadoop supergroup 0 2015-08-30 17:22 /user/hadoop/output/part-r-00071
-rw-r--r-- 3 hadoop supergroup 0 2015-08-30 17:22 /user/hadoop/output/part-r-00072
-rw-r--r-- 3 hadoop supergroup 0 2015-08-30 17:22 /user/hadoop/output/part-r-00073
-rw-r--r-- 3 hadoop supergroup 0 2015-08-30 17:22 /user/hadoop/output/part-r-00074
-rw-r--r-- 3 hadoop supergroup 0 2015-08-30 17:22 /user/hadoop/output/part-r-00075
5. Sorted output:
[hadoop@hadoop ~]$ hdfs dfs -text /user/hadoop/output/part-r-00001
3 1
3 5
3 9
3 24
[hadoop@hadoop ~]$ hdfs dfs -text /user/hadoop/output/part-r-00027
5 18
5 23
5 56
5 89
[hadoop@hadoop ~]$ hdfs dfs -text /user/hadoop/output/part-r-00053
7 4
7 26
7 45
7 63
7 63
[hadoop@hadoop ~]$
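Only three of the 76 part files are non-empty because FirstPartitioner routes all records with the same first field to one reducer. The code never calls setNumReduceTasks, so the part-r-00000 through part-r-00075 listing implies the cluster's configured default here was 76 reducers. The following sketch reproduces the partition arithmetic (PartitionCheck is a hypothetical helper):
public class PartitionCheck {
    public static void main(String[] args) {
        int numPartitions = 76;  // implied by the part-r-00000..00075 listing above
        for (int first : new int[] {3, 5, 7}) {
            // Same formula as FirstPartitioner.getPartition
            int partition = Math.abs(first * 127) % numPartitions;
            System.out.printf("first=%d -> part-r-%05d%n", first, partition);
        }
        // Prints: first=3 -> part-r-00001, first=5 -> part-r-00027, first=7 -> part-r-00053,
        // matching the three non-empty files shown above.
    }
}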