Approach: as before, the text files contain one value per line. What makes sorting different is that the default sort in the shuffle phase orders records by the key emitted by the map phase: integer keys are sorted in ascending order, string keys are compared character by character by ASCII code. On its own that is not enough here, because each reducer only sorts the keys it receives, so there is no guarantee of a global order. Therefore, before keys are handed to the reducers, we need a custom Partitioner class that sends all keys from the same value range to the same reducer; then the sorting done inside each reducer is also ordered across reducers as a whole. The remaining question is how to partition. Divide the maximum key value (an assumed upper bound in the code below) by the number of reducers to get a bound; which multiple of bound the key falls under determines a partition index, and returning that index assigns the key to a specific reducer (sketched below).
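For example, with 3 reducers and the upper bound of 65223 used in the Partition class further down, the key ranges work out roughly like this (illustrative sketch only, non-negative keys assumed):

int numPartitions = 3;
int bound = 65223 / numPartitions + 1;   // 21742
// keys     0 .. 21741  -> reducer 0
// keys 21742 .. 43483  -> reducer 1
// keys 43484 .. 65225  -> reducer 2 (plus anything above the assumed bound)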
Input text files:
data1.txt:
1
2
3
4
55
2
3
1
data2.txt:
11
22
33
12
11
2
4
8
data3.txt:
0
-1
-2
-1
1888
1991919
1
2
123
12
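For reference, with a single reducer the job below writes one tab-separated line per input value, numbering them in globally sorted order. For the data above the output would begin and end like this (illustrative; with several reducers each partition file starts its own counter at 1):
1	-2
2	-1
3	-1
4	0
5	1
6	1
7	1
8	2
...
26	1991919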
MainClass:
package sort;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
/**
 * Data sorting: produce a globally sorted, ranked list of the input integers.
 *
 * @author huangpeng
 */
public class MainFunc extends Configured implements Tool {

    public static void main(String[] args) throws Exception {
        int ret = ToolRunner.run(new MainFunc(), args);
        System.exit(ret);
    }

    @Override
    public int run(String[] args) throws Exception {
        Job job = new Job(getConf());
        job.setJarByClass(MainFunc.class);
        job.setJobName("datasort");

        // both map and reduce emit IntWritable key/value pairs
        job.setOutputKeyClass(IntWritable.class);
        job.setOutputValueClass(IntWritable.class);

        job.setMapperClass(Map.class);
        job.setReducerClass(Reduce.class);
        // custom partitioner that assigns each key range to one reducer
        job.setPartitionerClass(Partition.class);

        job.setInputFormatClass(TextInputFormat.class);
        job.setOutputFormatClass(TextOutputFormat.class);

        FileInputFormat.setInputPaths(job, new Path("src/sort/data1.txt"),
                new Path("src/sort/data2.txt"), new Path("src/sort/data3.txt"));
        FileOutputFormat.setOutputPath(job, new Path("rst3"));

        boolean success = job.waitForCompletion(true);
        return success ? 0 : 1;
    }
}
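Note that the driver above never sets the number of reduce tasks, so with the default single reducer the custom Partitioner has no visible effect. To actually exercise the range partitioning, the reducer count can be set explicitly in run() before submitting the job (3 is an arbitrary value chosen here for illustration):

job.setNumReduceTasks(3);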
Map:
package sort;
import java.io.IOException;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
public class Map extends Mapper<Object, Text, IntWritable, IntWritable> {

    // reusable writable holding the parsed integer key
    private static IntWritable data = new IntWritable();

    @Override
    protected void map(Object key, Text value, Context context)
            throws IOException, InterruptedException {
        // each input line holds a single integer; emit it as the key so the
        // shuffle sorts by it, with a constant 1 as a placeholder value
        String line = value.toString().trim();
        data.set(Integer.parseInt(line));
        context.write(data, new IntWritable(1));
    }
}
Reduce:
package sort;
import java.io.IOException;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.mapreduce.Reducer;
public class Reduce extends
        Reducer<IntWritable, IntWritable, IntWritable, IntWritable> {

    // Line-number (rank) counter. It lives inside a single reduce task's JVM,
    // so with more than one reducer each output file restarts counting at 1.
    private static IntWritable linenum = new IntWritable(1);

    @Override
    protected void reduce(IntWritable key, Iterable<IntWritable> values,
            Context context) throws IOException, InterruptedException {
        // emit one (rank, value) pair per occurrence of the key
        for (IntWritable val : values) {
            context.write(linenum, key);
            linenum = new IntWritable(linenum.get() + 1);
        }
    }
}
Custom Partitioner (Partition):
package sort;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.mapreduce.Partitioner;
public class Partition extends Partitioner<IntWritable, IntWritable> {

    @Override
    public int getPartition(IntWritable key, IntWritable value,
            int numPartitions) {
        // assumed upper bound of the key range; each reducer handles one
        // slice of [0, MaxNumber] of width "bound"
        int MaxNumber = 65223;
        int bound = MaxNumber / numPartitions + 1;
        int keynumber = key.get();
        // clamp keys outside the assumed range: negative values (e.g. -2 in
        // data3.txt) go to the first partition and oversized values (e.g.
        // 1991919) go to the last one, so the partitions stay globally ordered
        if (keynumber < 0) {
            return 0;
        }
        if (keynumber >= bound * numPartitions) {
            return numPartitions - 1;
        }
        return keynumber / bound;
    }
}
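As a quick sanity check of the range split (illustrative only; 3 partitions assumed, matching the ranges sketched at the top):

Partition partition = new Partition();
IntWritable one = new IntWritable(1);
partition.getPartition(new IntWritable(5), one, 3);        // -> 0
partition.getPartition(new IntWritable(30000), one, 3);    // -> 1
partition.getPartition(new IntWritable(60000), one, 3);    // -> 2
partition.getPartition(new IntWritable(-2), one, 3);       // -> 0 (clamped)
partition.getPartition(new IntWritable(1991919), one, 3);  // -> 2 (clamped)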