通过 TreeSet 取出 TOP N 的数据。下面的程序是有缺陷的:TreeSet 本身不保存重复元素,输入中相同的数值只会被保留一次。
另外使用了 cleanup 方法。setup 和 cleanup 在 MapReduce 的每个任务中各只会跑一次;从 input 读取数据之后,map 默认按行一行一行地读取,也就是对每一行循环调用,直到读完数据。所以一些初始化工作可以放到 setup 里去做,cleanup 用来清理一些变量。既然 cleanup 只执行一次,那我就在 cleanup 里去除多余数据,只取我要的 TOP N,再发送给 reduce。实际上这个程序要不要 reduce 无所谓。
import java.io.IOException;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.StringTokenizer;
import java.util.TreeMap;
import java.util.TreeSet;
import org.apache.commons.net.nntp.NewsgroupInfo;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.TextInputFormat;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.Reducer.Context;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.SplitLineReader;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
public class TopTenOrder {
public static class TokenizerMapper extends Mapper<Object, Text, NullWritable, IntWritable> {
private TreeSet<Integer> top10 = new TreeSet<Integer>();
public void map(Object key, Text value, Context context) throws IOException, InterruptedException {
top10.add(Integer.parseInt(value.toString()));
}
public void cleanup(Context context) throws IOException, InterruptedException {
while(top10.size() > 10){
top10.remove(top10.first());
}
Iterator iterator = top10.iterator();
while(iterator.hasNext()){
context.write(NullWritable.get(), new IntWritable(Integer.parseInt(iterator.next().toString())));
}
}
}
public static class IntSumReducer extends Reducer<NullWritable, IntWritable, NullWritable, IntWritable> {
public void reduce(Text key, Iterable<IntWritable> value, Context context)
throws IOException, InterruptedException {
for(IntWritable val : value){
context.write(NullWritable.get(), val);
}
}
}
public static void main(String[] args) throws Exception {
Configuration conf = new Configuration();
Job job = Job.getInstance(conf, "TopTenOrder");
job.setJarByClass(TopTenOrder.class);
job.setMapperClass(TokenizerMapper.class);
// job.setCombinerClass(IntSumReducer.class);
job.setReducerClass(IntSumReducer.class);
job.setOutputKeyClass(NullWritable.class);
job.setOutputValueClass(IntWritable.class);
job.setNumReduceTasks(1);
FileInputFormat.addInputPath(job, new Path(args[0]));
FileOutputFormat.setOutputPath(job, new Path(args[1]));
System.exit(job.waitForCompletion(true) ? 0 : 1);
}
}