《Hadoop in Action》上面的练习。
刚开始考虑,每个分片应该是在所有的记录处理完之后,把这个分片的前K大输出给reduce,但是map函数是每个记录调用一遍,那怎么在所有记录调用完之后再做处理呢?
- setup(),此方法被MapReduce框架仅且执行一次,在执行Map任务前,进行相关变量或者资源的集中初始化工作。若是将资源初始化工作放在方法map()中,导致Mapper任务在解析每一行输入时都会进行资源初始化工作,导致重复,程序运行效率不高!
- cleanup(),此方法被MapReduce框架仅且执行一次,在执行完毕Map任务后,进行相关变量或资源的释放工作。若是将释放资源工作放入方法map()中,也会导致Mapper任务在解析、处理每一行文本后释放资源,而且在下一行文本解析前还要重复初始化,导致反复重复,程序运行效率不高!
所以就是重写cleanup,于是问题就解决了,TreeMap用来存前K大
import java.io.IOException;
import java.util.Map;
import java.util.TreeMap;
import java.util.regex.Pattern;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hdfs.server.namenode.FileDataServlet;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.KeyValueTextInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import com.sun.xml.internal.org.jvnet.fastinfoset.VocabularyApplicationData;
import sun.launcher.resources.launcher;
public class TopK extends Configured implements Tool {
public final static Integer K=10;
public static class MapClass extends Mapper<LongWritable, Text, IntWritable, Text>{
TreeMap<Integer, String> map=new TreeMap<Integer,String>();
public void map(LongWritable key,Text value,Context context)throws IOException,InterruptedException{
String fields []=value.toString().split(",");
String city=fields[4];
if(Pattern.matches("\\d+", fields[8])){
Integer num=Integer.parseInt(fields[8]);
map.put(num, city);
if(map.size()>K){
map.remove(map.firstKey());
}
}
}
@Override
protected void cleanup(Mapper<LongWritable, Text, IntWritable, Text>.Context context)throws IOException,InterruptedException{
for(Integer num:map.keySet()){
context.write(new IntWritable(num), new Text(map.get(num)));
}
}
}
public static class Reduce extends Reducer<IntWritable, Text, IntWritable, Text>{
TreeMap<Integer, String> map=new TreeMap<Integer,String>();
public void reduce(IntWritable key,Iterable<Text> value,Context context)throws IOException,InterruptedException{
for(Text text : value){
map.put(key.get(), text.toString());
if(map.size()>K){
map.remove(map.firstKey());
}
}
}
@Override
protected void cleanup(Reducer<IntWritable, Text, IntWritable, Text>.Context context)throws IOException,InterruptedException{
for(Integer num:map.keySet()){
context.write(new IntWritable(num), new Text(map.get(num)));
}
}
}
@Override
public int run(String[] arg0) throws Exception,InterruptedException {
// TODO Auto-generated method stub
Configuration configuration=getConf();
Job job=new Job(configuration,"MyJob");
job.setJarByClass(TopK.class);
FileInputFormat.setInputPaths(job, new Path(arg0[0]));
FileOutputFormat.setOutputPath(job, new Path(arg0[1]));
job.setMapperClass(MapClass.class);
job.setCombinerClass(Reduce.class);
job.setReducerClass(Reduce.class);
//job.setInputFormatClass(TextInputFormat.class);
//job.setOutputFormatClass(TextOutputFormat.class);
job.setOutputKeyClass(IntWritable.class);
job.setOutputValueClass(Text.class);
System.exit(job.waitForCompletion(true)?0:1);
return 0;
}
public static void main(String[] args)throws Exception {
// TODO Auto-generated method stub
int res=ToolRunner.run(new Configuration(),new TopK(),args);
System.exit(res);
}
}