MapReduce实现TopK

《Hadoop in Action》上面的练习。

刚开始考虑:每个分片应该在所有记录处理完之后,把这个分片的前K大输出给reduce。但是map函数是每条记录调用一次,那么怎么在所有记录都处理完之后再输出呢?

  • setup(),此方法被MapReduce框架仅执行一次,在执行Map任务前进行相关变量或资源的集中初始化工作。若是将资源初始化工作放在方法map()中,Mapper任务在解析每一行输入时都会重复进行初始化,程序运行效率不高!
  • cleanup(),此方法被MapReduce框架仅执行一次,在Map任务执行完毕后进行相关变量或资源的释放工作。若是将释放资源的工作放入方法map()中,Mapper任务在解析、处理每一行文本后都会释放资源,而且在解析下一行文本前还要重新初始化,如此反复,程序运行效率不高!

所以只要重写cleanup(),在其中输出该分片的前K大,问题就解决了;TreeMap用来保存前K大的记录。

import java.io.IOException;
import java.util.Map;
import java.util.TreeMap;
import java.util.regex.Pattern;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hdfs.server.namenode.FileDataServlet;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;

import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.KeyValueTextInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;

import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

import com.sun.xml.internal.org.jvnet.fastinfoset.VocabularyApplicationData;

import sun.launcher.resources.launcher;

public class TopK extends Configured implements Tool {

	public final static Integer K=10;
	public static class MapClass extends Mapper<LongWritable, Text, IntWritable, Text>{
		
		TreeMap<Integer, String> map=new TreeMap<Integer,String>();
		public void map(LongWritable key,Text value,Context context)throws IOException,InterruptedException{
			
			String fields []=value.toString().split(",");
			String city=fields[4];
			if(Pattern.matches("\\d+", fields[8])){
				Integer num=Integer.parseInt(fields[8]);
				map.put(num, city);
				if(map.size()>K){
					map.remove(map.firstKey());
				}
			}
		}
		@Override
		protected void cleanup(Mapper<LongWritable, Text, IntWritable, Text>.Context context)throws IOException,InterruptedException{
			
			for(Integer num:map.keySet()){
				context.write(new IntWritable(num), new Text(map.get(num)));
			}
		}
	}
	public static class Reduce  extends Reducer<IntWritable, Text, IntWritable, Text>{
		
		TreeMap<Integer, String> map=new TreeMap<Integer,String>();
		public void reduce(IntWritable key,Iterable<Text> value,Context context)throws IOException,InterruptedException{

			for(Text text : value){
				map.put(key.get(), text.toString());
				if(map.size()>K){
					map.remove(map.firstKey());
				}
			}
		}
		@Override
		protected void cleanup(Reducer<IntWritable, Text, IntWritable, Text>.Context context)throws IOException,InterruptedException{
			
			for(Integer num:map.keySet()){
				context.write(new IntWritable(num), new Text(map.get(num)));
			}
			
		}
	}
	@Override
	public int run(String[] arg0) throws Exception,InterruptedException {
		// TODO Auto-generated method stub
		
		Configuration configuration=getConf();
		Job job=new Job(configuration,"MyJob");
		
		job.setJarByClass(TopK.class);
		
		FileInputFormat.setInputPaths(job, new Path(arg0[0]));
		FileOutputFormat.setOutputPath(job, new Path(arg0[1]));
		
		job.setMapperClass(MapClass.class);
		job.setCombinerClass(Reduce.class);
		job.setReducerClass(Reduce.class);
		
		//job.setInputFormatClass(TextInputFormat.class);
		//job.setOutputFormatClass(TextOutputFormat.class);
		job.setOutputKeyClass(IntWritable.class);
		job.setOutputValueClass(Text.class);
		System.exit(job.waitForCompletion(true)?0:1);
		
		return 0;
	}

	public static void main(String[] args)throws Exception {
		// TODO Auto-generated method stub
		
		int res=ToolRunner.run(new Configuration(),new TopK(),args);
		System.exit(res);

	}

}



  • 3
    点赞
  • 5
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值