MapReduce-TopK

In the previous post we wrote a simple MapReduce program that only overrode the map() and reduce() methods, but Mapper and Reducer also expose cleanup(Context context), setup(Context context), and run(Context context), which can be overridden as well.
In this example we explain what cleanup(Context context) and setup(Context context) do and how they are used here, and then walk through a MapReduce implementation of TopK. Depending on the requirements, TopK is implemented in two ways: one that only cares about the top K values themselves, and one that outputs not just the top K values but also the information associated with them.

First, what setup(Context context) and cleanup(Context context) are for.
The following is taken from the Hadoop 1.2.1 source, in the org.apache.hadoop.mapreduce.Mapper class:

/**
   * Called once at the beginning of the task.
   */
  protected void setup(Context context ) throws IOException, InterruptedException {
    // NOTHING
  }
  /**
   * Called once at the end of the task.
   */
  protected void cleanup(Context context ) throws IOException, InterruptedException {
    // NOTHING
  }
As the comments say, setup() is called once before the task begins and cleanup() is called once after the task ends.
These two methods can therefore be used to initialize variables, distribute files, or release resources once the task has finished.
In this example we use them to handle the K value for TopK; later posts will show other uses.


The following is another important method of the Mapper class. From run() you can see that once the task starts, setup(context) is called first; then map() is called repeatedly as long as there is a next key/value pair; finally cleanup(context) is called. That is the order in which a map task executes.
We won't go into nextKeyValue() here; it is covered in detail in the later post on customizing RecordReader, see MapReduce-xml文件的处理-定制InputFormat及定制RecordReader.

 /**
   * Expert users can override this method for more complete control over the
   * execution of the Mapper.
   * @param context
   * @throws IOException
   */
  public void run(Context context) throws IOException, InterruptedException {
    setup(context);
    try {
      // the map loop: map() is driven from here, repeatedly pulling records from the RecordReader
      while (context.nextKeyValue()) {
        // context.getCurrentKey() delegates to the underlying RecordReader through MapContext:
        /**
         * @Override
         * public KEYIN getCurrentKey() throws IOException, InterruptedException {
         *   return reader.getCurrentKey();
         * }
         */
        map(context.getCurrentKey(), context.getCurrentValue(), context);
      }
    } finally {
      cleanup(context);
    }
  }
Now let's write the program for the first TopK scenario.
Test data:
uid,name,cost
1,mr1,3234
2,mr2,123
3,mr3,9877
4,mr4,348
5,mr5,12345
6,mr6,6646
7,mr7,98
8,mr8,12345
The task: find the top 3 highest spending amounts.
Map phase:
import java.io.IOException;
import java.util.Arrays;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

public class TopKMapper extends Mapper<LongWritable, Text, IntWritable, IntWritable> {
	
	int len;    // K, the number of top values to keep
	int[] tmp;  // current top K values; tmp[0] is the eviction slot

	/**
	 * Called once when the map task starts
	 */
	@Override
	protected void setup( Context context)
			throws IOException, InterruptedException {
		/**
		 * Read the K value for TopK from the Configuration; it was set when the job was submitted
		 */
		len = context.getConfiguration().getInt("K", 10);
		/**
		 * Create an array of length K+1 to hold the top K values; index 0 is the eviction slot
		 */
		tmp = new int[len + 1];
	}

	@Override
	protected void map(LongWritable key, Text value, Context context)
			throws IOException, InterruptedException {
		String line = value.toString();
		String[] arr = line.split(",");
		if(arr.length == 3) {
			addParm(Integer.valueOf(arr[2]));
		}
	}
	/**
	 * Put the new value into slot 0 and sort the array in ascending order;
	 * the smallest value ends up back in slot 0 and is overwritten on the next call,
	 * so tmp[1..K] always holds the K largest values seen so far
	 */
	private void addParm(int parm) {
		tmp[0] = parm;
		Arrays.sort(tmp);
	}
	
	@Override
	protected void cleanup(Context context)
			throws IOException, InterruptedException {
		/**
		 * Emit this mapper's local top K (tmp[1..K]); each map task sends its own
		 * top K values to the reduce side, which merges them into the global top K
		 */
		for( int i = 1 ; i <= len ; i ++) {
			context.write(new IntWritable(tmp[i]), new IntWritable(tmp[i]));
		}
	}
}
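To make the eviction trick in addParm() concrete, here is a tiny standalone trace of how the array evolves. It is not part of the job itself, just an illustration assuming K = 3 and a few of the sample costs:

import java.util.Arrays;

public class AddParmTrace {
	public static void main(String[] args) {
		int len = 3;                  // K
		int[] tmp = new int[len + 1]; // tmp[0] is the eviction slot
		for (int cost : new int[] {3234, 123, 9877, 12345, 98}) {
			tmp[0] = cost;            // overwrite the smallest value kept so far
			Arrays.sort(tmp);         // ascending sort pushes the new smallest back into tmp[0]
		}
		// tmp[1..3] now holds the three largest values seen: 3234, 9877, 12345
		System.out.println(Arrays.toString(tmp)); // [98, 3234, 9877, 12345]
	}
}

Note that tmp starts filled with zeros, so the array only reflects the true top K once at least K non-negative values have been inserted; that is fine here because the costs are positive.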
Reduce phase:

import java.io.IOException;
import java.util.Arrays;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.mapreduce.Reducer;

public class TopKReducer extends Reducer<IntWritable, IntWritable, IntWritable, IntWritable> {
	int len;    // K
	int[] tmp;  // current top K values; tmp[0] is the eviction slot
	
	@Override
	protected void setup( Context context)
			throws IOException, InterruptedException {
		len = context.getConfiguration().getInt("K", 10);
		tmp = new int[len + 1];
	}

	@Override
	protected void reduce(IntWritable key, Iterable<IntWritable> values, Context context)
			throws IOException, InterruptedException {
		// merge every candidate value coming from the map side into the reducer's top-K array
		for(IntWritable value: values) {
			addParm(value.get());
		}
	}

	private void addParm(int parm) {
		tmp[0] = parm;
		Arrays.sort(tmp);
	}
	
	@Override
	protected void cleanup( Context context)
			throws IOException, InterruptedException {
		// emit the final top K: the key is the rank (1 = largest), the value is the amount
		for( int i = len; i > 0 ; i --) {
			context.write(new IntWritable(len - i + 1), new IntWritable(tmp[i]));
		}
	}
}
Driver:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class JobMain {
	public static void main(String[] args) throws Exception {
		Configuration configuration = new Configuration();
		/**
		 * Put the command-line argument into the Configuration; map and reduce tasks
		 * can then read it back from the Configuration. This is one of the standard
		 * ways of passing parameters to Hadoop tasks.
		 */
		configuration.set("K", args[2]);
		Job job = new Job(configuration, "topK-job");
		job.setJarByClass(JobMain.class);
		job.setMapperClass(TopKMapper.class);
		job.setMapOutputKeyClass(IntWritable.class);
		job.setMapOutputValueClass(IntWritable.class);
		job.setReducerClass(TopKReducer.class);
		job.setOutputKeyClass(IntWritable.class);
		job.setOutputValueClass(IntWritable.class);
		FileInputFormat.addInputPath(job, new Path(args[0]));
		Path outputDir = new Path(args[1]);
		FileSystem fs = FileSystem.get(configuration);
		if(fs.exists(outputDir)) {
			fs.delete(outputDir, true);
		}
		FileOutputFormat.setOutputPath(job, outputDir);
		System.exit(job.waitForCompletion(true)? 0: 1);
	}
}
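The driver expects three arguments: the input path, the output directory, and K. A typical invocation might look like this (the jar name and HDFS paths below are only placeholders; adjust them to your environment):

hadoop jar topk.jar JobMain /input/cost.txt /output/topk 3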


Test results (shown as a screenshot in the original post):
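Tracing the code by hand with the sample data and K = 3 (and assuming the uid,name,cost header line is not actually present in the input file, since Integer.valueOf("cost") would throw a NumberFormatException), the reducer's cleanup() should write the rank followed by the amount, roughly:

1	12345
2	12345
3	9877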


For the second scenario the test data is the same as above.
This time the task is to find the top 3 spending amounts together with the name of each spender.

Map phase:

import java.io.IOException;
import java.util.Map.Entry;
import java.util.Comparator;
import java.util.Set;
import java.util.TreeMap;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

public class TopKTreeMapper extends
		Mapper<LongWritable, Text, IntWritable, Text> {

	private int len;                              // K
	private TreeMap<Integer, String> tmp;         // cost -> name, kept in descending cost order
	private IntWritable key = new IntWritable();  // reusable output key (cost)
	private Text value = new Text();              // reusable output value (name)
	/**
	 * Called once when the map task starts
	 */
	@Override
	protected void setup(Context context) throws IOException,
			InterruptedException {
		/**
		 * Read the K value for TopK from the Configuration
		 */
		len = context.getConfiguration().getInt("K", 10);
		// keys are sorted in descending order, so the first entry is the largest cost
		// and lastKey() is the smallest one kept so far
		tmp = new TreeMap<Integer, String>(new Comparator<Integer>() {
			@Override
			public int compare(Integer o1, Integer o2) {
				return o2 - o1;
			}
		});
	}

	@Override
	protected void map(LongWritable key, Text value, Context context)
			throws IOException, InterruptedException {
		String line = value.toString();
		String[] arr = line.split(",");
		if (arr.length == 3) {
			addParm(Integer.valueOf(arr[2]), arr[1]);
		}
	}

	/**
	 * Put the (cost, name) pair into the TreeMap; keys are sorted in descending order,
	 * so lastKey() is the smallest cost and is removed once the map grows beyond K entries.
	 * Note that equal costs collide on the same key, so only the latest name survives.
	 */
	private void addParm(int parm, String name) {
		tmp.put(parm, name);
		if (tmp.size() > len) {
			tmp.remove(tmp.lastKey());
		}
	}

	@Override
	protected void cleanup(Context context) throws IOException,
			InterruptedException {
		// emit this mapper's local top K as (cost, name) pairs, largest cost first
		Set<Entry<Integer, String>> set = tmp.entrySet();
		for (Entry<Integer, String> entry : set) {
			key.set(entry.getKey());
			value.set(entry.getValue());
			context.write(key, value);
		}
	}
}
Reduce phase:

import java.io.IOException;
import java.util.Map.Entry;
import java.util.Comparator;
import java.util.Set;
import java.util.TreeMap;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

public class TopKTreeReducer extends Reducer<IntWritable, Text, Text, IntWritable> {
	int len;                                      // K
	private TreeMap<Integer, String> tmp;         // cost -> name, kept in descending cost order
	private IntWritable key = new IntWritable();  // reusable Writable for the cost
	private Text value = new Text();              // reusable Writable for the name
	
	@Override
	protected void setup( Context context)
			throws IOException, InterruptedException {
		len = context.getConfiguration().getInt("K", 10);
		// same descending-order comparator as on the map side
		tmp = new TreeMap<Integer, String>(new Comparator<Integer>() {
			@Override
			public int compare(Integer o1, Integer o2) {
				return o2 - o1;
			}
		});
	}

	@Override
	protected void reduce(IntWritable key, Iterable<Text> values, Context context)
			throws IOException, InterruptedException {
		// merge every (cost, name) pair coming from the map side into the reducer's TreeMap
		for(Text value : values) {
			addParm(key.get(), value.toString());
		}
	}

	private void addParm(int parm, String name) {
		tmp.put(parm, name);
		if (tmp.size() > len) {
			tmp.remove(tmp.lastKey());
		}
	}
	
	@Override
	protected void cleanup( Context context)
			throws IOException, InterruptedException {
		// emit the global top K as (name, cost) pairs, largest cost first
		Set<Entry<Integer, String>> set = tmp.entrySet();
		for (Entry<Integer, String> entry : set) {
			key.set(entry.getKey());
			value.set(entry.getValue());
			context.write(value, key);
		}
	}
}
Driver:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class JobMainTree {
	public static void main(String[] args) throws Exception {
		Configuration configuration = new Configuration();
		/**
		 * Put the command-line argument into the Configuration; map and reduce tasks
		 * can then read it back from the Configuration. This is one of the standard
		 * ways of passing parameters to Hadoop tasks.
		 */
		configuration.set("K", args[2]);
		Job job = new Job(configuration, "topK-job-tree");
		job.setJarByClass(JobMainTree.class);
		job.setMapperClass(TopKTreeMapper.class);
		job.setMapOutputKeyClass(IntWritable.class);
		job.setMapOutputValueClass(Text.class);
		job.setReducerClass(TopKTreeReducer.class);
		job.setOutputKeyClass(Text.class);
		job.setOutputValueClass(IntWritable.class);
		FileInputFormat.addInputPath(job, new Path(args[0]));
		Path outputDir = new Path(args[1]);
		FileSystem fs = FileSystem.get(configuration);
		if(fs.exists(outputDir)) {
			fs.delete(outputDir, true);
		}
		FileOutputFormat.setOutputPath(job, outputDir);
		System.exit(job.waitForCompletion(true)? 0: 1);
	}
}
Run results (shown as a screenshot in the original post):
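Tracing the TreeMap version by hand with the same data and K = 3 (assuming a single map task over the small input file), the reducer should emit each name and cost, largest cost first, roughly like:

mr8	12345
mr3	9877
mr6	6646

Note that 12345 shows up only once even though both mr5 and mr8 spent that amount; this is the behaviour discussed in the conclusion below.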



Conclusion:

If you look closely at the second run's output, you will notice that the duplicated maximum value appears only once, and the name attached to it is whichever entry went into the TreeMap last. This is a consequence of how TreeMap treats keys: putting a value under an existing key replaces the previous entry rather than adding a second one. We won't discuss it further here, since this post is only meant to illustrate what the overridable methods of the Mapper class are for. The next post covers custom input types and solves the problem left open here: outputting both the name and the cost fields while still keeping duplicate amounts.
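The "only one of the duplicates survives" behaviour is easy to reproduce outside MapReduce. A minimal sketch, using the same descending comparator as the job:

import java.util.Comparator;
import java.util.TreeMap;

public class TreeMapDuplicateKeyDemo {
	public static void main(String[] args) {
		TreeMap<Integer, String> tmp = new TreeMap<Integer, String>(new Comparator<Integer>() {
			@Override
			public int compare(Integer o1, Integer o2) {
				return o2 - o1;
			}
		});
		tmp.put(12345, "mr5");
		tmp.put(12345, "mr8"); // equal key: the earlier entry is replaced, not kept alongside
		System.out.println(tmp); // {12345=mr8} -- only one entry for the duplicated amount
	}
}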
