MapReduce-TopK

In the previous post we wrote a simple MapReduce program that only overrode the map() and reduce() methods, but Mapper and Reducer also expose cleanup(Context context), setup(Context context), and run(Context context), which can be overridden as well.
In this example we explain what cleanup(Context context) and setup(Context context) do and how they are used here, and then walk through a MapReduce implementation of TopK. Depending on the requirements, TopK is implemented in two ways: one that only cares about the top K values themselves, and one that outputs not just the top K values but also the information associated with them.

First, what setup(Context context) and cleanup(Context context) are for.
The following is taken from the Hadoop 1.2.1 source, in the org.apache.hadoop.mapreduce.Mapper class:

/**
   * Called once at the beginning of the task.
   */
  protected void setup(Context context ) throws IOException, InterruptedException {
    // NOTHING
  }
  /**
   * Called once at the end of the task.
   */
  protected void cleanup(Context context ) throws IOException, InterruptedException {
    // NOTHING
  }
As the comments say, setup() is called once before the task begins and cleanup() is called once after the task ends.
These two methods can therefore be used to initialize variables, distribute files, or release resources once the task has finished.
In this example we use them to handle the K value for TopK; later posts will show other uses.


The following is another important method of the Mapper class. From run() you can see that once the task starts, setup(context) is called first; then map() is called repeatedly as long as there is a next key/value pair; finally cleanup(context) is called. That is the order in which a map task executes.
We won't go into nextKeyValue() here; it is covered in detail in the later post on customizing RecordReader, see MapReduce-xml文件的处理-定制InputFormat及定制RecordReader.

 /**
   * Expert users can override this method for more complete control over the
   * execution of the Mapper.
   * @param context
   * @throws IOException
   */
  public void run(Context context) throws IOException, InterruptedException {
    setup(context);
    try {
      // the map loop: map() is driven from here, repeatedly pulling records from the RecordReader
      while (context.nextKeyValue()) {
        // context.getCurrentKey() delegates to the underlying RecordReader through MapContext:
        /**
         * @Override
         * public KEYIN getCurrentKey() throws IOException, InterruptedException {
         *   return reader.getCurrentKey();
         * }
         */
        map(context.getCurrentKey(), context.getCurrentValue(), context);
      }
    } finally {
      cleanup(context);
    }
  }
Now let's write the program for the first TopK scenario.
Test data:
uid,name,cost
1,mr1,3234
2,mr2,123
3,mr3,9877
4,mr4,348
5,mr5,12345
6,mr6,6646
7,mr7,98
8,mr8,12345
The task: find the top 3 highest spending amounts.
Map phase:
import java.io.IOException;
import java.util.Arrays;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

public class TopKMapper extends Mapper<LongWritable, Text, IntWritable, IntWritable> {
	
	int len;    // K, the number of top values to keep
	int[] tmp;  // current top K values; tmp[0] is the eviction slot

	/**
	 * Called once when the map task starts
	 */
	@Override
	protected void setup( Context context)
			throws IOException, InterruptedException {
		/**
		 * Read the K value for TopK from the Configuration; it was set when the job was submitted
		 */
		len = context.getConfiguration().getInt("K", 10);
		/**
		 * Create an array of length K+1 to hold the top K values; index 0 is the eviction slot
		 */
		tmp = new int[len + 1];
	}

	@Override
	protected void map(LongWritable key, Text value, Context context)
			throws IOException, InterruptedException {
		String line = value.toString();
		String[] arr = line.split(",");
		if(arr.length == 3) {
			addParm(Integer.valueOf(arr[2]));
		}
	}
	/**
	 * Put the new value into slot 0 and sort the array in ascending order;
	 * the smallest value ends up back in slot 0 and is overwritten on the next call,
	 * so tmp[1..K] always holds the K largest values seen so far
	 */
	private void addParm(int parm) {
		tmp[0] = parm;
		Arrays.sort(tmp);
	}
	
	@Override
	protected void cleanup(Context context)
			throws IOException, InterruptedException {
		/**
		 * Emit this mapper's local top K (tmp[1..K]); each map task sends its own
		 * top K values to the reduce side, which merges them into the global top K
		 */
		for( int i = 1 ; i <= len ; i ++) {
			context.write(new IntWritable(tmp[i]), new IntWritable(tmp[i]));
		}
	}
}
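To make the eviction trick in addParm() concrete, here is a tiny standalone trace of how the array evolves. It is not part of the job itself, just an illustration assuming K = 3 and a few of the sample costs:

import java.util.Arrays;

public class AddParmTrace {
	public static void main(String[] args) {
		int len = 3;                  // K
		int[] tmp = new int[len + 1]; // tmp[0] is the eviction slot
		for (int cost : new int[] {3234, 123, 9877, 12345, 98}) {
			tmp[0] = cost;            // overwrite the smallest value kept so far
			Arrays.sort(tmp);         // ascending sort pushes the new smallest back into tmp[0]
		}
		// tmp[1..3] now holds the three largest values seen: 3234, 9877, 12345
		System.out.println(Arrays.toString(tmp)); // [98, 3234, 9877, 12345]
	}
}

Note that tmp starts filled with zeros, so the array only reflects the true top K once at least K non-negative values have been inserted; that is fine here because the costs are positive.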
Reduce phase:

import java.io.IOException;
import java.util.Arrays;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.mapreduce.Reducer;

public class TopKReducer extends Reducer<IntWritable, IntWritable, IntWritable, IntWritable> {
	int len;    // K
	int[] tmp;  // current top K values; tmp[0] is the eviction slot
	
	@Override
	protected void setup( Context context)
			throws IOException, InterruptedException {
		len = context.getConfiguration().getInt("K", 10);
		tmp = new int[len + 1];
	}

	@Override
	protected void reduce(IntWritable key, Iterable<IntWritable> values, Context context)
			throws IOException, InterruptedException {
		// merge every candidate value coming from the map side into the reducer's top-K array
		for(IntWritable value: values) {
			addParm(value.get());
		}
	}

	private void addParm(int parm) {
		tmp[0] = parm;
		Arrays.sort(tmp);
	}
	
	@Override
	protected void cleanup( Context context)
			throws IOException, InterruptedException {
		// emit the final top K: the key is the rank (1 = largest), the value is the amount
		for( int i = len; i > 0 ; i --) {
			context.write(new IntWritable(len - i + 1), new IntWritable(tmp[i]));
		}
	}
}
Driver:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class JobMain {
	public static void main(String[] args) throws Exception {
		Configuration configuration = new Configuration();
		/**
		 * Put the command-line argument into the Configuration; map and reduce tasks
		 * can then read it back from the Configuration. This is one of the standard
		 * ways of passing parameters to Hadoop tasks.
		 */
		configuration.set("K", args[2]);
		Job job = new Job(configuration, "topK-job");
		job.setJarByClass(JobMain.class);
		job.setMapperClass(TopKMapper.class);
		job.setMapOutputKeyClass(IntWritable.class);
		job.setMapOutputValueClass(IntWritable.class);
		job.setReducerClass(TopKReducer.class);
		job.setOutputKeyClass(IntWritable.class);
		job.setOutputValueClass(IntWritable.class);
		FileInputFormat.addInputPath(job, new Path(args[0]));
		Path outputDir = new Path(args[1]);
		FileSystem fs = FileSystem.get(configuration);
		if(fs.exists(outputDir)) {
			fs.delete(outputDir, true);
		}
		FileOutputFormat.setOutputPath(job, outputDir);
		System.exit(job.waitForCompletion(true)? 0: 1);
	}
}
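The driver expects three arguments: the input path, the output directory, and K. A typical invocation might look like this (the jar name and HDFS paths below are only placeholders; adjust them to your environment):

hadoop jar topk.jar JobMain /input/cost.txt /output/topk 3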


Test results (shown as a screenshot in the original post):
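Tracing the code by hand with the sample data and K = 3 (and assuming the uid,name,cost header line is not actually present in the input file, since Integer.valueOf("cost") would throw a NumberFormatException), the reducer's cleanup() should write the rank followed by the amount, roughly:

1	12345
2	12345
3	9877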


For the second scenario the test data is the same as above.
This time the task is to find the top 3 spending amounts together with the name of each spender.

Map phase:

import java.io.IOException;
import java.util.Map.Entry;
import java.util.Comparator;
import java.util.Set;
import java.util.TreeMap;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

public class TopKTreeMapper extends
		Mapper<LongWritable, Text, IntWritable, Text> {

	private int len;                              // K
	private TreeMap<Integer, String> tmp;         // cost -> name, kept in descending cost order
	private IntWritable key = new IntWritable();  // reusable output key (cost)
	private Text value = new Text();              // reusable output value (name)
	/**
	 * Called once when the map task starts
	 */
	@Override
	protected void setup(Context context) throws IOException,
			InterruptedException {
		/**
		 * Read the K value for TopK from the Configuration
		 */
		len = context.getConfiguration().getInt("K", 10);
		// keys are sorted in descending order, so the first entry is the largest cost
		// and lastKey() is the smallest one kept so far
		tmp = new TreeMap<Integer, String>(new Comparator<Integer>() {
			@Override
			public int compare(Integer o1, Integer o2) {
				return o2 - o1;
			}
		});
	}

	@Override
	protected void map(LongWritable key, Text value, Context context)
			throws IOException, InterruptedException {
		String line = value.toString();
		String[] arr = line.split(",");
		if (arr.length == 3) {
			addParm(Integer.valueOf(arr[2]), arr[1]);
		}
	}

	/**
	 * Put the (cost, name) pair into the TreeMap; keys are sorted in descending order,
	 * so lastKey() is the smallest cost and is removed once the map grows beyond K entries.
	 * Note that equal costs collide on the same key, so only the latest name survives.
	 */
	private void addParm(int parm, String name) {
		tmp.put(parm, name);
		if (tmp.size() > len) {
			tmp.remove(tmp.lastKey());
		}
	}

	@Override
	protected void cleanup(Context context) throws IOException,
			InterruptedException {
		// emit this mapper's local top K as (cost, name) pairs, largest cost first
		Set<Entry<Integer, String>> set = tmp.entrySet();
		for (Entry<Integer, String> entry : set) {
			key.set(entry.getKey());
			value.set(entry.getValue());
			context.write(key, value);
		}
	}
}
Reduce phase:

import java.io.IOException;
import java.util.Map.Entry;
import java.util.Comparator;
import java.util.Set;
import java.util.TreeMap;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

public class TopKTreeReducer extends Reducer<IntWritable, Text, Text, IntWritable> {
	int len;                                      // K
	private TreeMap<Integer, String> tmp;         // cost -> name, kept in descending cost order
	private IntWritable key = new IntWritable();  // reusable Writable for the cost
	private Text value = new Text();              // reusable Writable for the name
	
	@Override
	protected void setup( Context context)
			throws IOException, InterruptedException {
		len = context.getConfiguration().getInt("K", 10);
		// same descending-order comparator as on the map side
		tmp = new TreeMap<Integer, String>(new Comparator<Integer>() {
			@Override
			public int compare(Integer o1, Integer o2) {
				return o2 - o1;
			}
		});
	}

	@Override
	protected void reduce(IntWritable key, Iterable<Text> values, Context context)
			throws IOException, InterruptedException {
		// merge every (cost, name) pair coming from the map side into the reducer's TreeMap
		for(Text value : values) {
			addParm(key.get(), value.toString());
		}
	}

	private void addParm(int parm, String name) {
		tmp.put(parm, name);
		if (tmp.size() > len) {
			tmp.remove(tmp.lastKey());
		}
	}
	
	@Override
	protected void cleanup( Context context)
			throws IOException, InterruptedException {
		// emit the global top K as (name, cost) pairs, largest cost first
		Set<Entry<Integer, String>> set = tmp.entrySet();
		for (Entry<Integer, String> entry : set) {
			key.set(entry.getKey());
			value.set(entry.getValue());
			context.write(value, key);
		}
	}
}
Driver:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class JobMainTree {
	public static void main(String[] args) throws Exception {
		Configuration configuration = new Configuration();
		/**
		 * Put the command-line argument into the Configuration; map and reduce tasks
		 * can then read it back from the Configuration. This is one of the standard
		 * ways of passing parameters to Hadoop tasks.
		 */
		configuration.set("K", args[2]);
		Job job = new Job(configuration, "topK-job-tree");
		job.setJarByClass(JobMainTree.class);
		job.setMapperClass(TopKTreeMapper.class);
		job.setMapOutputKeyClass(IntWritable.class);
		job.setMapOutputValueClass(Text.class);
		job.setReducerClass(TopKTreeReducer.class);
		job.setOutputKeyClass(Text.class);
		job.setOutputValueClass(IntWritable.class);
		FileInputFormat.addInputPath(job, new Path(args[0]));
		Path outputDir = new Path(args[1]);
		FileSystem fs = FileSystem.get(configuration);
		if(fs.exists(outputDir)) {
			fs.delete(outputDir, true);
		}
		FileOutputFormat.setOutputPath(job, outputDir);
		System.exit(job.waitForCompletion(true)? 0: 1);
	}
}
Run results (shown as a screenshot in the original post):
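Tracing the TreeMap version by hand with the same data and K = 3 (assuming a single map task over the small input file), the reducer should emit each name and cost, largest cost first, roughly like:

mr8	12345
mr3	9877
mr6	6646

Note that 12345 shows up only once even though both mr5 and mr8 spent that amount; this is the behaviour discussed in the conclusion below.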



Conclusion:

If you look closely at the second run's output, you will notice that the duplicated maximum value appears only once, and the name attached to it is whichever entry went into the TreeMap last. This is a consequence of how TreeMap treats keys: putting a value under an existing key replaces the previous entry rather than adding a second one. We won't discuss it further here, since this post is only meant to illustrate what the overridable methods of the Mapper class are for. The next post covers custom input types and solves the problem left open here: outputting both the name and the cost fields while still keeping duplicate amounts.
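The "only one of the duplicates survives" behaviour is easy to reproduce outside MapReduce. A minimal sketch, using the same descending comparator as the job:

import java.util.Comparator;
import java.util.TreeMap;

public class TreeMapDuplicateKeyDemo {
	public static void main(String[] args) {
		TreeMap<Integer, String> tmp = new TreeMap<Integer, String>(new Comparator<Integer>() {
			@Override
			public int compare(Integer o1, Integer o2) {
				return o2 - o1;
			}
		});
		tmp.put(12345, "mr5");
		tmp.put(12345, "mr8"); // equal key: the earlier entry is replaced, not kept alongside
		System.out.println(tmp); // {12345=mr8} -- only one entry for the duplicated amount
	}
}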
