Algorithm (1): Secondary Sort with MapReduce/Spark

Data:

2012,01,01,5
2012,01,02,45
2012,01,03,35
2012,01,04,10
2001,11,01,46
2001,11,02,47
2001,11,03,48
2001,11,04,40
2005,08,20,50
2005,08,21,52
2005,08,22,38
2005,08,23,70
Requirement:

Each record has two parts: a date (year, month, day) and a temperature. Required output: results sorted by year-month, and within the same key the temperatures must be sorted as well (ascending or descending, either is fine).


A baseline implementation that does not sort the values looks like this:

package com.isesol.mapreduce;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class secondary_sort {

	public static class TokenizerMapper extends Mapper<Object, Text, Text, Text> {

		// emit (year-month, temperature) as plain Text; the framework sorts keys only
		public void map(Object key, Text value, Context context) throws IOException, InterruptedException {

			String[] val = value.toString().split(",");
			String year = val[0];
			String month = val[1];
			context.write(new Text(year + "-" + month), new Text(val[3]));

		}
	}

	public static class IntSumReducer extends Reducer<Text, Text, Text, Text> {

		public void reduce(Text key, Iterable<Text> value, Context context) throws IOException, InterruptedException {

			StringBuilder str = new StringBuilder("");

			for (Text values : value) {
				
				String val = values.toString();
				str.append(val);
				str.append(",");
			}

			context.write(key, new Text(str.toString()));

		}
	}

	public static void main(String[] args) throws Exception {
		Configuration conf = new Configuration();
		Job job = Job.getInstance(conf, "secondary_sort");
		job.setJarByClass(secondary_sort.class);
		job.setMapperClass(TokenizerMapper.class);
		// job.setCombinerClass(IntSumReducer.class);
		job.setReducerClass(IntSumReducer.class);
		job.setOutputKeyClass(Text.class);
		job.setOutputValueClass(Text.class);
		job.setNumReduceTasks(1);
		FileInputFormat.addInputPath(job, new Path(args[0]));
		FileOutputFormat.setOutputPath(job, new Path(args[1]));
		System.exit(job.waitForCompletion(true) ? 0 : 1);
	}
}

Output:

2001-11	40,48,47,46,
2005-08	70,38,52,50,
2012-01	10,35,45,5,

Looking at the result, the keys are sorted but the values come out in arbitrary order, and we want the values sorted numerically too. Implementing that is exactly what secondary sort is: MapReduce only sorts keys, so to sort by value we build a composite key (key, value) and use the composite as the key; that way the value gets sorted along with the key. In our example this means sorting not only on year-month but also on temperature.


With the composite key (shown in parentheses below) plus a custom partitioner, a custom shuffle/sort comparator, and a custom grouping comparator, we get a flow like this:
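For the 2012-01 records in the sample data, a rough sketch of that flow (composite keys in parentheses) looks like this:

map output (composite key, value):  (2012-01,5) 5   (2012-01,45) 45   (2012-01,35) 35   (2012-01,10) 10

partition      -> hashes only the year-month part, so all four records go to the same reducer
shuffle/sort   -> orders the composite keys: (2012-01,5) (2012-01,10) (2012-01,35) (2012-01,45)
group          -> groups only on the year-month part, so the four records form a single reduce() call
reduce output  -> 2012-01    5,10,35,45,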




The program built around the composite key is as follows:

package com.isesol.mapreduce;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

import org.apache.hadoop.conf.Configurable;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Partitioner;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class secondary_sort {

	public static class TokenizerMapper extends Mapper<Object, Text, compositekey, Text> {

		private compositekey newkey = new compositekey();

		public void map(Object key, Text value, Context context) throws IOException, InterruptedException {
			String[] val = value.toString().split(",");
			String year = val[0];
			String month = val[1];
			// note: only the year goes into the composite key here (so the output below is grouped
			// by year); use year + "-" + month instead to match the original year-month requirement
			newkey.setYear(year);
			newkey.setWendu(val[3]);

			System.out.println(newkey.getYear() + "-" + newkey.getWendu());

			context.write(newkey, new Text(val[3]));
		}
	}

	public static class IntSumReducer extends Reducer<compositekey, Text, Text, Text> {

		public void reduce(compositekey key, Iterable<Text> value, Context context)
				throws IOException, InterruptedException {

			StringBuilder str = new StringBuilder("");

			for (Text values : value) {

				String val = values.toString();
				str.append(val);
				str.append(",");
			}

			context.write(new Text(key.getYear()), new Text(str.toString()));

		}
	}

	public static class compositekeyComparator extends WritableComparator {

		public compositekeyComparator() {
			super(compositekey.class, true);
		}

		// sort comparator used during shuffle/sort: year descending (note the -1 below),
		// and within the same year, temperature ascending
		public int compare(WritableComparable a, WritableComparable b) {
			compositekey a1 = (compositekey) a;
			compositekey b1 = (compositekey) b;

			int compare = Integer.parseInt(a1.getYear()) - Integer.parseInt(b1.getYear());
			if (compare != 0) {
				return -1 * compare;
			} else {
				return Integer.parseInt(a1.getWendu()) - Integer.parseInt(b1.getWendu());
			}

		}

	}

	public static class compositekey implements WritableComparable<compositekey> {

		private String year;
		private String wendu;

		public void setYear(String year) {

			this.year = year;
		}

		public String getYear() {

			return this.year;
		}

		public void setWendu(String wendu) {
			this.wendu = wendu;
		}

		public String getWendu() {
			return wendu;
		}

		public void write(DataOutput out) throws IOException {
			out.writeUTF(this.getYear());
			out.writeUTF(this.getWendu());
		}

		public void readFields(DataInput in) throws IOException {
			year = in.readUTF();
			wendu = in.readUTF();
		}

		public String toString() {
			return year + "," + wendu;
		}

		public int compareTo(compositekey o) {
			// not used in this job: setSortComparatorClass installs compositekeyComparator,
			// so this default implementation is only a placeholder
			return 0;
		}

	}

	public static class twopartitions extends Partitioner<compositekey, Text> implements Configurable {

		// partition on the original key (the year) only, so every composite key
		// belonging to the same year is sent to the same reducer
		public int getPartition(compositekey key, Text value, int numPartitions) {

			System.out.println("aa-" + key.getYear() + "-" + key.getWendu());
			return (key.getYear().hashCode() & Integer.MAX_VALUE) % numPartitions;
		}

		public void setConf(Configuration conf) {
		}

		public Configuration getConf() {
			return null;
		}

	}

	// grouping comparator: two composite keys belong to the same reduce() call
	// as long as their years match, regardless of temperature
	public static class DefinedGroupSort extends WritableComparator {

		protected DefinedGroupSort() {
			super(compositekey.class, true);
		}

		@Override
		public int compare(WritableComparable a, WritableComparable b) {

			compositekey a1 = (compositekey) a;
			compositekey b1 = (compositekey) b;

			return Integer.parseInt(a1.getYear()) - Integer.parseInt(b1.getYear());
		}

	}

	public static void main(String[] args) throws Exception {
		Configuration conf = new Configuration();
		Job job = Job.getInstance(conf, "secondary_sort");
		job.setJarByClass(secondary_sort.class);
		job.setMapperClass(TokenizerMapper.class);
		job.setMapOutputKeyClass(compositekey.class);
		job.setReducerClass(IntSumReducer.class);
		job.setOutputKeyClass(Text.class);
		job.setOutputValueClass(Text.class);
		job.setPartitionerClass(twopartitions.class);
		job.setSortComparatorClass(compositekeyComparator.class);
		job.setGroupingComparatorClass(DefinedGroupSort.class);
		job.setNumReduceTasks(1);
		FileInputFormat.addInputPath(job, new Path(args[0]));
		FileOutputFormat.setOutputPath(job, new Path(args[1]));
		System.exit(job.waitForCompletion(true) ? 0 : 1);
	}
}

Output:

2012	5,10,35,45,
2005	38,50,52,70,
2001	40,46,47,48,

The whole process is not complicated, but it can be a little hard for newcomers to grasp, so below I'll take these custom pieces apart one by one. First, the overall MapReduce flow. On the map side, a RecordReader reads the raw data, the input is cut into splits (InputSplit), and each split goes to a map task; at this point there is already a first round of sorting (why sort here? I'll give the reason when we get to shuffle/sort). Then the partitioning algorithm sends records with the same key to the same partition, and with that the map phase is essentially over. On the reduce side, each reducer copies its partition of data from the mappers; before the data reaches reduce() it goes through shuffle/sort, i.e. the keys are sorted, then the grouping algorithm merges the data belonging to the same key, and finally reduce() processes it.


A lot of people have mistaken ideas about what belongs to the mapper and what belongs to the reducer. Here is what the official documentation says (this is the Reducer part; the mapper's work effectively ends after partitioning):

Reducer

Reducer reduces a set of intermediate values which share a key to a smaller set of values.

The number of reduces for the job is set by the user via Job.setNumReduceTasks(int).

Overall, Reducer implementations are passed the Job for the job via the Job.setReducerClass(Class) method and can override it to initialize themselves. The framework then calls reduce(WritableComparable, Iterable<Writable>, Context) method for each <key, (list of values)> pair in the grouped inputs. Applications can then override the cleanup(Context) method to perform any required cleanup.

Reducer has 3 primary phases: shuffle, sort and reduce.

Shuffle

Input to the Reducer is the sorted output of the mappers. In this phase the framework fetches the relevant partition of the output of all the mappers, via HTTP.

Sort

The framework groups Reducer inputs by keys (since different mappers may have output the same key) in this stage.

The shuffle and sort phases occur simultaneously; while map-outputs are being fetched they are merged.

Secondary Sort

If equivalence rules for grouping the intermediate keys are required to be different from those for grouping keys before reduction, then one may specify a Comparator via Job.setSortComparatorClass(Class). Since Job.setGroupingComparatorClass(Class) can be used to control how intermediate keys are grouped, these can be used in conjunction to simulate secondary sort on values.

So the whole question of how the keys are sorted, how they are grouped, and how the results come out is controlled on the Reducer side; on the Mapper side there is also a lot we can control, such as input splits and partitioning.
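To connect those phases back to the program above, these are the three driver calls from its main(), annotated with the phase each one hooks into:

		job.setPartitionerClass(twopartitions.class);               // map side: which partition/reducer each composite key is sent to
		job.setSortComparatorClass(compositekeyComparator.class);   // shuffle/sort: how the composite keys are ordered
		job.setGroupingComparatorClass(DefinedGroupSort.class);     // reduce side: which sorted keys are merged into one reduce() call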


Back to the secondary sort problem. Since MapReduce can only sort keys, to sort values we have to combine the value and the key into a composite key (key, value); with the value embedded in the key, it gets sorted too. So the first problem to solve is building the composite key.

Once the composite key exists, we have to think about partitioning. Partitioning also works only on the key, so how do we make sure that, just as before, records with the same original key land in the same partition? Clearly we need a custom partitioner that still partitions on the original key and ignores the value.

In the shuffle/sort stage we need to sort first by the original key and then by the value; that is the secondary sort itself.

In the grouping stage we must likewise group only on the original key, not on the value.


To summarize, the problems we need to solve are:

1) Build the composite key

2) Custom partitioning

3) Custom sort

4) Custom grouping


Let's work through them one at a time.


1. Building the composite key

If all you need is the composite key itself, implementing Writable and its methods is enough. Maybe the benefit of a composite key isn't obvious yet, so here's an example: usually when we cobble a key together we concatenate strings, and then wherever the key is processed we split the string back apart. That works, but it's clumsy. With a composite key we just read and write the data through set/get methods, and that is the advantage.
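For contrast, here is a quick hypothetical sketch of that string-concatenation approach (it is not part of the original programs): the year and temperature are glued into one delimited Text key, and every consumer has to split the string and remember the field order.

package com.isesol.mapreduce;

import java.io.IOException;

import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

// Hypothetical string-key mapper: it works, but downstream code keeps re-parsing the string.
public class StringKeyMapper extends Mapper<Object, Text, Text, Text> {

	public void map(Object key, Text value, Context context) throws IOException, InterruptedException {
		String[] val = value.toString().split(",");
		// e.g. "2012-5": the delimiter and the field order become an implicit contract
		context.write(new Text(val[0] + "-" + val[3]), new Text(val[3]));

		// reading it back later means splitting and remembering the positions:
		// String[] parts = compositeString.split("-");  String year = parts[0];  String wendu = parts[1];
	}
}

The Writable below keeps the two fields typed instead, so they travel together through serialization and come back out through get methods rather than split().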


package com.isesol.mapreduce;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

import org.apache.hadoop.io.Writable;

public class secondary_key implements Writable{
	
	private int avg;
	private int count;


	public void setAvg(int avg){
		
		this.avg = avg;
	}
	
	public int getAvg(){
		
		return this.avg;
	}
	
	
	
	public void setCount(int count){
		this.count = count;
	}
	
	public int getCount(){
		return count;
	}
	
	public void write(DataOutput out) throws IOException {
		out.writeInt(this.getAvg());
		out.writeInt(this.getCount());

	}

	public void readFields(DataInput in) throws IOException {
		avg = in.readInt();
		count = in.readInt();

	}
	
	public String toString(){
		return avg + "-" + count;
	}

}



Now let's write a program that uses the composite type for its output; in the program below the reducer's output value class is secondary_key:

package com.isesol.mapreduce;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;



public class ExampleCompositeKey {
	
	public static class TokenizerMapper extends Mapper<Object, Text, Text, Text> {
		
		public void map(Object key, Text value, Context context) throws IOException, InterruptedException {
			String[] val = value.toString().split(",");
			String year = val[0];
			String wendu = val[3];
			context.write(new Text(year), new Text(wendu));
		}
	}
	
	
	public static class IntSumReducer extends Reducer<Text, Text, Text, secondary_key> {

		private secondary_key newkey = new secondary_key();
		// note: count and i are instance fields and are never reset between reduce() calls,
		// so they accumulate across keys; that is why the totals in the output below keep
		// growing (181, 391, 486). Reset them at the top of reduce() for per-year figures.
		private int count = 0;
		private int i = 0;
		
		public void reduce(Text key, Iterable<Text> value, Context context)
				throws IOException, InterruptedException {

			for (Text values : value) {
				
				count += Integer.parseInt(values.toString());
				i++;
			}

			int avg = count / i ;
			newkey.setAvg(avg);
			newkey.setCount(count);

			
			context.write(key, newkey);

		}
	}
	

	public static void main(String[] args) throws Exception {
		Configuration conf = new Configuration();
		Job job = Job.getInstance(conf, "ExampleCompositeKey");
		job.setJarByClass(ExampleCompositeKey.class);
		job.setMapperClass(TokenizerMapper.class);
		job.setMapOutputValueClass(Text.class);
		job.setMapOutputKeyClass(Text.class);
		job.setReducerClass(IntSumReducer.class);
		job.setOutputKeyClass(Text.class);
		job.setOutputValueClass(secondary_key.class);
		//job.setPartitionerClass(twopartitions.class);
		//job.setSortComparatorClass(compositekeyComparator.class);
		//job.setGroupingComparatorClass(DefinedGroupSort.class);
		job.setNumReduceTasks(1);
		FileInputFormat.addInputPath(job, new Path(args[0]));
		FileOutputFormat.setOutputPath(job, new Path(args[1]));
		System.exit(job.waitForCompletion(true) ? 0 : 1);
	}

}


Output:

2001	45-181
2005	48-391
2012	40-486

See? The result is written out through the composite type, and the output format is whatever its toString() produces. As an exercise, try using a composite type for both the key and the value; it's fairly simple.
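As a hypothetical sketch of that exercise (not from the original post), the same secondary_key Writable can also serve as the map output value, carrying one temperature and a count of 1 per record, with the reducer adding the fields up; the driver would additionally need job.setMapOutputValueClass(secondary_key.class).

package com.isesol.mapreduce;

import java.io.IOException;

import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;

// Hypothetical exercise: secondary_key used as the map output value as well as the reduce output value.
public class CompositeValueExercise {

	public static class ValueMapper extends Mapper<Object, Text, Text, secondary_key> {

		private secondary_key outValue = new secondary_key();

		public void map(Object key, Text value, Context context) throws IOException, InterruptedException {
			String[] val = value.toString().split(",");
			outValue.setAvg(Integer.parseInt(val[3])); // reuse the avg field to carry one temperature
			outValue.setCount(1);
			context.write(new Text(val[0]), outValue);
		}
	}

	public static class AvgReducer extends Reducer<Text, secondary_key, Text, secondary_key> {

		private secondary_key result = new secondary_key();

		public void reduce(Text key, Iterable<secondary_key> values, Context context)
				throws IOException, InterruptedException {
			int sum = 0;
			int count = 0;
			for (secondary_key v : values) {
				sum += v.getAvg();
				count += v.getCount();
			}
			result.setAvg(sum / count);
			result.setCount(sum); // same avg-total layout as the example output above
			context.write(key, result);
		}
	}
}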


2. Custom partitioning


The default hash partitioner is: return (key.hashCode() & Integer.MAX_VALUE) % numPartitions;

After building the composite key, I still want partitioning to be based on the original key rather than the whole composite key, so the custom partitioner takes a compositekey as its input key and hashes only the year:

return (key.getYear().hashCode() & Integer.MAX_VALUE) % numPartitions;
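
As a minimal sketch of that idea (the full program above uses the twopartitions class; the Configurable methods it carries are not actually required), a partitioner that hashes only the year could look like this:

package com.isesol.mapreduce;

import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Partitioner;

// Only the original key (the year) feeds the hash, so every composite key
// for the same year lands in the same partition, no matter the temperature.
public class YearPartitioner extends Partitioner<secondary_sort.compositekey, Text> {

	public int getPartition(secondary_sort.compositekey key, Text value, int numPartitions) {
		return (key.getYear().hashCode() & Integer.MAX_VALUE) % numPartitions;
	}
}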

3. Custom shuffle/sort


The concrete code is in the listing above, so here are just the key points. To control the sort you extend WritableComparator and implement the compare method. It receives two keys to compare: a negative return value places the first key before the second, and a positive value places it after. In compositekeyComparator the year comparison is multiplied by -1, which flips it to descending order by year, while temperatures within the same year are compared directly and therefore come out ascending.

I was long puzzled about why sorting happens in the map phase at all. My naive picture was: map hands its output straight to the partitions, records with the same key go to the same partition, and the reducer copies the data and then does shuffle/sort, group and so on. Writing the custom sort made me wonder: if the data weren't sorted earlier, how could comparing two keys at a time yield an order? If it has already been sorted, it is easy to picture: the data is already in order 1,2,3,4, then 1-2=-1 gives 2,1, then 2-3=-1 gives 3,2,1, then 3-4=-1 gives 4,3,2,1. That is my personal reading and may not be right; without an earlier sort I genuinely struggled to see how comparing two keys could produce an ordering. (For what it's worth, a comparison-based sort needs no prior order: the framework sorts each map task's output per partition as it spills to disk, the reducer then merge-sorts the sorted segments it fetches, and the comparator is simply called on pairs of keys by those sorts; the early map-side sort mainly makes the reduce-side merge cheap.)
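
To see the comparator's ordering without running a job, here is a small local check (a sketch that assumes the compositekey and compositekeyComparator classes from the listing above are on the classpath): it sorts a few keys in memory and prints them in the order the comparator produces, i.e. year descending, temperature ascending within a year.

package com.isesol.mapreduce;

import java.util.Arrays;
import java.util.Comparator;

// Local check of the ordering produced by compositekeyComparator (not a MapReduce job).
public class ComparatorCheck {

	public static void main(String[] args) {
		String[][] raw = { { "2012", "45" }, { "2012", "5" }, { "2001", "46" } };
		secondary_sort.compositekey[] keys = new secondary_sort.compositekey[raw.length];
		for (int i = 0; i < raw.length; i++) {
			keys[i] = new secondary_sort.compositekey();
			keys[i].setYear(raw[i][0]);
			keys[i].setWendu(raw[i][1]);
		}

		final secondary_sort.compositekeyComparator cmp = new secondary_sort.compositekeyComparator();
		Arrays.sort(keys, new Comparator<secondary_sort.compositekey>() {
			public int compare(secondary_sort.compositekey a, secondary_sort.compositekey b) {
				return cmp.compare(a, b);
			}
		});

		// expected order: 2012,5   2012,45   2001,46
		for (secondary_sort.compositekey k : keys) {
			System.out.println(k);
		}
	}
}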


4. Grouping

Not much to say here: the grouping comparator compares two keys and, if they are equal, their records are treated as one group. In this example DefinedGroupSort compares only the year, so all temperatures for a year arrive in a single reduce() call; without it, every (year, temperature) composite key would be its own group and reduce() would run once per record.


