A Detailed Look at the Combiner in MapReduce
1. What is a Combiner
A Combiner is a MapReduce component distinct from the Mapper and the Reducer. Its job is to locally aggregate each map task's output on the map side, which lightens the computational load on the reduce tasks and reduces network transfer.
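As a rough illustration (with made-up input), suppose one map task emits:
<hello,1>, <hello,1>, <hello,1>, <world,1>
After local combining, the map-side output becomes:
<hello,3>, <world,1>
so only two records instead of four are shuffled across the network to the reduce task.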
2. How to write a Combiner
A Combiner is written the same way as a Reducer: create a class that extends Reducer, put the combine logic in its reduce method, and then register the component on the job: job.setCombinerClass(WCCombiner.class)
Here is the WordCount code that uses a Combiner component:
package com.ghgj.mazh.mapreduce.wc.demo2;
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
/**
 * Word count (wordCount), using a Combiner.
 *
 * Three things to keep in mind when using a Combiner:
 *
 * 1. A Combiner differs from a Reducer in where it runs: the Combiner runs on the node of each MapTask,
 *    while the Reducer receives the output of all Mappers globally.
 * 2. The Combiner's input key-value types are the Mapper's output key-value types, and the Combiner's
 *    output key-value types must match the Reducer's input key-value types.
 * 3. Use the Combiner with great care: it is an optional component in the MapReduce pipeline, and the
 *    framework may invoke it zero, one, or many times.
 *    Hence the rule: whether the Combiner runs or not must never affect the business logic or the final result.
 */
public class WordCount_Combiner {
public static void main(String[] args) throws Exception {
// HDFS-related configuration
Configuration conf = new Configuration();
conf.set("fs.defaultFS", "hdfs://hadoop06:9000");
System.setProperty("HADOOP_USER_NAME", "hadoop");
Job job = Job.getInstance(conf);
// Set the class used to locate the job jar
job.setJarByClass(WordCount_Combiner.class);
// Specify the mapper, reducer, and combiner classes
job.setMapperClass(WCMapper.class);
job.setReducerClass(WCReducer.class);
job.setCombinerClass(WCCombiner.class);
// Specify the map task's output types; if they are the same as the reduce task's output key-value types, these two calls can be omitted
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(LongWritable.class);
// Specify the reduce task's output types
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(LongWritable.class);
// Local paths (for local testing)
// Path inputPath = new Path("d:/bigdata/wordcount/input");
// Path outputPath = new Path("d:/bigdata/wordcount/output");
Path inputPath = new Path(args[0]);
Path outputPath = new Path(args[1]);
FileSystem fs = FileSystem.get(conf);
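// Delete the output path if it already exists, so the job can be rerun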
if(fs.exists(outputPath)){
fs.delete(outputPath, true);
}
FileInputFormat.setInputPaths(job, inputPath);
FileOutputFormat.setOutputPath(job, outputPath);
// Finally, submit the job
boolean success = job.waitForCompletion(true);
System.exit(success ? 0 : 1);
}
/**
* Author: 马中华: http://blog.csdn.net/zhongqi2513
* Date: 2017-10-23 18:00:35
*
* Description: the Mapper component
*/
private static class WCMapper extends Mapper<LongWritable, Text, Text, LongWritable>{
@Override
protected void map(LongWritable key, Text value, Context context)
throws IOException, InterruptedException {
// Map task business logic goes here
String[] words = value.toString().split(" ");
for(String word: words){
context.write(new Text(word), new LongWritable(1));
}
}
}
/**
* Author: 马中华: http://blog.csdn.net/zhongqi2513
* Date: 2017-10-23 18:00:50
*
* Description: the Reducer component
*/
private static class WCReducer extends Reducer<Text, LongWritable, Text, LongWritable>{
@Override
protected void reduce(Text key, Iterable<LongWritable> values, Context context)
throws IOException, InterruptedException {
// Reduce task business logic goes here
long sum = 0;
for(LongWritable v: values){
sum += v.get();
}
context.write(key, new LongWritable(sum));
}
}
/**
* Author: 马中华: http://blog.csdn.net/zhongqi2513
* Date: 2017-10-23 18:01:02
*
* Description: the Combiner component
*/
private static class WCCombiner extends Reducer<Text, LongWritable, Text, LongWritable>{
@Override
protected void reduce(Text key, Iterable<LongWritable> values, Context context)
throws IOException, InterruptedException {
// Combiner business logic goes here
long sum = 0;
for(LongWritable v: values){
sum += v.get();
}
context.write(key, new LongWritable(sum));
}
}
}
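Note that WCCombiner above is identical, line for line, to WCReducer. For sum-style aggregations like word count, a common shortcut is therefore to register the reducer class itself as the combiner, and only write a separate combiner class when the combine logic must differ from the reduce logic:
job.setCombinerClass(WCReducer.class);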
For comparison, here is the WordCount code without a Combiner component:
package com.ghgj.mazh.mapreduce.wc.demo2;
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
/**
 * Word count (wordCount), without a Combiner, for comparison.
 * See the notes on Combiner usage in the version above.
 */
public class WordCount {
public static void main(String[] args) throws Exception {
// HDFS-related configuration
Configuration conf = new Configuration();
conf.set("fs.defaultFS", "hdfs://hadoop06:9000");
System.setProperty("HADOOP_USER_NAME", "hadoop");
Job job = Job.getInstance(conf);
// Set the class used to locate the job jar
job.setJarByClass(WordCount.class);
// Specify the mapper and reducer classes
job.setMapperClass(WCMapper.class);
job.setReducerClass(WCReducer.class);
// Specify the map task's output types; if they are the same as the reduce task's output key-value types, these two calls can be omitted
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(LongWritable.class);
// Specify the reduce task's output types
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(LongWritable.class);
// Local paths (for local testing)
// Path inputPath = new Path("d:/bigdata/wordcount/input");
// Path outputPath = new Path("d:/bigdata/wordcount/output");
Path inputPath = new Path(args[0]);
Path outputPath = new Path(args[1]);
FileSystem fs = FileSystem.get(conf);
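// Delete the output path if it already exists, so the job can be rerun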
if(fs.exists(outputPath)){
fs.delete(outputPath, true);
}
FileInputFormat.setInputPaths(job, inputPath);
FileOutputFormat.setOutputPath(job, outputPath);
// Finally, submit the job
boolean success = job.waitForCompletion(true);
System.exit(success ? 0 : 1);
}
/**
* Author: 马中华: http://blog.csdn.net/zhongqi2513
* Date: 2017-10-23 18:00:35
*
* Description: the Mapper component
*/
private static class WCMapper extends Mapper<LongWritable, Text, Text, LongWritable>{
@Override
protected void map(LongWritable key, Text value, Context context)
throws IOException, InterruptedException {
// Map task business logic goes here
String[] words = value.toString().split(" ");
for(String word: words){
context.write(new Text(word), new LongWritable(1));
}
}
}
/**
* Author: 马中华: http://blog.csdn.net/zhongqi2513
* Date: 2017-10-23 18:00:50
*
* Description: the Reducer component
*/
private static class WCReducer extends Reducer<Text, LongWritable, Text, LongWritable>{
@Override
protected void reduce(Text key, Iterable<LongWritable> values, Context context)
throws IOException, InterruptedException {
// Reduce task business logic goes here
long sum = 0;
for(LongWritable v: values){
sum += v.get();
}
context.write(key, new LongWritable(sum));
}
}
}
Now let's compare the runs with and without the Combiner.
Here are the run log and counters with the Combiner:
[hadoop@hadoop08 ~]$ hadoop jar wc.jar com.ghgj.mazh.mapreduce.wc.demo2.WordCount_Combiner /wc/input/ /wc/output/
17/10/23 18:08:50 INFO client.RMProxy: Connecting to ResourceManager at hadoop08/192.168.123.108:8032
17/10/23 18:08:50 WARN mapreduce.JobResourceUploader: Hadoop command-line option parsing not performed. Implement the Tool interface and execute your application with ToolRunner to remedy this.
17/10/23 18:08:51 INFO input.FileInputFormat: Total input paths to process : 1
17/10/23 18:08:51 INFO mapreduce.JobSubmitter: number of splits:1
17/10/23 18:08:51 INFO mapreduce.JobSubmitter: Submitting tokens for job: job_1508718609774_0001
17/10/23 18:08:51 INFO impl.YarnClientImpl: Submitted application application_1508718609774_0001
17/10/23 18:08:51 INFO mapreduce.Job: The url to track the job: http://hadoop08:8088/proxy/application_1508718609774_0001/
17/10/23 18:08:51 INFO mapreduce.Job: Running job: job_1508718609774_0001
17/10/23 18:09:00 INFO mapreduce.Job: Job job_1508718609774_0001 running in uber mode : false
17/10/23 18:09:00 INFO mapreduce.Job: map 0% reduce 0%
17/10/23 18:09:05 INFO mapreduce.Job: map 100% reduce 0%
17/10/23 18:09:13 INFO mapreduce.Job: map 100% reduce 100%
17/10/23 18:09:13 INFO mapreduce.Job: Job job_1508718609774_0001 completed successfully
17/10/23 18:09:13 INFO mapreduce.Job: Counters: 49
File System Counters
FILE: Number of bytes read=168
FILE: Number of bytes written=242337
FILE: Number of read operations=0
FILE: Number of large read operations=0
FILE: Number of write operations=0
HDFS: Number of bytes read=222
HDFS: Number of bytes written=82
HDFS: Number of read operations=6
HDFS: Number of large read operations=0
HDFS: Number of write operations=2
Job Counters
Launched map tasks=1
Launched reduce tasks=1
Data-local map tasks=1
Total time spent by all maps in occupied slots (ms)=3119
Total time spent by all reduces in occupied slots (ms)=4149
Total time spent by all map tasks (ms)=3119
Total time spent by all reduce tasks (ms)=4149
Total vcore-milliseconds taken by all map tasks=3119
Total vcore-milliseconds taken by all reduce tasks=4149
Total megabyte-milliseconds taken by all map tasks=3193856
Total megabyte-milliseconds taken by all reduce tasks=4248576
Map-Reduce Framework
Map input records=8
Map output records=22
Map output bytes=297
Map output materialized bytes=168
Input split bytes=101
Combine input records=22
Combine output records=10
Reduce input groups=10
Reduce shuffle bytes=168
Reduce input records=10
Reduce output records=10
Spilled Records=20
Shuffled Maps =1
Failed Shuffles=0
Merged Map outputs=1
GC time elapsed (ms)=110
CPU time spent (ms)=780
Physical memory (bytes) snapshot=292769792
Virtual memory (bytes) snapshot=4127346688
Total committed heap usage (bytes)=137498624
Shuffle Errors
BAD_ID=0
CONNECTION=0
IO_ERROR=0
WRONG_LENGTH=0
WRONG_MAP=0
WRONG_REDUCE=0
File Input Format Counters
Bytes Read=121
File Output Format Counters
Bytes Written=82
And here are the run log and counters without the Combiner:
[hadoop@hadoop08 ~]$ hadoop jar wc.jar com.ghgj.mazh.mapreduce.wc.demo2.WordCount /wc/input/ /wc/output2/
17/10/23 18:16:14 INFO client.RMProxy: Connecting to ResourceManager at hadoop08/192.168.123.108:8032
17/10/23 18:16:15 WARN mapreduce.JobResourceUploader: Hadoop command-line option parsing not performed. Implement the Tool interface and execute your application with ToolRunner to remedy this.
17/10/23 18:16:15 INFO input.FileInputFormat: Total input paths to process : 1
17/10/23 18:16:15 INFO mapreduce.JobSubmitter: number of splits:1
17/10/23 18:16:15 INFO mapreduce.JobSubmitter: Submitting tokens for job: job_1508718609774_0002
17/10/23 18:16:15 INFO impl.YarnClientImpl: Submitted application application_1508718609774_0002
17/10/23 18:16:15 INFO mapreduce.Job: The url to track the job: http://hadoop08:8088/proxy/application_1508718609774_0002/
17/10/23 18:16:15 INFO mapreduce.Job: Running job: job_1508718609774_0002
17/10/23 18:16:22 INFO mapreduce.Job: Job job_1508718609774_0002 running in uber mode : false
17/10/23 18:16:22 INFO mapreduce.Job: map 0% reduce 0%
17/10/23 18:16:27 INFO mapreduce.Job: map 100% reduce 0%
17/10/23 18:16:34 INFO mapreduce.Job: map 100% reduce 100%
17/10/23 18:16:34 INFO mapreduce.Job: Job job_1508718609774_0002 completed successfully
17/10/23 18:16:34 INFO mapreduce.Job: Counters: 49
File System Counters
FILE: Number of bytes read=347
FILE: Number of bytes written=242325
FILE: Number of read operations=0
FILE: Number of large read operations=0
FILE: Number of write operations=0
HDFS: Number of bytes read=222
HDFS: Number of bytes written=82
HDFS: Number of read operations=6
HDFS: Number of large read operations=0
HDFS: Number of write operations=2
Job Counters
Launched map tasks=1
Launched reduce tasks=1
Data-local map tasks=1
Total time spent by all maps in occupied slots (ms)=2458
Total time spent by all reduces in occupied slots (ms)=4251
Total time spent by all map tasks (ms)=2458
Total time spent by all reduce tasks (ms)=4251
Total vcore-milliseconds taken by all map tasks=2458
Total vcore-milliseconds taken by all reduce tasks=4251
Total megabyte-milliseconds taken by all map tasks=2516992
Total megabyte-milliseconds taken by all reduce tasks=4353024
Map-Reduce Framework
Map input records=8
Map output records=22
Map output bytes=297
Map output materialized bytes=347
Input split bytes=101
Combine input records=0
Combine output records=0
Reduce input groups=10
Reduce shuffle bytes=347
Reduce input records=22
Reduce output records=10
Spilled Records=44
Shuffled Maps =1
Failed Shuffles=0
Merged Map outputs=1
GC time elapsed (ms)=105
CPU time spent (ms)=750
Physical memory (bytes) snapshot=291364864
Virtual memory (bytes) snapshot=4127207424
Total committed heap usage (bytes)=138211328
Shuffle Errors
BAD_ID=0
CONNECTION=0
IO_ERROR=0
WRONG_LENGTH=0
WRONG_MAP=0
WRONG_REDUCE=0
File Input Format Counters
Bytes Read=121
File Output Format Counters
Bytes Written=82
With the Combiner:
Map input records=8
Map output records=22
Map output bytes=297
Map output materialized bytes=168
Input split bytes=101
Combine input records=22
Combine output records=10
Reduce input groups=10
Reduce shuffle bytes=168
Reduce input records=10
Reduce output records=10
Spilled Records=20
Without the Combiner:
Map input records=8
Map output records=22
Map output bytes=297
Map output materialized bytes=347
Input split bytes=101
Combine input records=0
Combine output records=0
Reduce input groups=10
Reduce shuffle bytes=347
Reduce input records=22
Reduce output records=10
Spilled Records=44
These are the visible differences between running with and without the Combiner.
Comparing the two sets of counters: with the Combiner, the 22 map output records are merged into 10 on the map side (Combine input records=22, Combine output records=10), so the reducer receives 10 records instead of 22 (Reduce input records), spilled records drop from 44 to 20, and the shuffled data shrinks from 347 bytes to 168 (Reduce shuffle bytes). This is the Combiner's biggest payoff: less data transferred over the network from the map side to the reduce side.
3. Notes on using the Combiner
1. A Combiner differs from a Reducer in where it runs: the Combiner runs on the node of each MapTask, while the Reducer receives the output of all Mappers globally.
2. The Combiner's input key-value types are the Mapper's output key-value types, and the Combiner's output key-value types must match the Reducer's input key-value types.
3. Use the Combiner with great care. In practice the Combiner's logic is usually identical to the Reducer's, so in those cases the Reducer class can be registered directly as the Combiner. But because the Combiner is an optional component that the framework may invoke zero, one, or many times, the guiding rule is: whether or not the Combiner runs must never affect the business logic or the final result. The sketch below shows what goes wrong when this rule is violated.
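As a concrete illustration of note 3, here is a minimal, hypothetical sketch (AvgReducer and AvgCombiner are invented names, not part of the WordCount code above). A reducer that computes a per-key average must not be reused as a combiner, because an average of map-local averages is not the overall average; a safe combiner instead emits mergeable partial results, such as a sum together with a count.
import java.io.IOException;
import org.apache.hadoop.io.DoubleWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
// WRONG as a combiner: if this averaging reducer is registered via
// job.setCombinerClass(AvgReducer.class), map-local averages get averaged
// again on the reduce side. For values 1, 2 on one map task and 6 on another:
// avg(avg(1,2), avg(6)) = avg(1.5, 6) = 3.75, but the true answer is
// avg(1,2,6) = 3. Whether the combiner runs would change the result,
// violating note 3. (Sums, counts, min, and max do not have this problem.)
class AvgReducer extends Reducer<Text, LongWritable, Text, DoubleWritable> {
    @Override
    protected void reduce(Text key, Iterable<LongWritable> values, Context context)
            throws IOException, InterruptedException {
        long sum = 0;
        long count = 0;
        for (LongWritable v : values) {
            sum += v.get();
            count++;
        }
        context.write(key, new DoubleWritable((double) sum / count));
    }
}
// SAFE design: keep the intermediate values mergeable. The mapper emits
// "sum,count" pairs (encoded as Text here for brevity, e.g. "5,1"), this
// combiner merges pairs into pairs, and only the reducer performs the final
// division. Because its input and output types are identical, the framework
// may apply this combiner zero, one, or many times without changing the result.
class AvgCombiner extends Reducer<Text, Text, Text, Text> {
    @Override
    protected void reduce(Text key, Iterable<Text> values, Context context)
            throws IOException, InterruptedException {
        long sum = 0;
        long count = 0;
        for (Text v : values) {
            String[] parts = v.toString().split(",");
            sum += Long.parseLong(parts[0]);
            count += Long.parseLong(parts[1]);
        }
        context.write(key, new Text(sum + "," + count));
    }
}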
Comments and criticism welcome!