MapReduce--4--Using the Combiner for Local Aggregation of MapTask Output

A Detailed Look at the Combiner in MapReduce

 

1. What Is a Combiner

The Combiner is a MapReduce component distinct from the Mapper and the Reducer. Its job is to perform a local aggregation of each MapTask's output on the map side, before the shuffle, which lightens the ReduceTask's computational load and reduces the amount of data transferred over the network.
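
For intuition, here is a sketch of what this local aggregation looks like in WordCount (the records below are made up for illustration):

Mapper output on one node:   (hello,1) (hello,1) (world,1) (hello,1)
After the Combiner runs:     (hello,3) (world,1)

Only two records instead of four are shuffled to the ReduceTask.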


2. How to Write a Combiner

A Combiner is written the same way as a Reducer: create a class that extends Reducer, put the combining logic in its reduce() method, and then register it on the job with job.setCombinerClass(WCCombiner.class).


Below is the WordCount code that uses a Combiner:

package com.ghgj.mazh.mapreduce.wc.demo2;
 
import java.io.IOException;
 
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
 
/**
 * WordCount: count word occurrences, using a Combiner.
 * 
 * Three things to keep in mind when using a Combiner:
 * 
 * 1. The difference between a Combiner and a Reducer is where they run: the Combiner runs on the node
 *    of each MapTask, while the Reducer receives the output of all Mappers globally.
 * 
 * 2. The Combiner's input key-value types must match the Mapper's output key-value types, and its
 *    output key-value types must match the Reducer's input key-value types.
 * 
 * 3. Use a Combiner with great care: it is an optional component of the MapReduce pipeline, and the
 *    framework may invoke it zero, one, or several times. The rule is therefore: whether the Combiner
 *    runs or not, it must never change the business logic or the final result.
 */
public class WordCount_Combiner {
 
	public static void main(String[] args) throws Exception {
		// Set the HDFS-related parameters
		Configuration conf = new Configuration();
		conf.set("fs.defaultFS", "hdfs://hadoop06:9000");
		System.setProperty("HADOOP_USER_NAME", "hadoop");
		
		Job job = Job.getInstance(conf);
		// Set the jar by the driver class
		job.setJarByClass(WordCount_Combiner.class);
		
		// Set the mapper, reducer, and combiner classes
		job.setMapperClass(WCMapper.class);
		job.setReducerClass(WCReducer.class);
		job.setCombinerClass(WCCombiner.class);
		
		// Set the MapTask output types; these can be omitted if they are identical to the ReduceTask output types
		job.setMapOutputKeyClass(Text.class);
		job.setMapOutputValueClass(LongWritable.class);
		
		// Set the ReduceTask output types
		job.setOutputKeyClass(Text.class);
		job.setOutputValueClass(LongWritable.class);
		
		// Local paths (for testing)
//		Path inputPath = new Path("d:/bigdata/wordcount/input");
//		Path outputPath = new Path("d:/bigdata/wordcount/output");
		Path inputPath = new Path(args[0]);
		Path outputPath = new Path(args[1]);
		
		FileSystem fs = FileSystem.get(conf);
		if(fs.exists(outputPath)){
			fs.delete(outputPath, true);
		}
		FileInputFormat.setInputPaths(job, inputPath);
		FileOutputFormat.setOutputPath(job, outputPath);
		
		// Finally, submit the job and wait for completion
		boolean waitForCompletion = job.waitForCompletion(true);
		System.exit(waitForCompletion?0:1);
	}
	
	/**
	 * Author: 马中华: http://blog.csdn.net/zhongqi2513
	 * Date: 2017-10-23 18:00:35
	 * 
	 * Description: the Mapper component
	 */
	private static class WCMapper extends Mapper<LongWritable, Text, Text, LongWritable>{
		@Override
		protected void map(LongWritable key, Text value, Context context)
				throws IOException, InterruptedException {
			// The MapTask business logic: split each line into words and emit (word, 1)
			String[] words = value.toString().split(" ");
			for(String word: words){
				context.write(new Text(word), new LongWritable(1));
			}
		}
	}
	
	/**
	 * Author: 马中华: http://blog.csdn.net/zhongqi2513
	 * Date: 2017-10-23 18:00:50
	 * 
	 * Description: the Reducer component
	 */
	private static class WCReducer extends Reducer<Text, LongWritable, Text, LongWritable>{
		@Override
		protected void reduce(Text key, Iterable<LongWritable> values, Context context)
				throws IOException, InterruptedException {
			// The ReduceTask business logic: sum the counts for each word
			long sum = 0;
			for(LongWritable v: values){
				sum += v.get();
			}
			context.write(key, new LongWritable(sum));
		}
	}
	
	/**
	 * Author: 马中华: http://blog.csdn.net/zhongqi2513
	 * Date: 2017-10-23 18:01:02
	 * 
	 * Description: the Combiner component
	 */
	private static class WCCombiner extends Reducer<Text, LongWritable, Text, LongWritable>{
		@Override
		protected void reduce(Text key, Iterable<LongWritable> values, Context context)
				throws IOException, InterruptedException {
			// The Combiner business logic: identical to the Reducer's, a local per-word sum
			long sum = 0;
			for(LongWritable v: values){
				sum += v.get();
			}
			context.write(key, new LongWritable(sum));
		}
	}
}
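
Note that WCCombiner's reduce() logic is identical to WCReducer's. When the two coincide like this (see section 3 below), the separate Combiner class is unnecessary; the Reducer class itself can be registered as the combiner:

job.setCombinerClass(WCReducer.class);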

Below is the WordCount code without the Combiner:

package com.ghgj.mazh.mapreduce.wc.demo2;
 
import java.io.IOException;
 
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
 
/**
 * WordCount: count word occurrences, WITHOUT a Combiner (for comparison with the version above).
 */
public class WordCount {
 
	public static void main(String[] args) throws Exception {
		// Set the HDFS-related parameters
		Configuration conf = new Configuration();
		conf.set("fs.defaultFS", "hdfs://hadoop06:9000");
		System.setProperty("HADOOP_USER_NAME", "hadoop");
		
		Job job = Job.getInstance(conf);
		// Set the jar by the driver class
		job.setJarByClass(WordCount.class);
		
		// Set the mapper and reducer classes
		job.setMapperClass(WCMapper.class);
		job.setReducerClass(WCReducer.class);
		
		// Set the MapTask output types; these can be omitted if they are identical to the ReduceTask output types
		job.setMapOutputKeyClass(Text.class);
		job.setMapOutputValueClass(LongWritable.class);
		
		// Set the ReduceTask output types
		job.setOutputKeyClass(Text.class);
		job.setOutputValueClass(LongWritable.class);
		
		// Local paths (for testing)
//		Path inputPath = new Path("d:/bigdata/wordcount/input");
//		Path outputPath = new Path("d:/bigdata/wordcount/output");
		Path inputPath = new Path(args[0]);
		Path outputPath = new Path(args[1]);
		
		FileSystem fs = FileSystem.get(conf);
		if(fs.exists(outputPath)){
			fs.delete(outputPath, true);
		}
		FileInputFormat.setInputPaths(job, inputPath);
		FileOutputFormat.setOutputPath(job, outputPath);
		
		// Finally, submit the job and wait for completion
		boolean waitForCompletion = job.waitForCompletion(true);
		System.exit(waitForCompletion?0:1);
	}
	
	/**
	 * Author: 马中华: http://blog.csdn.net/zhongqi2513
	 * Date: 2017-10-23 18:00:35
	 * 
	 * Description: the Mapper component
	 */
	private static class WCMapper extends Mapper<LongWritable, Text, Text, LongWritable>{
		@Override
		protected void map(LongWritable key, Text value, Context context)
				throws IOException, InterruptedException {
			// The MapTask business logic: split each line into words and emit (word, 1)
			String[] words = value.toString().split(" ");
			for(String word: words){
				context.write(new Text(word), new LongWritable(1));
			}
		}
	}
	
	/**
	 * Author: 马中华: http://blog.csdn.net/zhongqi2513
	 * Date: 2017-10-23 18:00:50
	 * 
	 * Description: the Reducer component
	 */
	private static class WCReducer extends Reducer<Text, LongWritable, Text, LongWritable>{
		@Override
		protected void reduce(Text key, Iterable<LongWritable> values, Context context)
				throws IOException, InterruptedException {
			// The ReduceTask business logic: sum the counts for each word
			long sum = 0;
			for(LongWritable v: values){
				sum += v.get();
			}
			context.write(key, new LongWritable(sum));
		}
	}
}

Now compare the runs with and without the Combiner. (The only difference between the two programs above is the WCCombiner class and the job.setCombinerClass(...) call.)

Here are the run log and counter results with the Combiner:

[hadoop@hadoop08 ~]$ hadoop jar wc.jar com.ghgj.mazh.mapreduce.wc.demo2.WordCount_Combiner /wc/input/ /wc/output/
17/10/23 18:08:50 INFO client.RMProxy: Connecting to ResourceManager at hadoop08/192.168.123.108:8032
17/10/23 18:08:50 WARN mapreduce.JobResourceUploader: Hadoop command-line option parsing not performed. Implement the Tool interface and execute your application with ToolRunner to remedy this.
17/10/23 18:08:51 INFO input.FileInputFormat: Total input paths to process : 1
17/10/23 18:08:51 INFO mapreduce.JobSubmitter: number of splits:1
17/10/23 18:08:51 INFO mapreduce.JobSubmitter: Submitting tokens for job: job_1508718609774_0001
17/10/23 18:08:51 INFO impl.YarnClientImpl: Submitted application application_1508718609774_0001
17/10/23 18:08:51 INFO mapreduce.Job: The url to track the job: http://hadoop08:8088/proxy/application_1508718609774_0001/
17/10/23 18:08:51 INFO mapreduce.Job: Running job: job_1508718609774_0001
17/10/23 18:09:00 INFO mapreduce.Job: Job job_1508718609774_0001 running in uber mode : false
17/10/23 18:09:00 INFO mapreduce.Job:  map 0% reduce 0%
17/10/23 18:09:05 INFO mapreduce.Job:  map 100% reduce 0%
17/10/23 18:09:13 INFO mapreduce.Job:  map 100% reduce 100%
17/10/23 18:09:13 INFO mapreduce.Job: Job job_1508718609774_0001 completed successfully
17/10/23 18:09:13 INFO mapreduce.Job: Counters: 49
        File System Counters
                FILE: Number of bytes read=168
                FILE: Number of bytes written=242337
                FILE: Number of read operations=0
                FILE: Number of large read operations=0
                FILE: Number of write operations=0
                HDFS: Number of bytes read=222
                HDFS: Number of bytes written=82
                HDFS: Number of read operations=6
                HDFS: Number of large read operations=0
                HDFS: Number of write operations=2
        Job Counters 
                Launched map tasks=1
                Launched reduce tasks=1
                Data-local map tasks=1
                Total time spent by all maps in occupied slots (ms)=3119
                Total time spent by all reduces in occupied slots (ms)=4149
                Total time spent by all map tasks (ms)=3119
                Total time spent by all reduce tasks (ms)=4149
                Total vcore-milliseconds taken by all map tasks=3119
                Total vcore-milliseconds taken by all reduce tasks=4149
                Total megabyte-milliseconds taken by all map tasks=3193856
                Total megabyte-milliseconds taken by all reduce tasks=4248576
        Map-Reduce Framework
                Map input records=8
                Map output records=22
                Map output bytes=297
                Map output materialized bytes=168
                Input split bytes=101
                Combine input records=22
                Combine output records=10
                Reduce input groups=10
                Reduce shuffle bytes=168
                Reduce input records=10
                Reduce output records=10
                Spilled Records=20
                Shuffled Maps =1
                Failed Shuffles=0
                Merged Map outputs=1
                GC time elapsed (ms)=110
                CPU time spent (ms)=780
                Physical memory (bytes) snapshot=292769792
                Virtual memory (bytes) snapshot=4127346688
                Total committed heap usage (bytes)=137498624
        Shuffle Errors
                BAD_ID=0
                CONNECTION=0
                IO_ERROR=0
                WRONG_LENGTH=0
                WRONG_MAP=0
                WRONG_REDUCE=0
        File Input Format Counters 
                Bytes Read=121
        File Output Format Counters 
                Bytes Written=82

 

Here are the run log and counter results without the Combiner:

[hadoop@hadoop08 ~]$ hadoop jar wc.jar com.ghgj.mazh.mapreduce.wc.demo2.WordCount /wc/input/ /wc/output2/
17/10/23 18:16:14 INFO client.RMProxy: Connecting to ResourceManager at hadoop08/192.168.123.108:8032
17/10/23 18:16:15 WARN mapreduce.JobResourceUploader: Hadoop command-line option parsing not performed. Implement the Tool interface and execute your application with ToolRunner to remedy this.
17/10/23 18:16:15 INFO input.FileInputFormat: Total input paths to process : 1
17/10/23 18:16:15 INFO mapreduce.JobSubmitter: number of splits:1
17/10/23 18:16:15 INFO mapreduce.JobSubmitter: Submitting tokens for job: job_1508718609774_0002
17/10/23 18:16:15 INFO impl.YarnClientImpl: Submitted application application_1508718609774_0002
17/10/23 18:16:15 INFO mapreduce.Job: The url to track the job: http://hadoop08:8088/proxy/application_1508718609774_0002/
17/10/23 18:16:15 INFO mapreduce.Job: Running job: job_1508718609774_0002
17/10/23 18:16:22 INFO mapreduce.Job: Job job_1508718609774_0002 running in uber mode : false
17/10/23 18:16:22 INFO mapreduce.Job:  map 0% reduce 0%
17/10/23 18:16:27 INFO mapreduce.Job:  map 100% reduce 0%
17/10/23 18:16:34 INFO mapreduce.Job:  map 100% reduce 100%
17/10/23 18:16:34 INFO mapreduce.Job: Job job_1508718609774_0002 completed successfully
17/10/23 18:16:34 INFO mapreduce.Job: Counters: 49
        File System Counters
                FILE: Number of bytes read=347
                FILE: Number of bytes written=242325
                FILE: Number of read operations=0
                FILE: Number of large read operations=0
                FILE: Number of write operations=0
                HDFS: Number of bytes read=222
                HDFS: Number of bytes written=82
                HDFS: Number of read operations=6
                HDFS: Number of large read operations=0
                HDFS: Number of write operations=2
        Job Counters 
                Launched map tasks=1
                Launched reduce tasks=1
                Data-local map tasks=1
                Total time spent by all maps in occupied slots (ms)=2458
                Total time spent by all reduces in occupied slots (ms)=4251
                Total time spent by all map tasks (ms)=2458
                Total time spent by all reduce tasks (ms)=4251
                Total vcore-milliseconds taken by all map tasks=2458
                Total vcore-milliseconds taken by all reduce tasks=4251
                Total megabyte-milliseconds taken by all map tasks=2516992
                Total megabyte-milliseconds taken by all reduce tasks=4353024
        Map-Reduce Framework
                Map input records=8
                Map output records=22
                Map output bytes=297
                Map output materialized bytes=347
                Input split bytes=101
                Combine input records=0
                Combine output records=0
                Reduce input groups=10
                Reduce shuffle bytes=347
                Reduce input records=22
                Reduce output records=10
                Spilled Records=44
                Shuffled Maps =1
                Failed Shuffles=0
                Merged Map outputs=1
                GC time elapsed (ms)=105
                CPU time spent (ms)=750
                Physical memory (bytes) snapshot=291364864
                Virtual memory (bytes) snapshot=4127207424
                Total committed heap usage (bytes)=138211328
        Shuffle Errors
                BAD_ID=0
                CONNECTION=0
                IO_ERROR=0
                WRONG_LENGTH=0
                WRONG_MAP=0
                WRONG_REDUCE=0
        File Input Format Counters 
                Bytes Read=121
        File Output Format Counters 
                Bytes Written=82

With the Combiner, the relevant counters are:

                Map input records=8
                Map output records=22
                Map output bytes=297
                Map output materialized bytes=168
                Input split bytes=101
                Combine input records=22
                Combine output records=10
                Reduce input groups=10
                Reduce shuffle bytes=168
                Reduce input records=10
                Reduce output records=10
                Spilled Records=20

Without the Combiner, they are:

                Map input records=8
                Map output records=22
                Map output bytes=297
                Map output materialized bytes=347
                Input split bytes=101
                Combine input records=0
                Combine output records=0
                Reduce input groups=10
                Reduce shuffle bytes=347
                Reduce input records=22
                Reduce output records=10
                Spilled Records=44

This is the change the Combiner produces. Comparing the two sets of counters: with the Combiner, Spilled Records on the map side dropped from 44 to 20, Map output materialized bytes and Reduce shuffle bytes fell from 347 to 168, and Reduce input records fell from 22 to 10. This is the Combiner's biggest benefit: it cuts the amount of data that must be spilled, shuffled, and transferred over the network from the Mapper side to the Reducer side.

3. Caveats When Using a Combiner

1. The difference between a Combiner and a Reducer is where they run: the Combiner runs on the node of each MapTask, while the Reducer receives the output of all Mappers globally.
 
2. The Combiner's input key-value types must match the Mapper's output key-value types, and its output key-value types must match the Reducer's input key-value types.
 
3. Use a Combiner with great care. In practice the Combiner's logic is very often identical to the Reducer's, and in those cases the Reducer class can simply be registered as the combiner. But the Combiner is an optional component of the MapReduce pipeline: the framework may invoke it zero, one, or several times. The rule is therefore: whether the Combiner runs or not, and however many times, it must never change the business logic or the final result. See the worked example below.
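
To make rule 3 concrete, here is a small worked example (hypothetical numbers, not taken from the job above). Suppose the goal were to compute the average of the values 1, 2, and 6, with (1, 2) processed by one MapTask and (6) by another. The correct answer is (1 + 2 + 6) / 3 = 3. If a Combiner averaged locally, the Reducer would receive 1.5 and 6 and compute (1.5 + 6) / 2 = 3.75, which is wrong. Summing, as in WordCount, is safe because addition is associative and commutative, so running the Combiner zero, one, or many times yields the same final counts; averaging is only safe if the Combiner also forwards the per-partition counts.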

Feedback and criticism welcome!

--------------------- 
Author: 中琦2513 
Source: CSDN 
Original: https://blog.csdn.net/zhongqi2513/article/details/78321355 
Copyright notice: this is the blogger's original article; please include a link to the original when reposting.
