A Detailed Look at the Combiner in MapReduce
1. What is a Combiner
A Combiner is a MapReduce component distinct from the Mapper and the Reducer. Its job is to locally aggregate each map task's output on the map side, which lightens the computational load on the reduce tasks and reduces network transfer.
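As a rough illustration (with made-up input), suppose one map task emits:
<hello,1>, <hello,1>, <hello,1>, <world,1>
After local combining, the map-side output becomes:
<hello,3>, <world,1>
so only two records instead of four are shuffled across the network to the reduce task.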
2. How to write a Combiner
A Combiner is written the same way as a Reducer: create a class that extends Reducer, put the combine logic in its reduce method, and then register the component on the job: job.setCombinerClass(WCCombiner.class)
Here is the WordCount code that uses a Combiner component:
package com.ghgj.mazh.mapreduce.wc.demo2;
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
/**
 * Word count (wordCount), using a Combiner.
 *
 * Three things to keep in mind when using a Combiner:
 *
 * 1. A Combiner differs from a Reducer in where it runs: the Combiner runs on the node of each MapTask,
 *    while the Reducer receives the output of all Mappers globally.
 * 2. The Combiner's input key-value types are the Mapper's output key-value types, and the Combiner's
 *    output key-value types must match the Reducer's input key-value types.
 * 3. Use the Combiner with great care: it is an optional component in the MapReduce pipeline, and the
 *    framework may invoke it zero, one, or many times.
 *    Hence the rule: whether the Combiner runs or not must never affect the business logic or the final result.
 */
public class WordCount_Combiner {
public static void main(String[] args) throws Exception {
// HDFS-related configuration
Configuration conf = new Configuration();
conf.set("fs.defaultFS", "hdfs://hadoop06:9000");
System.setProperty("HADOOP_USER_NAME", "hadoop");
Job job = Job.getInstance(conf);
// Set the class used to locate the job jar
job.setJarByClass(WordCount_Combiner.class);
// Specify the mapper, reducer, and combiner classes
job.setMapperClass(WCMapper.class);
job.setReducerClass(WCReducer.class);
job.setCombinerClass(WCCombiner.class);
// Specify the map task's output types; if they are the same as the reduce task's output key-value types, these two calls can be omitted
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(LongWritable.class);
// Specify the reduce task's output types
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(LongWritable.class);
// Local paths (for local testing)
// Path inputPath = new Path("d:/bigdata/wordcount/input");
// Path outputPath = new Path("d:/bigdata/wordcount/output");
Path inputPath = new Path(args[0]);
Path outputPath = new Path(args[1]);
FileSystem fs = FileSystem.get(conf);
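// Delete the output path if it already exists, so the job can be rerun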
if(fs.exists(outputPath)){
fs.delete(outputPath, true);
}
FileInputFormat.setInputPaths(job, inputPath);
FileOutputFormat.setOutputPath(job, outputPath);
// Finally, submit the job
boolean success = job.waitForCompletion(true);
System.exit(success ? 0 : 1);
}
/**
* Author: 马中华: http://blog.csdn.net/zhongqi2513
* Date: 2017-10-23 18:00:35
*
* Description: the Mapper component
*/
private static class WCMapper extends Mapper<LongWritable, Text, Text, LongWritable>{
@Override
protected void map(LongWritable key, Text value, Context context)
throws IOException, InterruptedException {
// Map task business logic goes here
String[] words = value.toString().split(" ");
for(String word: words){
context.write(new Text(word), new LongWritable(1));
}
}
}
/**
* Author: 马中华: http://blog.csdn.net/zhongqi2513
* Date: 2017-10-23 18:00:50
*
* Description: the Reducer component
*/
private static class WCReducer extends Reducer<Text, LongWritable, Text, LongWritable>{
@Override
protected void reduce(Text key, Iterable<LongWritable> values, Context context)
throws IOException, InterruptedException {
// Reduce task business logic goes here
long sum = 0;
for(LongWritable v: values){
sum += v.get();
}
context.write(key, new LongWritable(sum));
}
}
/**
* Author: 马中华: http://blog.csdn.net/zhongqi2513
* Date: 2017-10-23 18:01:02
*
* Description: the Combiner component
*/
private static class WCCombiner extends Reducer<Text, LongWritable, Text, LongWritable>{
@Override
protected void reduce(Text key, Iterable<LongWritable> values, Context context)
throws IOException, InterruptedException {
// Combiner business logic goes here
long sum = 0;
for(LongWritable v: values){
sum += v.get();
}
context.write(key, new LongWritable(sum));
}
}
}
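Note that WCCombiner above is identical, line for line, to WCReducer. For sum-style aggregations like word count, a common shortcut is therefore to register the reducer class itself as the combiner, and only write a separate combiner class when the combine logic must differ from the reduce logic:
job.setCombinerClass(WCReducer.class);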
For comparison, here is the WordCount code without a Combiner component:
package com.ghgj.mazh.mapreduce.wc.demo2;
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
/**
 * Word count (wordCount), without a Combiner, for comparison.
 * See the notes on Combiner usage in the version above.
 */
public class WordCount {
public static void main(String[] args) throws Exception {
// HDFS-related configuration
Configuration conf = new Configuration();
conf.set("fs.defaultFS", "hdfs://hadoop06:9000");
System.setProperty("HADOOP_USER_NAME", "hadoop");
Job job = Job.getInstance(conf);
// Set the class used to locate the job jar
job.setJarByClass(WordCount.class);
// Specify the mapper and reducer classes
job.setMapperClass(WCMapper.class);
job.setReducerClass(WCReducer.class);
// Specify the map task's output types; if they are the same as the reduce task's output key-value types, these two calls can be omitted
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(LongWritable.class);
// Specify the reduce task's output types
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(LongWritable.class);
// Local paths (for local testing)
// Path inputPath = new Path("d:/bigdata/wordcount/input");
// Path outputPath = new Path("d:/bigdata/wordcount/output");
Path inputPath = new Path(args[0]);
Path outputPath = new Path(args[1]);
FileSystem fs = FileSystem.get(conf);
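// Delete the output path if it already exists, so the job can be rerun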
if(fs.exists(outputPath)){
fs.delete(outputPath, true);
}
FileInputFormat.setInputPaths(job, inputPath);
FileOutputFormat.setOutputPath(job, outputPath);
// Finally, submit the job
boolean success = job.waitForCompletion(true);
System.exit(success ? 0 : 1);
}
/**
* Author: 马中华: http://blog.csdn.net/zhongqi2513
* Date: 2017-10-23 18:00:35
*
* Description: the Mapper component
*/
private static class WCMapper extends Mapper<LongWritable, Text, Text, LongWritable>{
@Override
protected void map(LongWritable key, Text value, Context context)
throws IOException, InterruptedException {
// Map task business logic goes here
String[] words = value.toString().split(" ");
for(String word: words){
context.write(new Text(word), new LongWritable(1));
}
}
}
/**
* Author: 马中华: http://blog.csdn.net/zhongqi2513
* Date: 2017-10-23 18:00:50
*
* Description: the Reducer component
*/
private static class WCReducer extends Reducer<Text, LongWritable, Text, LongWritable>{
@Override
protected void reduce(Text key, Iterable<LongWritable> values, Context context)
throws IOException, InterruptedException {
// Reduce task business logic goes here
long sum = 0;
for(LongWritable v: values){
sum += v.get();
}
context.write(key, new LongWritable(sum));
}
}
}
Now let's compare the runs with and without the Combiner.
Here are the run log and counters with the Combiner:
[hadoop@hadoop08 ~]$ hadoop jar wc.jar com.ghgj.mazh.mapreduce.wc.demo2.WordCount_Combiner /wc/input/ /wc/output/
17/10/23 18:08:50 INFO client.RMProxy: Connecting to ResourceManager at hadoop08/192.168.123.108:8032
17/10/23 18:08:50 WARN mapreduce.JobResourceUploader: Hadoop command-line option parsing not performed. Implement the Tool interface and execute your application with ToolRunner to remedy this.
17/10/23 18:08:51 INFO input.FileInputFormat: Total input paths to process : 1
17/10/23 18:08:51 INFO mapreduce.JobSubmitter: number of splits:1
17/10/23 18:08:51 INFO mapreduce.JobSubmitter: Submitting tokens for job: job_1508718609774_0001
17/10/23 18:08:51 INFO impl.YarnClientImpl: Submitted application application_1508718609774_0001
17/10/23 18:08:51 INFO mapreduce.Job: The url to track the job: http://hadoop08:8088/proxy/application_1508718609774_0001/
17/10/23 18:08:51 INFO mapreduce.Job: Running job: job_1508718609774_0001
17/10/23 18:09:00 INFO mapreduce.Job: Job job_1508718609774_0001 running in uber mode : false
17/10/23 18:09:00 INFO mapreduce.Job: map 0% reduce 0%
17/10/23 18:09:05 INFO mapreduce.Job: map 100% reduce 0%
17/10/23 18:09:13 INFO mapreduce.Job: map 100% reduce 100%
17/10/23 18:09:13 INFO mapreduce.Job: Job job_1508718609774_0001 completed successfully
17/10/23 18:09:13 INFO mapreduce.Job: Counters: 49
File System Counters
FILE: Number of bytes read=168
FILE: Number of bytes written=242337
FILE: Number of read operations=0
FILE: Number of large read operations=0
FILE: Number of write operations=0
HDFS: Number of bytes read=222
HDFS: Number of bytes written=82
HDFS: Number of read operations=6
HDFS: Number of large read operations=0
HDFS: Number of write operations=2
Job Counters
Launched map tasks=1
Launched reduce tasks=1
Data-local map tasks=1
Total time spent by all maps in occupied slots (ms)=3119
Total time spent by all reduces in occupied slots (ms)=4149
Total time spent by all map tasks (ms)=3119
Total time spent by all reduce tasks (ms)=4149
Total vcore-milliseconds taken by all map tasks=3119
Total vcore-milliseconds taken by all reduce tasks=4149
Total megabyte-milliseconds taken by all map tasks=3193856
Total megabyte-milliseconds taken by all reduce tasks=4248576
Map-Reduce Framework
Map input records=8
Map output records=22
Map output bytes=297
Map output materialized bytes=168
Input split bytes=101
Combine input records=22
Combine output records=10
Reduce input groups=10
Reduce shuffle bytes=168
Reduce input records=10
Reduce output records=10
Spilled Records=20
Shuffled Maps =1
Failed Shuffles=0
Merged Map outputs=1
GC time elapsed (ms)=110
CPU time spent (ms)=780
Physical memory (bytes) snapshot=292769792
Virtual memory (bytes) snapshot=4127346688
Total committed heap usage (bytes)=137498624
Shuffle Errors
BAD_ID=0
CONNECTION=0
IO_ERROR=0
WRONG_LENGTH=0
WRONG_MAP=0
WRONG_REDUCE=0
File Input Format Counters
Bytes Read=121
File Output Format Counters
Bytes Written=82
And here are the run log and counters without the Combiner:
[hadoop@hadoop08 ~]$ hadoop jar wc.jar com.ghgj.mazh.mapreduce.wc.demo2.WordCount /wc/input/ /wc/output2/
17/10/23 18:16:14 INFO client.RMProxy: Connecting to ResourceManager at hadoop08/192.168.123.108:8032
17/10/23 18:16:15 WARN mapreduce.JobResourceUploader: Hadoop command-line option parsing not performed. Implement the Tool interface and execute your application with ToolRunner to remedy this.
17/10/23 18:16:15 INFO input.FileInputFormat: Total input paths to process : 1
17/10/23 18:16:15 INFO mapreduce.JobSubmitter: number of splits:1
17/10/23 18:16:15 INFO mapreduce.JobSubmitter: Submitting tokens for job: job_1508718609774_0002
17/10/23 18:16:15 INFO impl.YarnClientImpl: Submitted application application_1508718609774_0002
17/10/23 18:16:15 INFO mapreduce.Job: The url to track the job: http://hadoop08:8088/proxy/application_1508718609774_0002/
17/10/23 18:16:15 INFO mapreduce.Job: Running job: job_1508718609774_0002
17/10/23 18:16:22 INFO mapreduce.Job: Job job_1508718609774_0002 running in uber mode : false
17/10/23 18:16:22 INFO mapreduce.Job: map 0% reduce 0%
17/10/23 18:16:27 INFO mapreduce.Job: map 100% reduce 0%
17/10/23 18:16:34 INFO mapreduce.Job: map 100% reduce 100%
17/10/23 18:16:34 INFO mapreduce.Job: Job job_1508718609774_0002 completed successfully
17/10/23 18:16:34 INFO mapreduce.Job: Counters: 49
File System Counters
FILE: Number of bytes read=347
FILE: Number of bytes written=242325
FILE: Number of read operations=0
FILE: Number of large read operations=0
FILE: Number of write operations=0
HDFS: Number of bytes read=222
HDFS: Number of bytes written=82
HDFS: Number of read operations=6
HDFS: Number of large read operations=0
HDFS: Number of write operations=2
Job Counters
Launched map tasks=1
Launched reduce tasks=1
Data-local map tasks=1
Total time spent by all maps in occupied slots (ms)=2458
Total time spent by all reduces in occupied slots (ms)=4251
Total time spent by all map tasks (ms)=2458
Total time spent by all reduce tasks (ms)=4251
Total vcore-milliseconds taken by all map tasks=2458
Total vcore-milliseconds taken by all reduce tasks=4251
Total megabyte-milliseconds taken by all map tasks=2516992
Total megabyte-milliseconds taken by all reduce tasks=4353024
Map-Reduce Framework
Map input records=8
Map output records=22
Map output bytes=297
Map output materialized bytes=347
Input split bytes=101
Combine input records=0
Combine output records=0
Reduce input groups=10
Reduce shuffle bytes=347
Reduce input records=22
Reduce output records=10
Spilled Records=44
Shuffled Maps =1
Failed Shuffles=0
Merged Map outputs=1
GC time elapsed (ms)=105
CPU time spent (ms)=750
Physical memory (bytes) snapshot=291364864
Virtual memory (bytes) snapshot=4127207424
Total committed heap usage (bytes)=138211328
Shuffle Errors
BAD_ID=0
CONNECTION=0
IO_ERROR=0
WRONG_LENGTH=0
WRONG_MAP=0
WRONG_REDUCE=0
File Input Format Counters
Bytes Read=121
File Output Format Counters
Bytes Written=82
With the Combiner:
Map input records=8
Map output records=22
Map output bytes=297
Map output materialized bytes=168
Input split bytes=101
Combine input records=22
Combine output records=10
Reduce input groups=10
Reduce shuffle bytes=168
Reduce input records=10
Reduce output records=10
Spilled Records=20
Without the Combiner:
Map input records=8
Map output records=22
Map output bytes=297
Map output materialized bytes=347
Input split bytes=101
Combine input records=0
Combine output records=0
Reduce input groups=10
Reduce shuffle bytes=347
Reduce input records=22
Reduce output records=10
Spilled Records=44
These are the visible differences between running with and without the Combiner.
Comparing the two sets of counters: with the Combiner, the 22 map output records are merged into 10 on the map side (Combine input records=22, Combine output records=10), so the reducer receives 10 records instead of 22 (Reduce input records), spilled records drop from 44 to 20, and the shuffled data shrinks from 347 bytes to 168 (Reduce shuffle bytes). This is the Combiner's biggest payoff: less data transferred over the network from the map side to the reduce side.
3. Notes on using the Combiner
1. A Combiner differs from a Reducer in where it runs: the Combiner runs on the node of each MapTask, while the Reducer receives the output of all Mappers globally.
2. The Combiner's input key-value types are the Mapper's output key-value types, and the Combiner's output key-value types must match the Reducer's input key-value types.
3. Use the Combiner with great care. In practice the Combiner's logic is usually identical to the Reducer's, so in those cases the Reducer class can be registered directly as the Combiner. But because the Combiner is an optional component that the framework may invoke zero, one, or many times, the guiding rule is: whether or not the Combiner runs must never affect the business logic or the final result. The sketch below shows what goes wrong when this rule is violated.
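As a concrete illustration of note 3, here is a minimal, hypothetical sketch (AvgReducer and AvgCombiner are invented names, not part of the WordCount code above). A reducer that computes a per-key average must not be reused as a combiner, because an average of map-local averages is not the overall average; a safe combiner instead emits mergeable partial results, such as a sum together with a count.
import java.io.IOException;
import org.apache.hadoop.io.DoubleWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
// WRONG as a combiner: if this averaging reducer is registered via
// job.setCombinerClass(AvgReducer.class), map-local averages get averaged
// again on the reduce side. For values 1, 2 on one map task and 6 on another:
// avg(avg(1,2), avg(6)) = avg(1.5, 6) = 3.75, but the true answer is
// avg(1,2,6) = 3. Whether the combiner runs would change the result,
// violating note 3. (Sums, counts, min, and max do not have this problem.)
class AvgReducer extends Reducer<Text, LongWritable, Text, DoubleWritable> {
    @Override
    protected void reduce(Text key, Iterable<LongWritable> values, Context context)
            throws IOException, InterruptedException {
        long sum = 0;
        long count = 0;
        for (LongWritable v : values) {
            sum += v.get();
            count++;
        }
        context.write(key, new DoubleWritable((double) sum / count));
    }
}
// SAFE design: keep the intermediate values mergeable. The mapper emits
// "sum,count" pairs (encoded as Text here for brevity, e.g. "5,1"), this
// combiner merges pairs into pairs, and only the reducer performs the final
// division. Because its input and output types are identical, the framework
// may apply this combiner zero, one, or many times without changing the result.
class AvgCombiner extends Reducer<Text, Text, Text, Text> {
    @Override
    protected void reduce(Text key, Iterable<Text> values, Context context)
            throws IOException, InterruptedException {
        long sum = 0;
        long count = 0;
        for (Text v : values) {
            String[] parts = v.toString().split(",");
            sum += Long.parseLong(parts[0]);
            count += Long.parseLong(parts[1]);
        }
        context.write(key, new Text(sum + "," + count));
    }
}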
Comments and criticism welcome!