The essence of the Combiner:
After the map phase, if no merging is done on the map side, the data arriving at the reduce side looks like this: <a,1> <a,1> <a,1>. The reducer then has a heavy workload and consumes a lot of I/O, and there are far fewer reducers than mappers, so the merging work can be handed to the map side instead. That is exactly the Combiner's job; after the Combiner runs, the result set looks like this: <a,3> <b,2>.
What does the Reducer do? It aggregates, so the Combiner is essentially that same aggregation performed early, on the map side.
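To make this concrete, here is a minimal, self-contained sketch of the effect of that local pre-aggregation, outside Hadoop (the class name CombineSketch and its shape are illustrative, not part of any Hadoop API): the repeated <word, 1> pairs from one MapTask collapse into <word, count> before anything crosses the network.

import java.util.HashMap;
import java.util.List;
import java.util.Map;

// Illustrative only: models what a combiner achieves, without the Hadoop framework.
public class CombineSketch {
    // Collapse one MapTask's stream of <word, 1> pairs into local <word, count> pairs.
    static Map<String, Integer> combine(List<String> mapOutputWords) {
        Map<String, Integer> local = new HashMap<>();
        for (String word : mapOutputWords) {
            local.merge(word, 1, Integer::sum); // <a,1><a,1><a,1> becomes <a,3>
        }
        return local;
    }

    public static void main(String[] args) {
        // Five records leave the map phase, but only two would cross the network.
        System.out.println(combine(List.of("a", "a", "a", "b", "b"))); // {a=3, b=2}
    }
}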
Requirement:
During the count, perform a local summary of each MapTask's output to reduce the amount of data transferred over the network, i.e., enable the Combiner. Expectation: the Combine input record count is high, and after merging, the Combine output record count drops.
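After a run, this expectation can be checked in the job's counters, in the Map-Reduce Framework group of the console output. An illustrative excerpt with made-up numbers (the actual values depend on your input):

Combine input records=12
Combine output records=5

Without a Combiner both counters stay at 0; with one, the input count is large and the output count shrinks toward the number of distinct keys per MapTask.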
Requirement analysis:
The Combiner uses the same aggregation logic as the Reducer, so it can be written as a Reducer subclass and registered on the job with job.setCombinerClass(...).
Implementation (Option 1):
package com.isea.mr;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import java.io.IOException;
public class WordCountMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
    private Text k = new Text();
    private IntWritable v = new IntWritable(1);

    @Override // map() is invoked once for every input <k, v> pair
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        // 1. Get one line of input
        String line = value.toString();
        // 2. Split the line into words
        String[] words = line.split(" ");
        // 3. Emit <word, 1>; k and v are reused to avoid creating objects per record
        for (String word : words) {
            k.set(word);
            context.write(k, v);
        }
    }
}
package com.isea.mr;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
import java.io.IOException;
public class WordCountReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
    private IntWritable v = new IntWritable();

    @Override // reduce() is invoked once for each key
    protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
        // 1. Sum the counts; sum is a local variable so it resets for every key
        int sum = 0;
        for (IntWritable value : values) {
            sum += value.get(); // each value is 1, so this is effectively sum++
        }
        // 2. Emit <word, total>
        v.set(sum);
        context.write(key, v);
    }
}
package com.isea.mr;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import java.io.IOException;
public class WordCountDriver {
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        // Hard-coded local paths for testing
        args = new String[]{"g:/input/hello.txt", "g:/output"};

        // 1. Get the configuration and create the Job instance
        Configuration configuration = new Configuration();
        Job job = Job.getInstance(configuration);

        /*// Optionally switch to CombineTextInputFormat
        job.setInputFormatClass(CombineTextInputFormat.class);
        // Set the maximum virtual-storage split size to 4 MB
        CombineTextInputFormat.setMaxInputSplitSize(job, 1024 * 1024 * 4);*/

        // Register the Combiner, which runs on each MapTask's local output
        job.setCombinerClass(WordCountCombiner.class);
        // job.setNumReduceTasks(2);

        // 2. Specify the jar containing this program by its driver class
        job.setJarByClass(WordCountDriver.class);

        // 3. Attach the Mapper and Reducer classes to the job
        job.setMapperClass(WordCountMapper.class);
        job.setReducerClass(WordCountReducer.class);

        // 4. Specify the Mapper's output <k, v> types
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);

        // 5. Specify the job's final output <k, v> types
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        // 6. Specify the input path and the output path (the output directory must not already exist)
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        // 7. Submit the job and wait for completion
        boolean result = job.waitForCompletion(true);
        System.exit(result ? 0 : 1);
    }
}
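Because the Combiner's logic here is identical to the Reducer's, an equivalent setup (the counterpart to Option 1 above) is to skip the separate Combiner class and reuse the Reducer directly; this works because the Reducer's input and output <k, v> types match:

job.setCombinerClass(WordCountReducer.class);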
package com.isea.mr;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
import java.io.IOException;
public class WordCountCombiner extends Reducer<Text, IntWritable, Text, IntWritable> {
    private IntWritable v = new IntWritable();

    @Override // runs on each MapTask's local output before the shuffle
    protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
        // Sum the counts for this key locally
        int sum = 0;
        for (IntWritable value : values) {
            sum += value.get();
        }
        v.set(sum);
        context.write(key, v);
    }
}
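One closing caveat: a Combiner is only safe when the aggregation is commutative and associative, as summing is. Averaging is not: the mean of 3, 5, and 7 is 5, but combining 3 and 5 into their mean 4 and then averaging 4 with 7 yields 5.5. Hadoop also treats the Combiner as an optional optimization and may invoke it zero, one, or multiple times on a MapTask's output, so the job must produce correct results whether or not it runs.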