In MapReduce, a Combiner performs an early, per-node aggregation of the map output, so that less data has to travel across the network to be aggregated at the Reducers. A Combiner is in fact a special Reducer. Its advantages can be summarized as follows:
(*) A Combiner is a special kind of Reducer.
(*) It runs on the map side and performs one round of local aggregation on the map output, reducing the amount of data sent to the reduce side.
(*) It reduces network overhead and improves performance.
(*) It is one form of optimization.
(*) In general, a Combiner is worth adding.
(*) Note: in some cases a Combiner cannot be used, for example division (computing an average); see the sketch after these notes.
Introducing a Combiner must not change the original logic of the job.
Note that the combiner's input types must be exactly the same as the reducer's input types, and its output types exactly the same as the map output types; otherwise you will get a class-cast error at runtime.
Because the map output types and the reducer input types are the same, the combiner's input and output types must therefore all agree (this is the concrete meaning of "introducing a combiner must not change the original logic").
Also, if the combiner's processing logic is identical to the reducer's, you can simply use the reducer class as the combiner.
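To see concretely why an average breaks: suppose one key has the map outputs 10, 20, 30, 40, whose true average is 25. If a combiner averaged the group {10, 20, 30} on one node (20) and {40} on another (40), the reducer would compute (20 + 40) / 2 = 30, which is wrong. The standard workaround is to ship (sum, count) pairs so the combiner only ever adds. The sketch below illustrates that pattern; it assumes the value is encoded as a Text of the form "sum,count" (a simplification for brevity; a custom Writable would be more idiomatic) and is not part of the employee example that follows.
//average-safe combiner (sketch)
import java.io.IOException;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
public class AvgCombiner extends Reducer<IntWritable, Text, IntWritable, Text>
{
@Override
protected void reduce(IntWritable key, Iterable<Text> values, Context context)
throws IOException, InterruptedException {
long sum = 0;
long count = 0;
//only add partial sums and counts; never divide here, otherwise the result
//would depend on how the framework happened to group the map output
for (Text value : values) {
String[] parts = value.toString().split(",");
sum += Long.parseLong(parts[0]);
count += Long.parseLong(parts[1]);
}
context.write(key, new Text(sum + "," + count));
}
}
The final reducer would accumulate the same way and perform the division sum / count only once, at the very end.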
Example code:
//entity class
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import org.apache.hadoop.io.Writable;
public class Employee implements Writable
{
private int empno; //employee number
private String name; //employee name
private String job; //employee's job
private int mgr; //employee's manager number
private String hiredate; //employee's hire date
private int sal; //employee's salary
private int comm; //employee's commission
private int deptno; //employee's department number
@Override
public String toString() {
return "Employee [empno=" + empno + ", name=" + name + ", job=" + job + ", mgr=" + mgr + ", hiredate="
+ hiredate + ", sal=" + sal + ", comm=" + comm + ", deptno=" + deptno + "]";
}
@Override
public void readFields(DataInput input) throws IOException {
//deserialize the fields in exactly the same order as write() serializes them,
//and assign each value back to its field (reading without assigning loses the data)
this.empno = input.readInt();
this.name = input.readUTF();
this.job = input.readUTF();
this.mgr = input.readInt();
this.hiredate = input.readUTF();
this.sal = input.readInt();
this.comm = input.readInt();
this.deptno = input.readInt();
}
@Override
public void write(DataOutput output) throws IOException {
//serialize in a fixed order; readFields() must read the fields back in the same order
output.writeInt(this.empno);
output.writeUTF(this.name);
output.writeUTF(this.job);
output.writeInt(this.mgr);
output.writeUTF(this.hiredate);
output.writeInt(this.sal);
output.writeInt(this.comm);
output.writeInt(this.deptno);
}
public int getEmpno() {
return empno;
}
public void setEmpno(int empno) {
this.empno = empno;
}
public String getName() {
return name;
}
public void setName(String name) {
this.name = name;
}
public String getJob() {
return job;
}
public void setJob(String job) {
this.job = job;
}
public int getMgr() {
return mgr;
}
public void setMgr(int mgr) {
this.mgr = mgr;
}
public String getHiredate() {
return hiredate;
}
public void setHiredate(String hiredate) {
this.hiredate = hiredate;
}
public int getSal() {
return sal;
}
public void setSal(int sal) {
this.sal = sal;
}
public int getComm() {
return comm;
}
public void setComm(int comm) {
this.comm = comm;
}
public int getDeptno() {
return deptno;
}
public void setDeptno(int deptno) {
this.deptno = deptno;
}
}
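One thing worth noting about Employee: it implements Writable, but in this job the mapper only emits IntWritable pairs, so Employee instances are never actually shuffled; the class just holds the parsed fields. If you want to verify the serialization anyway, a quick local round trip is enough. The sketch below is such a check, with made-up sample values; it is not part of the job.
//round-trip check for Employee (sketch, not part of the job)
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.IOException;
public class EmployeeRoundTrip
{
public static void main(String[] args) throws IOException {
Employee in = new Employee();
in.setEmpno(7369);
in.setName("SMITH");
in.setJob("CLERK");
in.setMgr(7902);
in.setHiredate("1980/12/17");
in.setSal(800);
in.setComm(0);
in.setDeptno(20);
//serialize to an in-memory byte buffer
ByteArrayOutputStream buffer = new ByteArrayOutputStream();
in.write(new DataOutputStream(buffer));
//deserialize into a fresh object; it should print the same field values
Employee out = new Employee();
out.readFields(new DataInputStream(new ByteArrayInputStream(buffer.toByteArray())));
System.out.println(out);
}
}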
//mapper
import java.io.IOException;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
public class EmployeeMapper extends Mapper<LongWritable, Text, IntWritable, IntWritable>
{
@Override
protected void map(LongWritable key1, Text value1, Context context)
throws IOException, InterruptedException {
//each input line is one comma-separated employee record
String[] words = value1.toString().split(",");
Employee employee = new Employee();
employee.setEmpno(Integer.valueOf(words[0]));
employee.setName(words[1]);
employee.setJob(words[2]);
employee.setMgr(Integer.valueOf(words[3]));
employee.setHiredate(words[4]);
employee.setSal(Integer.valueOf(words[5]));
employee.setComm(Integer.valueOf(words[6]));
employee.setDeptno(Integer.valueOf(words[7]));
context.write(new IntWritable(employee.getDeptno()), new IntWritable(employee.getSal()));
}
}
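For reference, the mapper assumes each input line is a comma-separated employee record with eight fields. A hypothetical line and the pair it produces (the sample values are illustrative, not from the original post):
7369,SMITH,CLERK,7902,1980/12/17,800,0,20
map output: (20, 800), i.e. the department number as the key and the salary as the value.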
//reducer
import java.io.IOException;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.mapreduce.Reducer;
public class EmployeeReducer extends Reducer<IntWritable, IntWritable, IntWritable, IntWritable>
{
@Override
protected void reduce(IntWritable key3, Iterable<IntWritable> value3,Context context)
throws IOException, InterruptedException {
int salary=0;
for (IntWritable intWritable : value3)
{
salary+=intWritable.get();
}
context.write(key3, new IntWritable(salary));
}
}
//combiner
import java.io.IOException;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.mapreduce.Reducer;
public class Combiner extends Reducer<IntWritable, IntWritable, IntWritable, IntWritable>
{
@Override
protected void reduce(IntWritable key3, Iterable<IntWritable> value3,Context context)
throws IOException, InterruptedException {
//pre-aggregate the salaries that share the same department number
int salary=0;
for (IntWritable sal : value3)
{
salary+=sal.get();
}
context.write(key3, new IntWritable(salary));
}
}
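Since this Combiner's reduce logic is identical to EmployeeReducer's, the separate class is optional: in the driver you could equally well register the reducer itself as the combiner, i.e. call job.setCombinerClass(EmployeeReducer.class) instead of job.setCombinerClass(Combiner.class), exactly as the notes at the top suggest.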
//jobmain
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
public class EmployeeJobMain
{
public static void main(String[] args) throws Exception {
//set up the job
Job job = Job.getInstance(new Configuration());
job.setJarByClass(EmployeeJobMain.class);
//configure the mapper, its output types, and the combiner
job.setMapperClass(EmployeeMapper.class);
job.setMapOutputKeyClass(IntWritable.class);
job.setMapOutputValueClass(IntWritable.class);
job.setCombinerClass(Combiner.class);
//configure the reducer and its output types
job.setReducerClass(EmployeeReducer.class);
job.setOutputKeyClass(IntWritable.class);
job.setOutputValueClass(IntWritable.class);
//configure the input and output paths
FileInputFormat.setInputPaths(job, new Path(args[0]));
FileOutputFormat.setOutputPath(job, new Path(args[1]));
//run the job and exit with a status code
System.exit(job.waitForCompletion(true) ? 0 : 1);
}
}
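To run the job, package the classes into a jar and submit it through the hadoop CLI. The jar name and HDFS paths below are placeholders, not values from the original post:
hadoop jar employee-job.jar EmployeeJobMain /input/emp.csv /output/dept_salary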