Hadoop自定义组件Combiner
Combiner
组件介绍
Combiner是一个特殊的Reduce组件 ,它处于Mapper和Reduce中间的一种组件,Combiner组件的父类就是Reducer.
Combiner
和Reducer
之间的区别在于运行的位置 ,Reducer是每一个接收全局的Map Task 所输出的结果,Combiner一般是在MapTask的节点中运行.
combiner
每一个map都会产生大量的本地输出,Combiner的作用就是对map输出的结果先做一次合并,以减少map和reduce节点之间的数据传输量,Combiner的存在就是提高当前网络IO传输的性能,是MapReduce的一种优化方法。
并不是所有情况下都能使用Combiner,Combiner适用于对记录汇总的场景(如求和),但是,求平均数的场景不适用于Combiner。如果可以使用Combiner,一般情况下,和我们的reduce函数是一致的。
combiner
特点
-
Combiner
是MR程序中Mapper
和Reduce
之外的一种组件,它的父类是Reducer
,它们的区别在于运行的位置 -
Reduce
阶段的Reducer
是每一个接收全局的Map Task
所输出的结果 -
Combiner
是在合并排序后运行的。因此map端和reduce端都可以调用此函数。 -
Combiner
的存在就是提高当前网络IO传输的性能,是MapReduce的一种优化手段。 -
Combiner
在驱动类中的设置:job.setCombinerClass(MyCombiner.class);
实现思路
- 继承
Reducer
类 - 重写
reduce
方法,根据业务需求处理数据,处理完后调用context.write()
方法写出数据即可。
案例
需求分析
收集用户的电影评分数据,统计每个用户评分最高的10部电影
电影评分数据
电影名 评分 日期 用户名
move14 01 2012-12-38 user1
move14 01 2012-12-38 user3
move14 01 2012-12-38 user5
move13 02 2012-12-37 user1
move13 02 2012-12-37 user4
move13 02 2012-12-37 user6
move12 03 2012-12-36 user1
move12 03 2012-12-36 user5
move12 03 2012-12-36 user7
move11 04 2012-12-35 user1
move11 04 2012-12-35 user6
move11 04 2012-12-35 user8
move10 05 2012-12-34 user1
move10 05 2012-12-34 user7
move10 05 2012-12-34 user9
move09 06 2012-12-33 user1
move09 06 2012-12-33 user8
move09 06 2012-12-33 user10
move08 07 2012-12-32 user1
move08 07 2012-12-32 user9
move08 07 2012-12-32 user11
move07 08 2012-12-31 user1
move07 08 2012-12-31 user10
move07 08 2012-12-31 user12
move06 09 2012-12-30 user1
move06 09 2012-12-30 user11
move06 09 2012-12-30 user13
move05 10 2012-12-29 user1
move05 10 2012-12-29 user12
move05 10 2012-12-29 user14
move04 11 2012-12-28 user1
move04 11 2012-12-28 user13
move04 11 2012-12-28 user15
move03 12 2012-12-27 user1
move03 12 2012-12-27 user14
move03 12 2012-12-27 user16
move02 13 2012-12-26 user1
move02 13 2012-12-26 user15
move02 13 2012-12-26 user17
move01 14 2012-12-25 user1
move01 14 2012-12-25 user16
move01 14 2012-12-25 user18
move01 14 2012-12-24 user2
move01 14 2012-12-24 user17
move01 14 2012-12-24 user19
move02 13 2012-12-23 user2
move02 13 2012-12-23 user18
move02 13 2012-12-23 user20
move03 12 2012-12-22 user2
move03 12 2012-12-22 user19
move03 12 2012-12-22 user21
move04 11 2012-12-21 user2
move04 11 2012-12-21 user20
move04 11 2012-12-21 user22
move05 10 2012-12-20 user2
move05 10 2012-12-20 user21
move05 10 2012-12-20 user23
move06 09 2012-12-19 user2
move06 09 2012-12-19 user22
move06 09 2012-12-19 user24
move07 08 2012-12-18 user2
move07 08 2012-12-18 user23
move07 08 2012-12-18 user25
move08 07 2012-12-17 user2
move08 07 2012-12-17 user24
move08 07 2012-12-17 user26
move09 06 2012-12-16 user2
move09 06 2012-12-16 user25
move09 06 2012-12-16 user27
move10 05 2012-12-15 user2
move10 05 2012-12-15 user26
move10 05 2012-12-15 user28
move11 04 2012-12-14 user2
move11 04 2012-12-14 user27
move11 04 2012-12-14 user29
move12 03 2012-12-13 user2
move12 03 2012-12-13 user28
move12 03 2012-12-13 user30
move13 02 2012-12-12 user2
move13 02 2012-12-12 user29
move13 02 2012-12-12 user31
move14 01 2012-12-11 user2
move14 01 2012-12-11 user30
move14 01 2012-12-11 user32
代码实现
电影Bean
package hadoop.mr.custom.combiner;
import org.apache.hadoop.io.WritableComparable;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
/**
 * Writable value object representing one movie-rating record
 * (movie name, score, date, user name).
 *
 * Implements {@link WritableComparable} so Hadoop can both serialize the
 * bean between map and reduce tasks and order instances by user name.
 */
public class MovieBean implements WritableComparable<MovieBean> {
    private String moviename;
    private Integer score;
    private String date;
    private String username;

    public String getMoviename() {
        return moviename;
    }

    public void setMoviename(String moviename) {
        this.moviename = moviename;
    }

    public Integer getScore() {
        return score;
    }

    public void setScore(Integer score) {
        this.score = score;
    }

    public String getDate() {
        return date;
    }

    public void setDate(String date) {
        this.date = date;
    }

    public String getUsername() {
        return username;
    }

    public void setUsername(String username) {
        this.username = username;
    }

    /**
     * Orders beans lexicographically by user name only; the other fields do
     * not take part in the comparison.
     *
     * @param o the bean to compare against
     * @return negative, zero or positive per {@link String#compareTo}
     */
    @Override
    public int compareTo(MovieBean o) {
        return getUsername().compareTo(o.getUsername());
    }

    /**
     * Serializes all four fields in declaration order.
     * NOTE(review): assumes no field is null — writeUTF/writeInt would throw
     * a NullPointerException otherwise; confirm callers always populate beans.
     */
    @Override
    public void write(DataOutput out) throws IOException {
        out.writeUTF(moviename);
        out.writeInt(score);
        out.writeUTF(date);
        out.writeUTF(username);
    }

    /** Deserializes the fields in the same order {@link #write} emits them. */
    @Override
    public void readFields(DataInput in) throws IOException {
        this.moviename = in.readUTF();
        this.score = in.readInt();
        this.date = in.readUTF();
        this.username = in.readUTF();
    }

    @Override
    public String toString() {
        // Builds exactly the same string as the original concatenation form.
        return new StringBuilder("MovieBean{")
                .append("moviename='").append(moviename).append('\'')
                .append(", score=").append(score)
                .append(", date='").append(date).append('\'')
                .append(", username='").append(username).append('\'')
                .append('}')
                .toString();
    }
}
自定义组件combiner
package hadoop.mr.custom.combiner;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
import java.io.IOException;
import java.util.Comparator;
import java.util.Iterator;
import java.util.Map;
import java.util.TreeMap;
/**
 * Custom MapReduce Combiner component.
 *
 * A Combiner is a special Reducer that sits between the Mapper and the
 * Reducer; its parent class is {@link Reducer}. Unlike the Reducer — which
 * receives the global output of all map tasks — a Combiner runs on the map
 * task's node and pre-aggregates local output to cut network I/O.
 *
 * This combiner keeps, for each user key, only that user's top-scored movies.
 */
public class MovieCombiner extends Reducer<Text, MovieBean, Text, MovieBean> {
    /** Number of top-rated movies to keep per user. */
    private static final int TOP_N = 10;

    /**
     * Emits at most {@link #TOP_N} movies for the given user, ordered by
     * descending score.
     *
     * Fixes over the original implementation: the shared instance-level
     * TreeMap (a) mixed every user's movies into one structure, (b) was keyed
     * by score and so silently dropped movies with duplicate scores, and
     * (c) cleanup() re-emitted the entire map nine times instead of limiting
     * output to ten entries. All state is now local to one reduce() call.
     *
     * @param key     the user name
     * @param values  that user's rated movies
     * @param context output collector
     */
    @Override
    protected void reduce(Text key, Iterable<MovieBean> values, Context context)
            throws IOException, InterruptedException {
        // Fixed-size buffer kept sorted by descending score.
        MovieBean[] top = new MovieBean[TOP_N];
        int size = 0;
        for (MovieBean value : values) {
            // Hadoop reuses the value instance across iterations, so a
            // defensive copy is required before retaining a reference.
            MovieBean bean = copyOf(value);
            if (size < TOP_N) {
                top[size++] = bean;
            } else if (bean.getScore() > top[TOP_N - 1].getScore()) {
                top[TOP_N - 1] = bean; // evict the current lowest entry
            } else {
                continue; // not better than the current 10th place
            }
            // Bubble the newly placed bean up to its sorted position.
            for (int i = size - 1; i > 0 && top[i].getScore() > top[i - 1].getScore(); i--) {
                MovieBean tmp = top[i];
                top[i] = top[i - 1];
                top[i - 1] = tmp;
            }
        }
        for (int i = 0; i < size; i++) {
            context.write(key, top[i]);
        }
    }

    /** Returns a field-by-field copy of {@code src} (all field types are immutable). */
    private static MovieBean copyOf(MovieBean src) {
        MovieBean bean = new MovieBean();
        bean.setMoviename(src.getMoviename());
        bean.setScore(src.getScore());
        bean.setDate(src.getDate());
        bean.setUsername(src.getUsername());
        return bean;
    }
}
Mapper程序
package hadoop.mr.custom.combiner;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import java.io.IOException;
/**
 * Maps one rating line ("movie score date user") to (user, MovieBean).
 */
public class MovieMapper extends Mapper<LongWritable, Text, Text, MovieBean> {
    /**
     * @param key     byte offset of the line within the input split
     * @param value   one whitespace-separated rating record
     */
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        // Offset 0 is the header line — skip it.
        // NOTE(review): this only skips the header of the first split of the
        // first file; verify behavior for multi-file/multi-split input.
        if (key.get() == 0) {
            return;
        }
        // \s+ tolerates runs of spaces/tabs; the original "\\s" produced
        // empty tokens whenever two whitespace characters were adjacent.
        String[] fields = value.toString().trim().split("\\s+");
        if (fields.length < 4) {
            return; // ignore blank or malformed lines instead of crashing
        }
        MovieBean bean = new MovieBean();
        bean.setMoviename(fields[0]);
        bean.setScore(Integer.parseInt(fields[1]));
        bean.setDate(fields[2]);
        bean.setUsername(fields[3]);
        // Key by user so the combiner/reducer group ratings per user.
        context.write(new Text(bean.getUsername()), bean);
    }
}
Reducer程序
package hadoop.mr.custom.combiner;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
import java.io.IOException;
import java.util.Iterator;
/**
 * Pass-through reducer: writes every movie bean received for a user
 * unchanged, keyed by that user (the combiner already did the filtering).
 */
public class MovieReducer extends Reducer<Text, MovieBean, Text, MovieBean> {
    @Override
    protected void reduce(Text key, Iterable<MovieBean> values, Context context) throws IOException, InterruptedException {
        // Identity emit — one output record per incoming value.
        for (MovieBean bean : values) {
            context.write(key, bean);
        }
    }
}
MapReduce
主入口
package hadoop.mr.custom.combiner;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import util.FolderUtil;
/**
 * Job driver: wires the movie mapper, combiner and reducer together.
 */
public class MovieDriver extends Configured implements Tool {
    /**
     * Configures and runs the MapReduce job.
     *
     * @param args args[0] = input directory, args[1] = output directory
     * @return 0 on success, 1 on failure — the original always returned 0,
     *         hiding job failures from any caller checking the exit status
     */
    @Override
    public int run(String[] args) throws Exception {
        Job job = Job.getInstance();
        job.setJobName(MovieDriver.class.getName());
        job.setJarByClass(MovieDriver.class);
        job.setMapperClass(MovieMapper.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(MovieBean.class);
        job.setReducerClass(MovieReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(MovieBean.class);
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        // Register the custom combiner; it runs map-side to shrink shuffle data.
        job.setCombinerClass(MovieCombiner.class);
        boolean success = job.waitForCompletion(true);
        System.out.println(success ? "执行成功" : "执行失败");
        return success ? 0 : 1;
    }

    public static void main(String[] args) throws Exception {
        // Hard-coded local paths for the tutorial run.
        args = new String[]{"D:\\BigData\\hadoop\\mr\\custom\\combiner\\input", "D:\\BigData\\hadoop\\mr\\custom\\combiner\\output"};
        MovieDriver movieDriver = new MovieDriver();
        // Delete any previous output dir — Hadoop refuses to overwrite it.
        FolderUtil.delFolder(args[1]);
        movieDriver.run(args);
    }
}