Combiner:
数据样例:
1363157984040 13602846565 5C-0E-8B-8B-B6-00:CMCC 120.197.40.4 2052.flash2-http.qq.com 综合门户 15 12 1938 2910 200
1363157984040 13602846565 5C-0E-8B-8B-B6-00:CMCC 120.197.40.4 2052.flash2-http.qq.com 综合门户 15 12 1938 2910 200
1363157984040 13602846565 5C-0E-8B-8B-B6-00:CMCC 120.197.40.4 2052.flash2-http.qq.com 综合门户 15 12 1938 2910 200
1363157984040 13602846565 5C-0E-8B-8B-B6-00:CMCC 120.197.40.4 2052.flash2-http.qq.com 综合门户 15 12 1938 2910 200
字段释义:
时间戳 ts long
手机号 phone String
基站编号 Id String
IP Ip String
url url String
url 类型 Type String
发送数据包 Send Int
接收数据包 Receive Int
上行流量 upflow Long
下行流量 downflow Long
响应 Status String
Combiner 是 MapReduce 程序中 Mapper 和 Reducer 之外的一种组件,它的作用是在 maptask之后给 maptask 的结果进行局部汇总,以减轻 reducetask 的计算负载,减少网络传输。
Combiner 和 Reducer 一样,编写一个类,然后继承 Reducer,reduce 方法中写具体的 Combiner逻辑,然后在 job 中设置 Combiner 组件:job.setCombinerClass(FlowSumCombine.class)
package flow;
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
/**
 * MapReduce job that sums the up-stream and down-stream traffic per phone number
 * and uses a Combiner to pre-aggregate map output before the shuffle.
 *
 * <p>Data flow:
 * <ul>
 *   <li>Mapper emits {@code phone -> "upFlow--downFlow"}.</li>
 *   <li>Combiner emits {@code phone -> "sumUpFlow--sumDownFlow"} (same format as its input).</li>
 *   <li>Reducer emits {@code phone -> "sumUpFlow\tsumDownFlow\tsumFlow"}.</li>
 * </ul>
 *
 * <p>Usage: {@code FlowExercise1MRWithCombiner <input path> <output path>}
 */
public class FlowExercise1MRWithCombiner {

    /** Separator between up-flow and down-flow inside intermediate (map/combine output) values. */
    private static final String FLOW_DELIMITER = "--";

    /**
     * Configures and submits the job, then exits with 0 on success and 1 on failure.
     *
     * @param args {@code args[0]} is the input path, {@code args[1]} is the output path;
     *             an existing output path is deleted recursively before the job starts
     * @throws Exception if job setup or execution fails
     */
    public static void main(String[] args) throws Exception {
        if (args.length < 2) {
            System.err.println("Usage: FlowExercise1MRWithCombiner <input path> <output path>");
            System.exit(2);
        }

        Configuration conf = new Configuration();
        conf.set("fs.defaultFS", "hdfs://hadoop01:9000");
        System.setProperty("HADOOP_USER_NAME", "hadoop");

        Job job = Job.getInstance(conf);
        job.setJarByClass(FlowExercise1MRWithCombiner.class);

        job.setMapperClass(FlowExercise1MRMapper.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);

        job.setReducerClass(FlowExercise1MRReducer.class);
        // If the reducer's output key/value types equal the mapper's output types,
        // the setMapOutput*Class() calls above may be omitted.
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);

        /*
         * No Combiner is used unless one is set explicitly.
         *
         * A Reducer can double as the Combiner only when its output can be fed back
         * into itself (typical for max, min, sum, count). Matching key/value *types*
         * is not sufficient: the value *format* must match as well. Averages cannot
         * be combined this way.
         *
         * Here the Reducer reads "up--down" but writes "up\tdown\ttotal", so using it
         * as the Combiner would make the real Reducer fail while parsing the combined
         * values. The dedicated Combiner keeps the "up--down" format instead.
         */
        job.setCombinerClass(FlowExercise1MRCombiner.class);

        Path inputPath = new Path(args[0]);
        Path outputPath = new Path(args[1]);
        FileInputFormat.setInputPaths(job, inputPath);

        // Remove a stale output directory so the job can be re-run with the same path.
        FileSystem fs = FileSystem.get(conf);
        if (fs.exists(outputPath)) {
            fs.delete(outputPath, true);
        }
        FileOutputFormat.setOutputPath(job, outputPath);

        boolean isDone = job.waitForCompletion(true);
        System.exit(isDone ? 0 : 1);
    }

    /**
     * Sums the up-flow and down-flow of every {@code "up--down"} encoded value.
     *
     * @param values intermediate values in {@code "upFlow--downFlow"} form
     * @return a two-element array: {@code [sumUpFlow, sumDownFlow]}
     * @throws NumberFormatException if a value does not contain two parsable longs
     */
    private static long[] sumFlows(Iterable<Text> values) {
        long sumUpFlow = 0;
        long sumDownFlow = 0;
        for (Text value : values) {
            String[] parts = value.toString().split(FLOW_DELIMITER);
            sumUpFlow += Long.parseLong(parts[0]);
            sumDownFlow += Long.parseLong(parts[1]);
        }
        return new long[] {sumUpFlow, sumDownFlow};
    }

    /**
     * Map phase: extracts the phone number and the two flow fields from each record.
     *
     * <p>The record is split on a single space; field 1 is used as the phone number,
     * field 8 as the up-flow and field 9 as the down-flow.
     */
    private static class FlowExercise1MRMapper extends Mapper<LongWritable, Text, Text, Text> {
        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            String[] split = value.toString().split(" ");
            String phone = split[1];
            String upFlow = split[8];
            String downFlow = split[9];
            context.write(new Text(phone), new Text(upFlow + FLOW_DELIMITER + downFlow));
        }
    }

    /**
     * Reduce phase: produces the final per-phone totals.
     *
     * <p>Input values are {@code "up--down"}; the output value is
     * {@code "sumUp\tsumDown\tsumTotal"}. Because input and output formats differ,
     * this class must not be registered as the Combiner.
     */
    private static class FlowExercise1MRReducer extends Reducer<Text, Text, Text, Text> {
        @Override
        protected void reduce(Text key, Iterable<Text> values, Context context)
                throws IOException, InterruptedException {
            long[] sums = sumFlows(values);
            long sumUpFlow = sums[0];
            long sumDownFlow = sums[1];
            long sumFlow = sumUpFlow + sumDownFlow;
            context.write(key, new Text(sumUpFlow + "\t" + sumDownFlow + "\t" + sumFlow));
        }
    }

    /**
     * Combine phase: partial per-phone sums over map output.
     *
     * <p>key: phone; value: {@code upFlow + "--" + downFlow}. The output uses the same
     * {@code "up--down"} format as the input, so the result is valid Reducer input
     * whether the Combiner is applied zero, one or several times.
     */
    private static class FlowExercise1MRCombiner extends Reducer<Text, Text, Text, Text> {
        @Override
        protected void reduce(Text key, Iterable<Text> values, Context context)
                throws IOException, InterruptedException {
            long[] sums = sumFlows(values);
            context.write(key, new Text(sums[0] + FLOW_DELIMITER + sums[1]));
        }
    }
}
注意事项:
1、Combiner 和 Reducer 的区别在于运行的位置:
Combiner 是在每一个 maptask 所在的节点运行
Reducer 是接收全局所有 Mapper 的输出结果
2、Combiner 的输出 kv 类型(以及 value 的内容格式)必须与 Reducer 的输入 kv 对应起来
3、Combiner 的使用要非常谨慎,因为 Combiner 在 MapReduce 过程中可能调用也可能不调用,可能调一次也可能调多次。
所以:Combiner 使用的原则是:有或没有都不能影响业务逻辑,都不能影响最终结果