需求
求下述数据中,总流量使用前10的用户信息。字段分别为:手机号、上行流量、下行流量、总流量。
13470253144 180 180 360
13509468723 7335 110349 117684
13560439638 918 4938 5856
13568436656 3597 25635 29232
13590439668 1116 954 2070
13630577991 6960 690 7650
13682846555 1938 2910 4848
13729199489 240 0 240
13736230513 2481 24681 27162
13768778790 120 120 240
13846544121 264 0 264
13956435636 132 1512 1644
13966251146 240 0 240
13975057813 11058 48243 59301
13992314666 3008 3720 6728
15043685818 3659 3538 7197
15910133277 3156 2936 6092
15959002129 1938 180 2118
18271575951 1527 2106 3633
18390173782 9531 2412 11943
84188413 4116 1432 5548
分析
Map端:JavaBean封装数据。
Reduce端:全局排序,取前十。
需要解决的问题:Reduce端只能启动一个reducetask处理数据才能做到全局排序,如果数据量过大,那么reducetask的效率极低。
如何解决上述问题?
reducetask只需要输出前10的数据,我们可以先从各maptask中获取各自的前10数据,reduce端接收到的数据只有各maptask中前10的数据,reducetask再比较这些数据,获取全局前10的数据。
借助combiner,筛选出各maptask前10的数据,再输出给reduce。
代码实现
1.创建FlowBean,用于封装数据,并按照总流量降序。
package com.aura.hadoop.topN;
import org.apache.hadoop.io.WritableComparable;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
/**
* @author panghu
* @description
* @create 2021-02-18-11:32
*/
/**
 * Hadoop-serializable record of one user's traffic, used as the MapReduce
 * key so records sort by total flow in descending order.
 * Fields mirror the input layout: phone, upFlow, downFlow, sumFlow.
 */
public class FlowBean implements WritableComparable<FlowBean> {

    private String phone;   // e.g. 13470253144
    private Long upFlow;    // upstream traffic
    private Long downFlow;  // downstream traffic
    private Long sumFlow;   // derived in set(): upFlow + downFlow

    /** Convenience setter: stores both directions and derives the total. */
    public void set(Long upFlow, Long downFlow) {
        this.upFlow = upFlow;
        this.downFlow = downFlow;
        this.sumFlow = upFlow + downFlow;
    }

    public String getPhone() {
        return phone;
    }

    public void setPhone(String phone) {
        this.phone = phone;
    }

    public Long getUpFlow() {
        return upFlow;
    }

    public void setUpFlow(Long upFlow) {
        this.upFlow = upFlow;
    }

    public Long getDownFlow() {
        return downFlow;
    }

    public void setDownFlow(Long downFlow) {
        this.downFlow = downFlow;
    }

    public Long getSumFlow() {
        return sumFlow;
    }

    public void setSumFlow(Long sumFlow) {
        this.sumFlow = sumFlow;
    }

    /** Tab-separated line matching the input columns plus the derived total. */
    @Override
    public String toString() {
        StringBuilder line = new StringBuilder();
        line.append(phone).append('\t')
            .append(upFlow).append('\t')
            .append(downFlow).append('\t')
            .append(sumFlow);
        return line.toString();
    }

    /**
     * Descending order by total flow: a larger sumFlow sorts first.
     *
     * @param o the bean to compare against
     * @return negative if this bean has the larger total, positive if smaller
     */
    @Override
    public int compareTo(FlowBean o) {
        return Long.compare(o.sumFlow, sumFlow);
    }

    /** Serialization order must mirror readFields exactly. */
    @Override
    public void write(DataOutput out) throws IOException {
        out.writeUTF(phone);
        out.writeLong(upFlow);
        out.writeLong(downFlow);
        out.writeLong(sumFlow);
    }

    /** Deserialization: field order must mirror write exactly. */
    @Override
    public void readFields(DataInput in) throws IOException {
        phone = in.readUTF();
        upFlow = in.readLong();
        downFlow = in.readLong();
        sumFlow = in.readLong();
    }
}
2.创建Mapper类,封装FlowBean。
package com.aura.hadoop.topN;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import java.io.IOException;
/**
* @author panghu
* @description
* @create 2021-02-18-11:33
*/
/**
 * Map side: parse one input line into a FlowBean key. The bean itself is the
 * key so the shuffle sorts records by total flow (see FlowBean.compareTo);
 * the value carries no information (NullWritable).
 */
public class TopNMapper extends Mapper<LongWritable, Text, FlowBean, NullWritable> {
    // Reused across map() calls; Hadoop serializes the key at write time, so
    // mutating a single instance per record is safe and avoids allocation.
    private final FlowBean fb = new FlowBean();

    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        // Split on any run of whitespace so both tab- and space-delimited
        // input parse (the original split only on "\t", but the sample data
        // appears space-separated — confirm against the real input files).
        String[] fields = value.toString().trim().split("\\s+");
        // Skip blank or malformed lines instead of failing the whole task.
        if (fields.length < 3) {
            return;
        }
        fb.setPhone(fields[0]);
        // Total flow is recomputed from up + down; a precomputed 4th column,
        // if present, is ignored.
        fb.set(Long.parseLong(fields[1]), Long.parseLong(fields[2]));
        context.write(fb, NullWritable.get());
    }
}
3.创建分组比较器,将所有数据分到同一组
package com.aura.hadoop.topN;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;
/**
* @author panghu
* @description
* @create 2021-02-18-11:48
*/
/**
 * Grouping comparator that treats every key as equal, so the single reduce
 * task receives all records in one reduce() call — a prerequisite for picking
 * the global top N from the sorted key stream.
 */
public class TopNGrouping extends WritableComparator{
// Register the key type (and ask WritableComparator to create instances).
public TopNGrouping() {
super(FlowBean.class,true);
}
// Returning 0 for every pair puts all records into a single group; within
// that group they still arrive sorted by FlowBean.compareTo (descending
// total flow), so the reducer sees the largest totals first.
@Override
public int compare(WritableComparable a, WritableComparable b) {
return 0;
}
}
4.创建reducer类,取数据前10。我们可以将此reducer类同时注册为combiner(combiner本质上就是一个在map端运行的reducer,并不是比较器)。
package com.aura.hadoop.topN;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.Reducer;
import java.io.IOException;
import java.util.Iterator;
/**
* @author panghu
* @description
* @create 2021-02-18-11:33
*/
/**
 * Emits the first N records of the (descending-sorted) group — i.e. the top N
 * by total flow. Also registered as the combiner so each map task pre-trims
 * its output to its local top N before the shuffle.
 */
public class TopNReducer extends Reducer<FlowBean, NullWritable, FlowBean, NullWritable> {
    /** Configuration key to override how many records to keep (default 10). */
    public static final String TOP_N_KEY = "topn.n";

    private int topN;

    @Override
    protected void setup(Context context) {
        // Generalized: N is configurable; defaults to the original 10.
        topN = context.getConfiguration().getInt(TOP_N_KEY, 10);
    }

    @Override
    protected void reduce(FlowBean key, Iterable<NullWritable> values, Context context) throws IOException, InterruptedException {
        // Hadoop reuses the key object: each iterator.next() deserializes the
        // NEXT record's fields into `key`. The iterator must therefore be
        // advanced once per emitted record, even though the values themselves
        // are all NullWritable.
        Iterator<NullWritable> iterator = values.iterator();
        int written = 0;
        while (written < topN && iterator.hasNext()) {
            // iterator.next() is evaluated before write(), refreshing `key`.
            context.write(key, iterator.next());
            written++;
        }
    }
}
5.创建driver类,指定分组比较器和combiner类。
package com.aura.hadoop.topN;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import java.io.IOException;
/**
* @author panghu
* @description
* @create 2021-02-18-11:33
*/
/**
 * Job driver: wires mapper, combiner, grouping comparator and reducer, then
 * submits the job. Input/output paths may be passed as args[0]/args[1];
 * without arguments the original hard-coded paths are used.
 */
public class TopNDriver {
    private static final String DEFAULT_INPUT = "D:\\data\\hadoopdata\\topN";
    private static final String DEFAULT_OUTPUT = "D:\\data\\out\\topN_out";

    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        Job job = Job.getInstance(new Configuration());
        job.setJarByClass(TopNDriver.class);
        job.setMapperClass(TopNMapper.class);
        job.setReducerClass(TopNReducer.class);
        // All keys compare equal in this comparator, so the single reducer
        // sees every record in one sorted group and can take the global top N.
        job.setGroupingComparatorClass(TopNGrouping.class);
        // Run the reducer as a combiner: each map task pre-trims to its local
        // top N, shrinking the data shuffled to the reducer.
        job.setCombinerClass(TopNReducer.class);
        job.setMapOutputKeyClass(FlowBean.class);
        job.setMapOutputValueClass(NullWritable.class);
        // Fix: declare the final output types too — previously left at the
        // framework defaults, which do not match the reducer's output types.
        job.setOutputKeyClass(FlowBean.class);
        job.setOutputValueClass(NullWritable.class);
        // Paths overridable from the command line; fall back to the original
        // hard-coded locations for backward compatibility.
        String input = args.length > 0 ? args[0] : DEFAULT_INPUT;
        String output = args.length > 1 ? args[1] : DEFAULT_OUTPUT;
        FileInputFormat.setInputPaths(job, new Path(input));
        FileOutputFormat.setOutputPath(job, new Path(output));
        boolean b = job.waitForCompletion(true);
        System.exit(b ? 0 : 1);
    }
}