MR之TopN原理分析与实现

需求

求下述数据中,总流量使用前10的用户信息。字段分别为:手机号、上行流量、下行流量、总流量。

13470253144	180	180	360
13509468723	7335	110349	117684
13560439638	918	4938	5856
13568436656	3597	25635	29232
13590439668	1116	954	2070
13630577991	6960	690	7650
13682846555	1938	2910	4848
13729199489	240	0	240
13736230513	2481	24681	27162
13768778790	120	120	240
13846544121	264	0	264
13956435636	132	1512	1644
13966251146	240	0	240
13975057813	11058	48243	59301
13992314666	3008	3720	6728
15043685818	3659	3538	7197
15910133277	3156	2936	6092
15959002129	1938	180	2118
18271575951	1527	2106	3633
18390173782	9531	2412	11943
84188413	4116	1432	5548

分析

Map端:JavaBean封装数据。
Reduce端:全局排序,取前十。
需要解决的问题:Reduce端只能启动一个reducetask处理数据才能做到全局排序,如果数据量过大,那么reducetask的效率极低。

如何解决上述问题?

reducetask只需要输出前10的数据,我们可以先从各maptask中获取各自的前10数据,reduce端接收到的数据只有各maptask中前10的数据,reducetask再比较这些数据,获取全局前10的数据。
借助combiner,筛选出各maptask前10的数据,再输出给reduce。

代码实现

1.创建FlowBean,用于封装数据,并按照总流量降序。

package com.aura.hadoop.topN;

import org.apache.hadoop.io.WritableComparable;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;


/**
 * @author panghu
 * @description
 * @create 2021-02-18-11:32
 */
/**
 * Writable key bean holding one user's traffic record:
 * phone number, upstream flow, downstream flow and their sum.
 * Sorts descending by total flow so the shuffle delivers records
 * top-first to the reducer.
 */
public class FlowBean implements WritableComparable<FlowBean>{
    // Example input record: 13470253144	180	180	360
    private String phone;    // user's phone number (must be set before serialization)
    private Long upFlow;     // upstream traffic
    private Long downFlow;   // downstream traffic
    private Long sumFlow;    // derived: upFlow + downFlow

    /**
     * Sets the up/down flow and derives the total.
     *
     * @param upFlow   upstream traffic
     * @param downFlow downstream traffic
     */
    public void set(Long upFlow,Long downFlow) {
        this.upFlow = upFlow;
        this.downFlow = downFlow;
        this.sumFlow = upFlow + downFlow;
    }

    public String getPhone() {
        return phone;
    }

    public void setPhone(String phone) {
        this.phone = phone;
    }

    public Long getUpFlow() {
        return upFlow;
    }

    public void setUpFlow(Long upFlow) {
        this.upFlow = upFlow;
    }

    public Long getDownFlow() {
        return downFlow;
    }

    public void setDownFlow(Long downFlow) {
        this.downFlow = downFlow;
    }

    public Long getSumFlow() {
        return sumFlow;
    }

    public void setSumFlow(Long sumFlow) {
        this.sumFlow = sumFlow;
    }

    @Override
    public String toString() {
        return phone + "\t" + upFlow + "\t" + downFlow + "\t" + sumFlow;
    }

    /**
     * Orders by total flow, descending. Ties are broken by phone number:
     * the combiner groups records with this sort comparator, so two
     * different users with the same total flow must not compare as equal,
     * and the tie-breaker also makes the output order deterministic.
     *
     * @param o the bean to compare against
     * @return negative if this bean sorts before {@code o}
     */
    @Override
    public int compareTo(FlowBean o) {
        int cmp = Long.compare(o.getSumFlow(), this.getSumFlow());
        if (cmp == 0) {
            // Distinct users, same total: fall back to the phone number.
            cmp = this.getPhone().compareTo(o.getPhone());
        }
        return cmp;
    }

    @Override
    public void write(DataOutput out) throws IOException {
        // NOTE(review): writeUTF throws NPE if phone was never set — the
        // mapper always sets it, but callers must do the same.
        out.writeUTF(phone);
        out.writeLong(upFlow);
        out.writeLong(downFlow);
        out.writeLong(sumFlow);
    }

    @Override
    public void readFields(DataInput in) throws IOException {
        // Must read fields in exactly the order write() emitted them.
        this.phone = in.readUTF();
        this.upFlow = in.readLong();
        this.downFlow = in.readLong();
        this.sumFlow = in.readLong();
    }
}

2.创建Mapper类,封装FlowBean。

package com.aura.hadoop.topN;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;

/**
 * @author panghu
 * @description
 * @create 2021-02-18-11:33
 */
/**
 * Parses each tab-separated input line (phone, upFlow, downFlow, total)
 * into a {@link FlowBean} map-output key; the map-output value carries
 * no information (NullWritable).
 */
public class TopNMapper extends Mapper<LongWritable,Text,FlowBean,NullWritable>{
    // Single bean reused for every record to avoid per-line allocation;
    // the framework serializes it on write, so reuse is safe.
    private final FlowBean outKey = new FlowBean();

    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        String[] fields = value.toString().split("\t");
        outKey.setPhone(fields[0]);
        // set() derives sumFlow from the up/down columns.
        outKey.set(Long.parseLong(fields[1]), Long.parseLong(fields[2]));
        context.write(outKey, NullWritable.get());
    }
}

3.创建分组比较器,将所有数据分到同一组

package com.aura.hadoop.topN;

import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;

/**
 * @author panghu
 * @description
 * @create 2021-02-18-11:48
 */
/**
 * Grouping comparator that declares every key equal, so the single
 * reduce task receives all records as one group — already sorted
 * descending by total flow — and can simply emit the first ten.
 */
public class TopNGrouping extends WritableComparator{
    public TopNGrouping() {
        // Register the key class; `true` asks the parent to instantiate
        // keys for deserialization during comparison.
        super(FlowBean.class, true);
    }

    /**
     * Always 0: every record lands in the same reduce group.
     */
    @Override
    public int compare(WritableComparable a, WritableComparable b) {
        return 0;
    }
}

4.创建reducer类,取数据前10。我们可以把此reducer类同时注册为combiner使用(combiner本质上是在map端本地运行的小reducer)。

package com.aura.hadoop.topN;

import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;
import java.util.Iterator;

/**
 * @author panghu
 * @description
 * @create 2021-02-18-11:33
 */
/**
 * Emits the first N records of its (descending-sorted) input group,
 * i.e. the top-N users by total flow. N defaults to 10 and can be
 * overridden via the {@code topn.k} configuration property.
 *
 * Also registered as the combiner, so each map task pre-filters to its
 * own top N before the shuffle.
 */
public class TopNReducer extends Reducer<FlowBean, NullWritable, FlowBean, NullWritable> {
    /** Configuration property naming how many records to keep. */
    public static final String TOP_N_KEY = "topn.k";
    private static final int DEFAULT_TOP_N = 10;

    @Override
    protected void reduce(FlowBean key, Iterable<NullWritable> values, Context context) throws IOException, InterruptedException {
        int n = context.getConfiguration().getInt(TOP_N_KEY, DEFAULT_TOP_N);
        Iterator<NullWritable> iterator = values.iterator();
        for (int i = 0; i < n && iterator.hasNext(); i++) {
            // next() must be called on every iteration: it deserializes the
            // next record's fields into `key`, so each write emits a
            // distinct user's bean rather than the same key n times.
            context.write(key, iterator.next());
        }
    }
}

5.创建driver类,指定分组比较器和combiner

package com.aura.hadoop.topN;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;

/**
 * @author panghu
 * @description
 * @create 2021-02-18-11:33
 */
/**
 * Job driver: wires mapper, combiner, grouping comparator and reducer
 * together for the top-N-by-total-flow job.
 *
 * Usage: TopNDriver [inputPath [outputPath]] — falls back to the
 * original hard-coded local paths when arguments are omitted.
 */
public class TopNDriver {
    private static final String DEFAULT_INPUT = "D:\\data\\hadoopdata\\topN";
    private static final String DEFAULT_OUTPUT = "D:\\data\\out\\topN_out";

    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        Job job = Job.getInstance(new Configuration());

        job.setJarByClass(TopNDriver.class);

        job.setMapperClass(TopNMapper.class);
        job.setReducerClass(TopNReducer.class);
        // Group every key together so the single reducer sees one group
        // holding all records in descending sumFlow order.
        job.setGroupingComparatorClass(TopNGrouping.class);
        // Run the reducer as a combiner: each map task ships only its own
        // top records, shrinking the data shuffled to the reducer.
        job.setCombinerClass(TopNReducer.class);

        job.setMapOutputKeyClass(FlowBean.class);
        job.setMapOutputValueClass(NullWritable.class);
        // Fix: the final output classes must be declared as well — without
        // them the job falls back to the LongWritable/Text defaults even
        // though the reducer emits FlowBean/NullWritable.
        job.setOutputKeyClass(FlowBean.class);
        job.setOutputValueClass(NullWritable.class);

        // Prefer command-line paths; keep the original defaults otherwise.
        String input = args.length > 0 ? args[0] : DEFAULT_INPUT;
        String output = args.length > 1 ? args[1] : DEFAULT_OUTPUT;
        FileInputFormat.setInputPaths(job, new Path(input));
        FileOutputFormat.setOutputPath(job, new Path(output));

        boolean b = job.waitForCompletion(true);
        System.exit(b ? 0 : 1);
    }
}

  • 1
    点赞
  • 2
    收藏
    觉得还不错? 一键收藏
  • 4
    评论
评论 4
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值