1. Write three classes: Mapper, Reducer, and Driver (three versions of the program)
(1) wordCount_FlowBean
Mapper, Reducer, FlowBean;
(2) wordCount_FlowBean&partition
Mapper, Reducer, FlowBean, Partitioner;
(3) wordCount_FlowBean&partition&comparable
Mapper, Reducer, FlowBean, Partitioner; additionally, override the compareTo method in FlowBean;
2. The Mapper
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import java.io.IOException;
public class FlowMapper extends Mapper<LongWritable, Text, Text, FlowBean> {
    private Text outK = new Text();
    private FlowBean outV = new FlowBean();

    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        //1. Read one line, e.g.:
        // 1 13736230513 192.196.100.1 www.atguigu.com 2481 24681 200
        String line = value.toString();
        //2. Split on tabs
        String[] words = line.split("\t");
        //3. Extract the fields we need: phone number, upstream traffic, downstream traffic
        String phone = words[1];
        String up = words[words.length - 3];
        String down = words[words.length - 2];
        //4. Populate the output key and value
        outK.set(phone);
        outV.setUpFlow(Long.parseLong(up));
        outV.setDownFlow(Long.parseLong(down));
        outV.setSumFlow();
        //5. Write out
        context.write(outK, outV);
    }
}
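The upstream/downstream fields are indexed from the end of the array rather than the front, most likely because some records in the sample dataset omit the domain column, so counting from the end keeps the indices stable. A minimal standalone sketch of the extraction logic (the class name is hypothetical; the sample line is the tab-separated record from the comment above):

public class FieldIndexDemo {
    public static void main(String[] args) {
        String line = "1\t13736230513\t192.196.100.1\twww.atguigu.com\t2481\t24681\t200";
        String[] words = line.split("\t");
        System.out.println(words[1]);                 // 13736230513 (phone)
        System.out.println(words[words.length - 3]);  // 2481 (upstream)
        System.out.println(words[words.length - 2]);  // 24681 (downstream)
    }
}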
3. The Reducer
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
import java.io.IOException;
public class FlowReducer extends Reducer<Text, FlowBean, Text, FlowBean> {
    private FlowBean outV = new FlowBean();

    @Override
    protected void reduce(Text key, Iterable<FlowBean> values, Context context) throws IOException, InterruptedException {
        //1. Iterate over the values and accumulate the totals.
        // Note: Hadoop reuses the FlowBean instance behind this iterator, so we only
        // read primitive values out of it; never hold a reference to `value` itself.
        long totalUp = 0;
        long totalDown = 0;
        for (FlowBean value : values) {
            totalUp += value.getUpFlow();
            totalDown += value.getDownFlow();
        }
        //2. Populate the output value
        outV.setUpFlow(totalUp);
        outV.setDownFlow(totalDown);
        outV.setSumFlow();
        //3. Write out
        context.write(key, outV);
    }
}
4. The Driver
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import java.io.IOException;
public class FlowDriver {
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        //1. Get the Job instance
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);
        //2. Set the jar
        job.setJarByClass(FlowDriver.class);
        //3. Wire up the Mapper and Reducer
        job.setMapperClass(FlowMapper.class);
        job.setReducerClass(FlowReducer.class);
        //4. Set the Mapper's output key and value types
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(FlowBean.class);
        //5. Set the final output key and value types
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(FlowBean.class);
        //6. Set the input and output paths
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        //7. Submit the job
        boolean result = job.waitForCompletion(true);
        System.exit(result ? 0 : 1);
    }
}
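To run the packaged job (the jar name and paths here are hypothetical), submit it with the hadoop command; the two arguments become args[0] and args[1] above, and the output directory must not already exist, or the job fails with an "output directory already exists" error:

hadoop jar flow.jar FlowDriver /input/phone_data.txt /output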
5. FlowBean
import org.apache.hadoop.io.Writable;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
/*
 * 1. Define a class that implements the Writable interface
 * 2. Override the serialization and deserialization methods
 * 3. Provide a no-arg constructor
 * 4. Override toString
 */
public class FlowBean implements Writable {
    private long upFlow;   // upstream traffic
    private long downFlow; // downstream traffic
    private long sumFlow;  // total traffic

    public long getUpFlow() {
        return upFlow;
    }

    public void setUpFlow(long upFlow) {
        this.upFlow = upFlow;
    }

    public long getDownFlow() {
        return downFlow;
    }

    public void setDownFlow(long downFlow) {
        this.downFlow = downFlow;
    }

    public long getSumFlow() {
        return sumFlow;
    }

    public void setSumFlow(long sumFlow) {
        this.sumFlow = sumFlow;
    }

    public void setSumFlow() {
        this.sumFlow = this.upFlow + this.downFlow;
    }

    // No-arg constructor, required so Hadoop can instantiate the bean during deserialization
    public FlowBean() {
    }

    /*
     * Serialization and deserialization must handle the fields in exactly the same order.
     */
    @Override
    public void write(DataOutput out) throws IOException { // serialize
        out.writeLong(upFlow);
        out.writeLong(downFlow);
        out.writeLong(sumFlow);
    }

    @Override
    public void readFields(DataInput in) throws IOException { // deserialize
        this.upFlow = in.readLong();
        this.downFlow = in.readLong();
        this.sumFlow = in.readLong();
    }

    @Override
    public String toString() {
        return upFlow + "\t" + downFlow + "\t" + sumFlow;
    }
}
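The toString method determines how each record is rendered in the reducer's output file. For the single sample record from the Mapper comment, the output line would be (with only one record for the key, the totals are just that record's own values; 2481 + 24681 = 27162):

13736230513	2481	24681	27162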
6. The Partitioner
(1)Partitioner
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Partitioner;
public class ProvincePartitioner extends Partitioner<Text, FlowBean> {
    @Override
    public int getPartition(Text text, FlowBean flowBean, int numPartitions) {
        // text is the phone number; partition by its 3-digit prefix
        String phone = text.toString();
        String prePhone = phone.substring(0, 3);
        int partition;
        if ("136".equals(prePhone)) {
            partition = 0;
        } else if ("137".equals(prePhone)) {
            partition = 1;
        } else if ("138".equals(prePhone)) {
            partition = 2;
        } else if ("139".equals(prePhone)) {
            partition = 3;
        } else {
            partition = 4;
        }
        return partition;
    }
}
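For contrast, the default partitioner is HashPartitioner, which spreads keys across ReduceTasks by hash code; its core logic is a one-liner:

// Default HashPartitioner logic, for comparison:
// return (key.hashCode() & Integer.MAX_VALUE) % numReduceTasks;

The custom class above replaces that hash with a fixed prefix-to-partition mapping, so records for each phone prefix land in a predictable output file.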
(2) Add the following to the driver:
job.setPartitionerClass(ProvincePartitioner.class);
job.setNumReduceTasks(n); // n = number of partitions
(3) Partitioning rules (illustrated in the sketch below):
a. If the number of ReduceTasks > the number of partitions getPartition produces, the extra ReduceTasks simply emit empty output files part-r-000xx;
b. If 1 < number of ReduceTasks < number of partitions, some partition data has no ReduceTask to receive it, and the job throws an Exception;
c. If the number of ReduceTasks = 1, then no matter how many partitions the MapTask side produces, everything goes to that single ReduceTask, and only one result file, part-r-00000, is created;
d. Partition numbers must start at 0 and increase consecutively;
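A sketch of how these rules play out with the 5-way ProvincePartitioner above (file names assume the default output naming):

job.setNumReduceTasks(5);   // one file per partition: part-r-00000 .. part-r-00004
// job.setNumReduceTasks(8);   // rule a: three extra empty files part-r-00005 .. part-r-00007
// job.setNumReduceTasks(3);   // rule b: partitions 3 and 4 have no ReduceTask -> exception
// job.setNumReduceTasks(1);   // rule c: partitioner bypassed, single file part-r-00000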
7. Comparable
Have FlowBean implement WritableComparable and override the compareTo method:
@Override
public int compareTo(FlowBean o) {
    // Sort by total traffic in descending order; break ties by upstream traffic, also descending
    if (this.sumFlow > o.sumFlow) {
        return -1;
    } else if (this.sumFlow < o.sumFlow) {
        return 1;
    } else {
        if (this.upFlow > o.upFlow) {
            return -1;
        } else if (this.upFlow < o.upFlow) {
            return 1;
        } else {
            return 0;
        }
    }
}
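Since MapReduce sorts on map output keys, in this variant FlowBean would serve as the Mapper's output key rather than the value. The declaration change itself is small, because WritableComparable extends Writable (a sketch; everything else stays as in section 5):

import org.apache.hadoop.io.WritableComparable;

public class FlowBean implements WritableComparable<FlowBean> {
    // fields, getters/setters, write(), readFields(), and toString()
    // unchanged from section 5, plus the compareTo method above
}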
Based on the Atguigu (尚硅谷) Hadoop video course. Original video: http://www.atguigu.com/