MapReduce 序列化、二次排序、自定义分区、虚拟存储切片(CombineTextInputFormat)——综合案例

AdamShyly

已于 2022-04-11 17:21:53 修改

阅读量961

点赞数

分类专栏： hadoop 文章标签： hadoop

于 2022-04-10 02:58:04 首次发布

本文链接：https://blog.csdn.net/Adam_captain/article/details/124071857

版权

hadoop 专栏收录该内容

26 篇文章 0 订阅

订阅专栏

FlowBean.java

package com.atguigu.mapreduce.partitionerandWritableComparable;

import org.apache.hadoop.io.WritableComparable;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

/**
 * Traffic record used as a sortable, Hadoop-serializable key.
 *
 * <p>Steps followed by this class:
 * <ol>
 *   <li>implement {@code WritableComparable} (serialization plus ordering);</li>
 *   <li>override the serialization and deserialization methods;</li>
 *   <li>provide a no-arg constructor;</li>
 *   <li>override {@code toString} for text output.</li>
 * </ol>
 */
public class FlowBean implements WritableComparable<FlowBean> {

    private long upFlow;    // upstream traffic
    private long downFlow;  // downstream traffic
    private long sumFlow;   // total traffic; only refreshed by setSumFlow()

    /** No-arg constructor; all counters start at zero. */
    public FlowBean() {
    }

    /** @return the upstream traffic */
    public long getUpFlow() {
        return upFlow;
    }

    /** @param upFlow the upstream traffic to store */
    public void setUpFlow(long upFlow) {
        this.upFlow = upFlow;
    }

    /** @return the downstream traffic */
    public long getDownFlow() {
        return downFlow;
    }

    /** @param downFlow the downstream traffic to store */
    public void setDownFlow(long downFlow) {
        this.downFlow = downFlow;
    }

    /** @return the total traffic as last computed by {@link #setSumFlow()} or read by {@link #readFields} */
    public long getSumFlow() {
        return sumFlow;
    }

    /**
     * Recomputes the total as {@code upFlow + downFlow}.
     * Must be called after both setters, otherwise the total is stale.
     */
    public void setSumFlow() {
        this.sumFlow = this.upFlow + this.downFlow;
    }

    /**
     * Serializes the three counters in the order up, down, sum.
     *
     * @param out sink to write to
     * @throws IOException if the underlying stream fails
     */
    @Override
    public void write(DataOutput out) throws IOException {
        out.writeLong(upFlow);
        out.writeLong(downFlow);
        out.writeLong(sumFlow);
    }

    /**
     * Restores the three counters; the read order must mirror {@link #write}.
     *
     * @param in source to read from
     * @throws IOException if the underlying stream fails
     */
    @Override
    public void readFields(DataInput in) throws IOException {
        this.upFlow = in.readLong();
        this.downFlow = in.readLong();
        this.sumFlow = in.readLong();
    }

    /** @return the counters as {@code up<TAB>down<TAB>sum} */
    @Override
    public String toString() {
        return upFlow + "\t" + downFlow + "\t" + sumFlow;
    }

    /**
     * Orders beans by total traffic descending, then by upstream traffic ascending.
     *
     * <p>Both branches of the tie-break compare {@code upFlow}, so the ordering is
     * antisymmetric: {@code a.compareTo(b)} and {@code b.compareTo(a)} always have
     * opposite signs (or are both zero).
     *
     * @param o the bean to compare against
     * @return negative, zero or positive per the {@link Comparable} contract
     */
    @Override
    public int compareTo(FlowBean o) {
        // Arguments are swapped on purpose: larger totals sort first.
        int bySum = Long.compare(o.sumFlow, this.sumFlow);
        if (bySum != 0) {
            return bySum;
        }
        // Equal totals: smaller upstream traffic sorts first.
        return Long.compare(this.upFlow, o.upFlow);
    }
}

FlowDriver.java

package com.atguigu.mapreduce.partitionerandWritableComparable;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.CombineTextInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;

/**
 * Driver that configures and submits the "flowCal" job: {@link FlowMapper} emits
 * {@code (FlowBean, Text)} pairs, {@link ProvincePartitioner2} routes them to
 * reducers, and {@link FlowReducer} writes {@code (Text, FlowBean)} records.
 */
public class FlowDriver {

    /** Input directory used when no paths are passed on the command line. */
    private static final String DEFAULT_INPUT_PATH = "D:\\hadoop\\output4";

    /** Output directory used when no paths are passed on the command line. */
    private static final String DEFAULT_OUTPUT_PATH = "D:\\hadoop\\output6";

    /** Maximum combined (virtual storage) split size: 4 MiB. */
    private static final long MAX_SPLIT_SIZE_BYTES = 4L * 1024 * 1024;

    /** One reducer per partition index returned by {@link ProvincePartitioner2} (0..4). */
    private static final int NUM_REDUCE_TASKS = 5;

    /**
     * Builds the job and blocks until it finishes, exiting with 0 on success and 1 on failure.
     *
     * @param args optional {@code <input path> <output path>}; when fewer than two
     *             arguments are given the built-in default paths are used
     * @throws IOException            if job setup or submission fails
     * @throws InterruptedException   if waiting for the job is interrupted
     * @throws ClassNotFoundException if a configured job class cannot be loaded
     */
    public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException {

        boolean pathsGiven = args != null && args.length >= 2;
        String inputPath = pathsGiven ? args[0] : DEFAULT_INPUT_PATH;
        String outputPath = pathsGiven ? args[1] : DEFAULT_OUTPUT_PATH;

        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf, "flowCal");

        job.setJarByClass(FlowDriver.class);

        job.setMapperClass(FlowMapper.class);
        job.setReducerClass(FlowReducer.class);

        // The bean is the map-output key so that records are ordered by FlowBean.compareTo.
        job.setMapOutputKeyClass(FlowBean.class);
        job.setMapOutputValueClass(Text.class);

        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(FlowBean.class);

        // Without an explicit InputFormat the job would use TextInputFormat.
        job.setInputFormatClass(CombineTextInputFormat.class);
        CombineTextInputFormat.setMaxInputSplitSize(job, MAX_SPLIT_SIZE_BYTES);

        job.setPartitionerClass(ProvincePartitioner2.class);
        job.setNumReduceTasks(NUM_REDUCE_TASKS);

        FileInputFormat.setInputPaths(job, new Path(inputPath));
        FileOutputFormat.setOutputPath(job, new Path(outputPath));

        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }

}

FlowMapper.java

package com.atguigu.mapreduce.partitionerandWritableComparable;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;

/**
 * Mapper that parses one tab-separated line into a {@link FlowBean} key and a
 * {@link Text} value holding the line's first column.
 *
 * <p>Column 0 becomes the output value; columns 1 and 2 are parsed as the
 * upstream and downstream counters, and the total is derived from them.
 */
public class FlowMapper extends Mapper<LongWritable, Text, FlowBean, Text> {

    // Output holders are allocated once and overwritten on every map() call.
    private final FlowBean outK = new FlowBean();
    private final Text outV = new Text();

    /**
     * Emits {@code (bean, firstColumn)} for the given input line.
     *
     * @param key     input key supplied by the input format (unused)
     * @param value   one line of input, columns separated by tabs
     * @param context sink for the output pair
     * @throws IOException          if writing the output fails
     * @throws InterruptedException if the task is interrupted while writing
     */
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        String[] fields = value.toString().split("\t");

        outV.set(fields[0]);

        long up = Long.parseLong(fields[1]);
        outK.setUpFlow(up);
        long down = Long.parseLong(fields[2]);
        outK.setDownFlow(down);
        // Derive the total only after both counters are in place.
        outK.setSumFlow();

        context.write(outK, outV);
    }
}

FlowReducer.java

package com.atguigu.mapreduce.partitionerandWritableComparable;

import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;

/**
 * Reducer that swaps key and value: every {@link Text} value received for a
 * {@link FlowBean} key is written out as {@code (value, key)}.
 */
public class FlowReducer extends Reducer<FlowBean, Text, Text, FlowBean> {

    /**
     * Writes one output record per incoming value.
     *
     * @param key     the traffic bean the values were grouped under
     * @param values  the values associated with {@code key}
     * @param context sink for the output records
     * @throws IOException          if writing the output fails
     * @throws InterruptedException if the task is interrupted while writing
     */
    @Override
    protected void reduce(FlowBean key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
        for (Text phone : values) {
            // Value goes first so the output line starts with it, followed by the bean.
            context.write(phone, key);
        }
    }

}

ProvincePartitioner2.java

package com.atguigu.mapreduce.partitionerandWritableComparable;

import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Partitioner;

/**
 * Partitioner that routes records by the first three characters of the value
 * (treated as a phone number): 136 to 0, 137 to 1, 138 to 2, 139 to 3, and
 * everything else to 4.
 *
 * <p>{@code numPartitions} is not consulted; the returned index is always in
 * the range 0..4, so the job is expected to run with at least five partitions.
 */
public class ProvincePartitioner2 extends Partitioner<FlowBean, Text> {

    /** Number of leading characters that select the partition. */
    private static final int PREFIX_LENGTH = 3;

    /** Catch-all partition for unknown or too-short values. */
    private static final int OTHER_PARTITION = 4;

    /**
     * Chooses the partition for one map-output record.
     *
     * @param flowBean      the record key (unused)
     * @param text          the record value whose prefix selects the partition
     * @param numPartitions total number of partitions (unused)
     * @return a partition index between 0 and 4
     */
    @Override
    public int getPartition(FlowBean flowBean, Text text, int numPartitions) {

        String phone = text.toString();

        // Values shorter than the prefix cannot match any known prefix; send them
        // to the catch-all partition instead of failing in substring().
        if (phone.length() < PREFIX_LENGTH) {
            return OTHER_PARTITION;
        }

        switch (phone.substring(0, PREFIX_LENGTH)) {
            case "136":
                return 0;
            case "137":
                return 1;
            case "138":
                return 2;
            case "139":
                return 3;
            default:
                return OTHER_PARTITION;
        }
    }

}

源数据文件

输出文件

AdamShyly

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
MapReduce 序列化、二次排序、自定义分区、虚拟存储切片(CombineTextInputFormat)——综合案例

FlowBean.java package com.atguigu.mapreduce.partitionerandWritableComparable; import org.apache.hadoop.io.WritableComparable; import java.io.DataInput; import java.io.DataOutput; import java.io.IOException; /** * 1. 定义类实现Writable接口 * 2. 重写序列化和反序列…
复制链接

扫一扫

专栏目录