MapReduce: Partitioning with a Custom Partitioner

This post shows how to build a custom partitioner, ProvincePartitioner, in Hadoop MapReduce that partitions records by the first three digits of a phone number, and how to set a matching number of ReduceTasks. The Mapper parses the input and the Reducer aggregates the traffic totals.

Approach

  1. Create a custom class that extends Partitioner and override its getPartition() method (the default behavior it replaces is sketched right after this list)
  2. Register the custom Partitioner on the Job in the driver
  3. Set the number of ReduceTasks to match the number of partitions the custom Partitioner produces
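
For reference, Hadoop's default partitioner is HashPartitioner, which spreads keys across ReduceTasks by hash code. Its getPartition() is essentially the following (a minimal sketch of the stock behavior, shown here for contrast and not part of the original code):

import org.apache.hadoop.mapreduce.Partitioner;

public class HashPartitioner<K, V> extends Partitioner<K, V> {
    @Override
    public int getPartition(K key, V value, int numReduceTasks) {
        // Mask off the sign bit so the modulo result is never negative
        return (key.hashCode() & Integer.MAX_VALUE) % numReduceTasks;
    }
}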

ProvincePartitioner

package com.mingyu.mapreduce.partitioner2;

import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Partitioner;

public class ProvincePartitioner extends Partitioner<Text,FlowBean> {
    @Override
    public int getPartition(Text text, FlowBean flowBean, int numPartitions) {

        // The key is the phone number
        String phone = text.toString();

        // The first three digits decide the partition
        String prePhone = phone.substring(0, 3);

        int partition;

        if ("136".equals(prePhone)) {
            partition = 0;
        } else if ("137".equals(prePhone)) {
            partition = 1;
        } else if ("138".equals(prePhone)) {
            partition = 2;
        } else if ("139".equals(prePhone)) {
            partition = 3;
        } else {
            partition = 4;
        }

        return partition;
    }
}
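
Note that getPartition() here returns 0 through 4, so the driver below sets five ReduceTasks to match. As standard Hadoop behavior: if the ReduceTask count exceeds the number of partitions actually used, the surplus part-r-* output files are simply empty; if it is greater than 1 but smaller than the highest partition number plus one, records routed to a nonexistent partition make the job fail with an Exception; and with exactly one ReduceTask the partitioner is bypassed and all output lands in a single part-r-00000.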

Driver

package com.mingyu.mapreduce.partitioner2;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;

public class FlowDriver {

    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {

        // 1. Get the Job instance
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);

        // 2. Set the jar by the driver class
        job.setJarByClass(FlowDriver.class);

        // 3. Wire up the Mapper and Reducer
        job.setMapperClass(FlowMapper.class);
        job.setReducerClass(FlowReducer.class);

        // 4. Set the Mapper output key and value types
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(FlowBean.class);

        // 5. Set the final output key and value types
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(FlowBean.class);

        // Register the custom partitioner and match the ReduceTask count
        // to the five partitions it produces
        job.setPartitionerClass(ProvincePartitioner.class);
        job.setNumReduceTasks(5);

        // 6. Set the input and output paths
        FileInputFormat.setInputPaths(job, new Path("D:\\QQ_file\\bigdatda\\note_info\\hadoop3.3\\ziliao\\11_input\\inputflow"));
        FileOutputFormat.setOutputPath(job, new Path("D:\\QQ_file\\bigdatda\\note_info\\hadoop3.3\\output\\output_partition_phone"));

        // 7. Submit the job and wait for completion
        boolean result = job.waitForCompletion(true);

        System.exit(result ? 0 : 1);
    }
}
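
With five ReduceTasks, a successful run leaves five output files, part-r-00000 through part-r-00004, holding the 136, 137, 138, and 139 numbers and the catch-all partition respectively. The hard-coded Windows paths imply a local-mode run; on a cluster they would point into HDFS instead.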

FlowBean

package com.mingyu.mapreduce.partitioner2;

import org.apache.hadoop.io.Writable;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

/**
 * 1. The class implements the Writable interface
 * 2. Override the serialization and deserialization methods
 * 3. Provide a no-arg constructor (the framework instantiates the bean by reflection)
 * 4. Override toString() to control the output format
 */
public class FlowBean implements Writable {
    private long upFlow;
    private long downFlow;
    private long sumFlow;

    public FlowBean() {
    }

    @Override
    public void write(DataOutput out) throws IOException {
        out.writeLong(upFlow);
        out.writeLong(downFlow);
        out.writeLong(sumFlow);
    }

    @Override
    public void readFields(DataInput in) throws IOException {
        // Read the fields in the same order write() serialized them
        this.upFlow = in.readLong();
        this.downFlow = in.readLong();
        this.sumFlow = in.readLong();
    }

    public long getUpFlow() {
        return upFlow;
    }

    public void setUpFlow(long upFlow) {
        this.upFlow = upFlow;
    }

    public long getDownFlow() {
        return downFlow;
    }

    public void setDownFlow(long downFlow) {
        this.downFlow = downFlow;
    }

    public long getSumFlow() {
        return sumFlow;
    }

    public void setSumFlow() {
        this.sumFlow = this.upFlow + this.downFlow;
    }

    @Override
    public String toString() {
        return upFlow + "\t" + downFlow + "\t" + sumFlow;
    }
}
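
One detail worth calling out: readFields() must read the fields in exactly the order write() wrote them, because Writable serialization carries no field names. FlowBean only needs Writable here because it is used as a value; if it were ever used as a map output key it would have to implement WritableComparable so the shuffle could sort it.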

Mapper

package com.mingyu.mapreduce.partitioner2;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;

public class FlowMapper extends Mapper<LongWritable, Text,Text, FlowBean> {

    private Text outK = new Text();
    private FlowBean outV = new FlowBean();

    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {

        // 1. Read one line of input
        String line = value.toString();

        // 2. Split the line on tabs
        String[] split = line.split("\t");

        // 3. Extract the phone number and the traffic fields
        //    (counted from the end of the line)
        String phone = split[1];
        String upFlow = split[split.length - 3];
        String downFlow = split[split.length - 2];

        // 4. Populate the output key and value
        outK.set(phone);
        outV.setUpFlow(Long.parseLong(upFlow));
        outV.setDownFlow(Long.parseLong(downFlow));
        outV.setSumFlow();

        // 5. Emit the key/value pair
        context.write(outK, outV);
    }
}
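
For concreteness, a hypothetical tab-separated input line (the exact dataset layout is an assumption, not given in the original post) might look like:

1	13736230513	192.196.100.1	www.example.com	2481	24681	200

Here split[1] is the phone number, split[split.length - 3] the upstream traffic (2481), and split[split.length - 2] the downstream traffic (24681); indexing from the end keeps the extraction correct even when optional middle fields, such as the visited URL, are missing.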

Reducer

package com.mingyu.mapreduce.partitioner2;

import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;

public class FlowReducer extends Reducer<Text, FlowBean,Text, FlowBean> {

    private FlowBean outV = new FlowBean();

    @Override
    protected void reduce(Text key, Iterable<FlowBean> values, Context context) throws IOException, InterruptedException {

        // Accumulate upstream and downstream traffic for this phone number
        long totalUp = 0;
        long totalDown = 0;
        for (FlowBean value : values) {
            totalUp += value.getUpFlow();
            totalDown += value.getDownFlow();
        }

        // Populate the reusable output bean and emit the totals
        outV.setUpFlow(totalUp);
        outV.setDownFlow(totalDown);
        outV.setSumFlow();

        context.write(key, outV);
    }
}
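
Continuing the hypothetical line above, the ReduceTask handling the 137 partition would emit, via FlowBean.toString(), a record like:

13736230513	2481	24681	27162

that is, the phone number followed by the tab-separated upstream, downstream, and total traffic.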