12 MapReduce partitioning: Partitioner

Business scenarios for partitioning

When producing statistics we often need more than a single grand total; the data may also have to be summarized along different dimensions. This is what the SQL GROUP BY clause does: first group the records, then aggregate within each group.

The Partitioner facility that MapReduce provides serves the same purpose: the data is first split into partitions, and the statistics are then computed within each partition.

In MapReduce, partition numbers start at 0. Each partition corresponds to one ReduceTask (essentially a thread), and each ReduceTask writes one result file named part-r-0000x.

Characteristics of partitions

1. In MapReduce, partition numbers start at 0 and increase by 1.
2. Each partition needs its own ReduceTask (a thread spawned by the Reduce phase), and each ReduceTask produces one result file.
3. If no partitioner is specified, the default partitioner class is HashPartitioner: it takes the key's hashCode, turns it into a non-negative integer, and takes the remainder modulo the configured number of ReduceTasks.
The default numReduceTasks is 1, so the remainder is always 0, every record falls into the same partition, and only one result file is produced.

public class HashPartitioner<K2, V2> implements Partitioner<K2, V2> {

  public void configure(JobConf job) {}

  /** Use {@link Object#hashCode()} to partition. */
  public int getPartition(K2 key, V2 value,
                          int numReduceTasks) {
     "求&的原因是将hashCode值转成一个整数"
    return (key.hashCode() & Integer.MAX_VALUE) % numReduceTasks;
  }

}
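As a quick illustration of that formula (a minimal standalone sketch, not part of the Hadoop source; the sample keys are made up), this is how keys would be assigned to partitions when 3 ReduceTasks are configured:

public class HashPartitionDemo {
    public static void main(String[] args) {
        int numReduceTasks = 3;
        for (String key : new String[]{"zhang", "wang", "li"}) {
            // same formula as HashPartitioner: mask the sign bit, then take the remainder
            int partition = (key.hashCode() & Integer.MAX_VALUE) % numReduceTasks;
            System.out.println(key + " -> partition " + partition);
        }
    }
}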

Custom partitioner class

1) Extend the Partitioner class and override the partitioning rule:
   the number returned by getPartition decides which partition a record goes to.
2) Register the partitioner class in the Driver (see the sketch right after this list).
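A compressed sketch of the two steps (the names below are placeholders, not part of this project; the complete, working example follows in the test case):

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Partitioner;

// Step 1: extend Partitioner; the generics are the map output key and value types
public class MyPartitioner extends Partitioner<Text, IntWritable> {
    @Override
    public int getPartition(Text key, IntWritable value, int numPartitions) {
        // return a value in [0, numPartitions); it decides which ReduceTask receives the record
        return 0;
    }
}

// Step 2: register it in the Driver
//   job.setPartitionerClass(MyPartitioner.class);
//   job.setNumReduceTasks(n);   // one ReduceTask per partition the partitioner can return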

Test case

Requirement: count traffic usage not only per person, but also per region.
Analysis: the records therefore cannot be aggregated in one go; they first have to be partitioned by region, and each partition is then processed by its own ReduceTask.

Code of the project before the partitioner is added
package flow;

import org.apache.hadoop.io.Writable;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

public class Flow implements Writable {
    private String phone = "";
    private String city = "";
    private String name = "";
    private int flow;

    public String getPhone() {
        return phone;
    }

    public void setPhone(String phone) {
        this.phone = phone;
    }

    public String getCity() {
        return city;
    }

    public void setCity(String city) {
        this.city = city;
    }

    public String getName() {
        return name;
    }

    public void setName(String name) {
        this.name = name;
    }

    public int getFlow() {
        return flow;
    }

    public void setFlow(int flow) {
        this.flow = flow;
    }

    @Override
    public void write(DataOutput dataOutput) throws IOException {
        // serialize: write out the phone number first
        dataOutput.writeUTF(this.phone);
        dataOutput.writeUTF(this.city);
        dataOutput.writeUTF(this.name);
        dataOutput.writeInt(this.flow);
    }

    @Override
    public void readFields(DataInput dataInput) throws IOException {
        // deserialize the fields in the same order they were written
        this.phone = dataInput.readUTF();
        this.city = dataInput.readUTF();
        this.name = dataInput.readUTF();
        this.flow = dataInput.readInt();
    }
}

package flow;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;

public class SerialFlowMapper extends Mapper<LongWritable, Text,Text,Flow> {
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        // input line format: phone city name flow
        String[] s = value.toString().split(" ");
        Flow f = new Flow();
        f.setPhone(s[0]);
        f.setCity(s[1]);
        f.setName(s[2]);
        f.setFlow(Integer.parseInt(s[3]));
        context.write(new Text(f.getName()),f);
    }
}

package flow;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;

public class SerialFlowReducer extends Reducer<Text,Flow,Text, IntWritable> {
    @Override
    protected void reduce(Text key, Iterable<Flow> values, Context context) throws IOException, InterruptedException {
        int flow = 0;
        for(Flow f : values){
            flow += f.getFlow();
        }
        context.write(key,new IntWritable(flow));
    }
}

package flow;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;

public class SerialFlowDriver {
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);
        job.setJarByClass(SerialFlowDriver.class);
        job.setMapperClass(SerialFlowMapper.class);
        job.setReducerClass(SerialFlowReducer.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Flow.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        FileInputFormat.addInputPath(job,new Path("hdfs://hadoop01:9000/txt/flow.txt"));
        FileOutputFormat.setOutputPath(job,new Path("hdfs://hadoop01:9000/result/flow"));
        job.waitForCompletion(true);
    }
}
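Because this Driver sets neither a partitioner class nor a ReduceTask count, the defaults described above apply: HashPartitioner with numReduceTasks = 1, so every record lands in the same partition and the job produces a single result file, part-r-00000.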

Adding a custom partitioner

This makes it possible to aggregate traffic usage by region as well.

package flow;

import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Partitioner;
// generic parameters: the map output key and value types
public class CityPartitioner extends Partitioner<Text,Flow> {
    @Override
    public int getPartition(Text text, Flow flow, int numPartitions) {
        String city = flow.getCity();
        if("bj".equals(city))
            return 0;
        if("sh".equals(city))
            return 1;
        return 2;
    }
}
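Note the fall-through: every city other than bj and sh ends up in partition 2, so all unknown regions share the third ReduceTask.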

Modify the Driver

Set the partitioner class
Set the number of ReduceTasks

package flow;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;

public class PartFlowDriver {
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {

        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);

        job.setJarByClass(PartFlowDriver.class);
        job.setMapperClass(SerialFlowMapper.class);
        job.setReducerClass(SerialFlowReducer.class);

        // register the custom partitioner
        job.setPartitionerClass(CityPartitioner.class);
        // every partition needs its own ReduceTask,
        // so with the 3 partitions defined above, 3 ReduceTasks are required
        job.setNumReduceTasks(3);

        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Flow.class);

        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        FileInputFormat.addInputPath(job,
                new Path("hdfs://hadoop01:9000/txt/flow.txt"));
        FileOutputFormat.setOutputPath(job,
                new Path("hdfs://hadoop01:9000/result/partflow"));

        job.waitForCompletion(true);
    }
}
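With the three partitions returned by CityPartitioner, this job should write three result files, part-r-00000, part-r-00001 and part-r-00002, holding the per-name totals for bj, sh and all remaining cities respectively.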

Summing each student's total score by month

Input directory: score1
Map output key-value pair: student name, Score object

Encapsulate a model class

The Score object

package partiScore;

import org.apache.hadoop.io.Writable;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

public class Score implements Writable {
    private int month;
    private String name = "";
    private int score;

    public int getMonth() {
        return month;
    }

    public void setMonth(int month) {
        this.month = month;
    }

    public String getName() {
        return name;
    }

    public void setName(String name) {
        this.name = name;
    }

    public int getScore() {
        return score;
    }

    public void setScore(int score) {
        this.score = score;
    }

    @Override
    public void write(DataOutput dataOutput) throws IOException {
        dataOutput.writeInt(this.month);
        dataOutput.writeUTF(this.name);
        dataOutput.writeInt(this.score);
    }

    @Override
    public void readFields(DataInput dataInput) throws IOException {
        this.month = dataInput.readInt();
        this.name = dataInput.readUTF();
        this.score = dataInput.readInt();
    }
}

Mapper
package partiScore;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;

public class SumMapper extends Mapper<LongWritable, Text,Text,Score> {
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        // input line format: 1 zhang 89  (month name score)
        String[] s = value.toString().split(" ");
        Score score = new Score();
        score.setMonth(Integer.parseInt(s[0]));
        score.setName(s[1]);
        score.setScore(Integer.parseInt(s[2]));
        context.write(new Text(score.getName()),score);
    }
}

Reducer
package partiScore;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;

public class SumReducer extends Reducer<Text,Score, Text, IntWritable> {
    @Override
    protected void reduce(Text key, Iterable<Score> values, Context context) throws IOException, InterruptedException {
        int sum = 0;
        for(Score s:values){
            sum+=s.getScore();
        }
        context.write(key,new IntWritable(sum));
    }
}

Partitioner
package partiScore;

import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Partitioner;

public class MonthPartitioner extends Partitioner<Text,Score> {

    @Override
    public int getPartition(Text text, Score score, int numPartitions) {
        return score.getMonth()-1;
    }
}
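getPartition returns month - 1, which assumes the input only contains months 1 through 3, so the result stays inside the 3 ReduceTasks configured in the Driver below. A quick sanity check (a minimal sketch; MonthPartitionerCheck and the key text are made up for illustration):

package partiScore;

import org.apache.hadoop.io.Text;

public class MonthPartitionerCheck {
    public static void main(String[] args) {
        Score s = new Score();
        s.setMonth(2);
        // month 2 -> partition 1 (month 1 -> 0, month 3 -> 2)
        System.out.println(new MonthPartitioner().getPartition(new Text("zhang"), s, 3));
    }
}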

Driver
package partiScore;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;

public class SumDriver {
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        Job job= Job.getInstance(new Configuration());
        job.setJarByClass(SumDriver.class);
        job.setMapperClass(SumMapper.class);
        job.setReducerClass(SumReducer.class);
        job.setPartitionerClass(MonthPartitioner.class);
        job.setNumReduceTasks(3);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Score.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        FileInputFormat.addInputPath(job,new Path("hdfs://hadoop01:9000/txt/score1/"));
        FileOutputFormat.setOutputPath(job,new Path("hdfs://hadoop01:9000/result/partMonthScore"));
        job.waitForCompletion(true);
    }
}
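As with the flow example, this job should write one result file per month: part-r-00000 for month 1, part-r-00001 for month 2 and part-r-00002 for month 3, each holding that month's per-student totals.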
