Hadoop(12) MR Partitioner_hadoop mr partitioner 没有被执行-CSDN博客

本文链接：https://blog.csdn.net/zz657114506/article/details/52723562

回顾Map阶段五大步骤

其中，step1.3就是一个分区操作。通过前面的学习我们知道Mapper最终处理的键值对key/value，是需要送到Reducer去合并的，合并的时候，有相同key的键/值对会送到同一个Reducer节点中进行归并。哪个key到哪个Reducer的分配过程，是由Partitioner规定的。
Hadoop内置Partitioner
　　MapReduce的使用者通常会指定Reduce任务和Reduce任务输出文件的数量（R）。用户在中间key上使用分区函数来对数据进行分区，之后在输入到后续任务执行进程。一个默认的分区函数式使用hash方法（比如常见的：hash(key) mod R）进行分区。hash方法能够产生非常平衡的分区，鉴于此，Hadoop中自带了一个默认的分区类HashPartitioner，它继承了Partitioner类，提供了一个getPartition的方法，它的定义如下所示：

package org.apache.hadoop.mapreduce.lib.partition;  

import org.apache.hadoop.mapreduce.Partitioner;  

/** Partition keys by their {@link Object#hashCode()}. */
public class HashPartitioner<K, V> extends Partitioner<K, V> {
    /** Use {@link Object#hashCode()} to partition. */
    public int getPartition(K key, V value, int numReduceTasks) {
        // 默认使用key的hash值与上int的最大值，避免出现数据溢出 的情况
        return (key.hashCode() & Integer.MAX_VALUE) % numReduceTasks;
    }
} 
// 这段代码实现的目的是将key均匀分布在Reduce Tasks上，例如：如果Key为Text的话，
// Text的hashcode方法跟String的基本一致，都是采用的Horner公式计算，得到一个int整数。
// 但是，如果string太大的话这个int整数值可能会溢出变成负数，所以和整数的上限值Integer.MAX_VALUE（
// 即0111111111111111）进行与运算，然后再对reduce任务个数取余，这样就可以让key均匀分布在reduce上。

定制Partitioner
定制partitioner需继承Partitioner

package com.zz.hadoop.dc.mr;

import java.io.IOException;
import java.util.Map;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Partitioner;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import com.zz.hadoop.dc.factory.Factory;
import com.zz.hadoop.dc.po.DataInfo;

/**
 * 统计每个手机号使用的流量, 并按照手机号类型(移动、联通、电信)进行分区输出
 */
public class DataCount {

    public static void main(String[] args)
            throws IOException, ClassNotFoundException, InterruptedException {
        Configuration config = new Configuration();

        Job job = Job.getInstance(config);

        job.setJarByClass(DataCount.class);

        job.setMapperClass(DCMapper.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(DataInfo.class);
        FileInputFormat.setInputPaths(job, new Path(args[0]));

        job.setReducerClass(DCReduce.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(DataInfo.class);
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        // 设置分区类
        job.setPartitionerClass(DCPartitioner.class);

        // 设置启动reducer数量（若 reduce数量 >= partitioner数量, 则多余的分区为空;
        // 若 reduce数量 < partitioner数量, 则会出错。）
        job.setNumReduceTasks(4);

        job.waitForCompletion(true);
    }

    /** 每运行一次mapper就是一个mapper对象，不存在线程安全问题 */
    public class DCMapper extends Mapper<LongWritable, Text, Text, DataInfo> {
        // 每context.write()一次就已经序列化了, 不存在同一引用问题
        private Text k2 = new Text();

        @Override
        protected void map(LongWritable k1, Text v1, Context context)
                throws IOException, InterruptedException {
            String line = v1.toString();
            String[] fields = line.split("\t");
            // tel
            String tel = fields[1];
            // upPayLoad(上行流量)
            long up = Long.parseLong(fields[8]);
            // downPayLoad(下行流量)
            long dn = Long.parseLong(fields[9]);

            DataInfo data = new DataInfo(tel, up, dn);
            this.k2.set(tel);
            // context.write(new Text(tel), data);
            context.write(k2, data);
        }
    }

    /** Reduce */
    public class DCReduce extends Reducer<Text, DataInfo, Text, DataInfo> {
        @Override
        protected void reduce(Text k2, Iterable<DataInfo> v2s, Context context)
                throws IOException, InterruptedException {
            long up = 0;
            long dn = 0;
            for (DataInfo dataInfo : v2s) {
                up += dataInfo.getUpPayLoad();
                dn += dataInfo.getDownPayLoad();
            }
            context.write(k2, new DataInfo(k2.toString(), up, dn));
        }
    }

    /** 分区 */
    public static class DCPartitioner extends Partitioner<Text, DataInfo> {
        private static Map<String, Integer> provider = Factory.getMap();
        // 模拟移动、联通、电信分段号对应的分区码
        static {
            provider.put("138", 1);
            provider.put("139", 1);
            provider.put("155", 2);
            provider.put("156", 2);
            provider.put("180", 3);
            provider.put("181", 3);
        }

        @Override
        public int getPartition(Text k2, DataInfo v2, int numPartition) {
            String sub = k2.toString().substring(0, 3);
            Integer code = provider.get(sub);
            if (null == code) {
                code = 0;
            }

            return code;
        }
    }
}

// 封装对象(须implements Writable)
package com.zz.hadoop.dc.po;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

import org.apache.hadoop.io.Writable;

public class DataInfo implements Writable {
    private String tel;
    private long upPayLoad;
    private long downPayLoad;
    private long totalPayLoad;

    public DataInfo() {
        super();
    }

    public DataInfo(String tel, long upPayLoad, long downPayLoad) {
        super();
        this.tel = tel;
        this.upPayLoad = upPayLoad;
        this.downPayLoad = downPayLoad;
        this.totalPayLoad = upPayLoad + downPayLoad;
    }

    // ...省略get、set

    /** 注意：filed type、 filed order 依次顺序序列化、和反序列化 */
    @Override
    public void readFields(DataInput in) throws IOException {
        this.tel = in.readUTF();
        this.upPayLoad = in.readLong();
        this.downPayLoad = in.readLong();
        this.totalPayLoad = in.readLong();
    }

    @Override
    public void write(DataOutput out) throws IOException {
        out.writeUTF(this.tel);
        out.writeLong(this.upPayLoad);
        out.writeLong(this.downPayLoad);
        out.writeLong(this.totalPayLoad);
    }
}