(2) MapReduce: Sorting, Partitioning, Counters and Combiner

1. Sorting

    Text, IntWritable, and the other Writable types come with their own comparison rules.

    Hadoop also provides comparators, so the sort order can be redefined.

    How: extend the Comparator class nested inside Text (or IntWritable, etc.) and override its compare method.

    Re-sort the wordcount results in reverse dictionary order:

import org.apache.hadoop.io.Text;
/*
Overrides Text's raw comparator so that
the results are sorted in reverse dictionary order.
 */
public class ReverseWordComparator extends Text.Comparator {
    @Override
    public int compare(byte[] b1, int s1, int l1, byte[] b2, int s2, int l2) {
        return -super.compare(b1, s1, l1, b2, s2, l2);
    }
}
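    To make the job actually sort with this comparator, register it in the driver. A minimal sketch, assuming the usual wordcount driver's job object (the same call applies to the length comparator below):

// Replace the comparator used for the shuffle sort, so reducer keys
// arrive in reverse dictionary order instead of ascending order.
job.setSortComparatorClass(ReverseWordComparator.class);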

    Sort the wordcount results by length; if the lengths are equal, sort by character value:

import org.apache.hadoop.io.Text;

/*
Overrides Text's raw comparator:
sort by string length first;
if the lengths are equal, sort by the sum of the character values
(so e.g. "abc" and "cba" compare as equal).
 */
public class LengthComparator extends Text.Comparator{
    @Override
    public int compare(byte[] b1, int s1, int l1, byte[] b2, int s2, int l2) {
        // Text is serialized with a vint length prefix; for strings shorter
        // than 128 bytes that prefix is a single byte, which the +1/-1 skips.
        String str1=new String(b1,s1+1,l1-1);
        String str2=new String(b2,s2+1,l2-1);
        if(str1.length()==str2.length()){
            return getSize(str1)-getSize(str2);
        }else{
            return str1.length()-str2.length();
        }
    }
    // Sum of the character codes, used as the tie-breaker for equal lengths.
    public int getSize(String s){
        char[] chars=s.toCharArray();
        int sum=0;
        for(char ch:chars){
            sum+=ch;
        }
        return sum;
    }
}

2. Partitioning

    Partitioning lets you route different records into different output files as needed.

    The partitioning rule can be customized; by default a hash of the key is used (see the sketch after this list).

    When partitioning by business logic, pay attention to load balancing.

    How: write a class that extends Partitioner and overrides getPartition, then set the partitioning rule and the number of reducers with job.setPartitionerClass() and job.setNumReduceTasks().

    Relationship between the number of reducers and the number of partitions:

            Ideally the number of reducers equals the number of partitions.

            With only one reducer the job still runs, but the partitioning is pointless (everything ends up in one file).

            The number of reducers may be larger than the number of partitions, but some reducers will receive no data and produce empty output files.

            The number of reducers must not be smaller than the number of partitions, or the job fails.
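
    For reference, the default hash rule mentioned above is essentially the following. This is only a sketch of what the built-in HashPartitioner does, not code you need to write yourself:

import org.apache.hadoop.mapreduce.Partitioner;

// Sketch of the default behavior: a non-negative hash of the key,
// taken modulo the number of reduce tasks, picks the partition.
public class HashLikePartitioner<K, V> extends Partitioner<K, V> {
    @Override
    public int getPartition(K key, V value, int numReduceTasks) {
        return (key.hashCode() & Integer.MAX_VALUE) % numReduceTasks;
    }
}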

    ① Put phone numbers that share the same number prefix into the same file:

import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Partitioner;

/*
Puts phone numbers that share the same prefix into the same file.
When partitioning by custom business logic, watch out for load balancing.
Flow is a custom Writable value type defined elsewhere (not shown in this post).
 */
public class PhonePartitioner extends Partitioner<Text,Flow>{
    @Override
    public int getPartition(Text text, Flow flow, int numPartitions) {
        // Partition by the first two digits of the phone number.
        String phoneNum=text.toString();
        String  firstSecondNum=phoneNum.substring(0,2);
        if("13".equals(firstSecondNum)){
            return 0;
        }else if("15".equals(firstSecondNum)){
            return 1;
        }else if("18".equals(firstSecondNum)){
            return 2;
        }else{
            return 3;
        }
    }
}

    Then configure it on the job:

job.setNumReduceTasks(4);//four partitions, so four output files
job.setPartitionerClass(PhonePartitioner.class);

    ② Split the wordcount output into four files by the first letter of each word:

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Partitioner;

public class WordPartitioner extends Partitioner<Text,IntWritable> {

    @Override
    public int getPartition(Text text, IntWritable intWritable, int numPartitions) {
        String word = text.toString();
        char first;
        System.err.println(word.length());//debug output only
        if(word.length()==0){
            //guard against empty words: a space falls through to the last partition below
            first=' ';
        }else {
            first = word.charAt(0);
        }

        if ((first >= 'a' && first < 'g') || (first >= 'A' && first < 'G')) {
            return 0;
        } else if ((first >= 'g' && first < 'o') || (first >= 'G' && first < 'O')) {
            return 1;
        } else if ((first >= 'o' && first < 'u') || (first >= 'O' && first < 'U')) {
            return 2;
        } else {
            return 3;
        }
    }
}

    Then configure it on the job:

 job.setNumReduceTasks(4);
 job.setPartitionerClass(WordPartitioner.class);

3. Counters

    Counters come in two kinds:

            Built-in counters: the framework counters Hadoop prints when a job finishes (file system, job, and Map-Reduce framework counter groups).

            Custom counters.

    How to define a custom counter (see the sketch after this list):

            ① Counter counter = context.getCounter(groupName, counterName);

            ② With an enum:

                  public enum MC {
                       ERROR
                  }

                  Counter error = context.getCounter(MC.ERROR);

    Commonly used methods:

            counter.increment(1);

            job.getCounters().getGroup(groupName).iterator();
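
    A minimal sketch showing both styles inside a mapper; the class, group, and counter names here (CountingMapper, "MyGroup", "EmptyLines") are made up for illustration:

import java.io.IOException;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

public class CountingMapper extends Mapper<LongWritable,Text,Text,IntWritable> {
    // Enum-style counter: the group name is the enum class name,
    // the counter name is the constant name.
    public enum MC { ERROR }

    private Text oKey = new Text();
    private IntWritable one = new IntWritable(1);

    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        String line = value.toString();
        if (line.trim().isEmpty()) {
            // String-style counter: group and counter are identified by name.
            context.getCounter("MyGroup", "EmptyLines").increment(1);
            context.getCounter(MC.ERROR).increment(1);
            return;
        }
        for (String word : line.split(" ")) {
            oKey.set(word);
            context.write(oKey, one);
        }
    }
}

    After job.waitForCompletion(true) returns, the driver can read a single counter with job.getCounters().findCounter(CountingMapper.MC.ERROR).getValue(), or iterate a whole group with job.getCounters().getGroup("MyGroup").iterator().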

4. Combiner

    A combiner is written exactly like a reducer, but it runs in a different place: on the map side. It is effectively an extra, local reduce pass that improves performance by shrinking the amount of data the map phase sends to the reduce phase.

    Also, because the combiner's input is the map output and the combiner's output becomes the reduce input, the combiner's input and output key/value types must be identical (both must match the map output types)!

    How: write a class that extends Reducer, override its reduce method, and register it with job.setCombinerClass().

Example: sum the temperatures for each month

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;

public class ForSumTemp {
    public static class ForMapper extends Mapper<LongWritable,Text,Text,IntWritable>{
        private Text oKey=new Text();
        private IntWritable oValue=new IntWritable();
        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            //each input line: "<month> <temperature>", separated by a space
            String line=value.toString();
            String[] strs=line.split(" ");
            oKey.set(strs[0]);
            oValue.set(Integer.parseInt(strs[1]));
            context.write(oKey,oValue);
        }
    }
    public static class ForCombiner extends Reducer<Text,IntWritable,Text,IntWritable>{
        private IntWritable oValue=new IntWritable();
        @Override
        protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
            int sum=0;
            for(IntWritable i:values){
                sum+=i.get();
            }
            oValue.set(sum);
            context.write(key,oValue);
        }
    }
    public static class ForReducer extends Reducer<Text,IntWritable,Text,IntWritable>{
        @Override
        protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
            int sum=0;
            for(IntWritable i:values){
                sum+=i.get();
            }
            context.write(key,new IntWritable(sum));
        }
    }

    public static void main(String[] args) throws IOException, URISyntaxException, ClassNotFoundException, InterruptedException {
        Job job=Job.getInstance();
        job.setMapperClass(ForMapper.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);

        job.setReducerClass(ForReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        job.setCombinerClass(ForCombiner.class);

        FileSystem fileSystem=FileSystem.get(new URI("file:E://output"),new Configuration());
        Path path=new Path("E://output");
        if(fileSystem.exists(path)){
            fileSystem.delete(path,true);
        }

        FileInputFormat.addInputPath(job,new Path("E:\\forTestData\\forCombiner"));
        FileOutputFormat.setOutputPath(job,path);

        job.waitForCompletion(true);
    }
}

    The effect of the Combiner shows up in the job counters of the run above:

    With the Combiner:

    Without the Combiner:

    You can see that the reduce input records drop, because the values were already merged once in the combiner.

Example: average temperature for each month

An average cannot be combined directly (averaging partial averages gives the wrong result), so define a custom Writable that carries the temperature sum and the number of days:

import org.apache.hadoop.io.Writable;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

public class SumAndDay implements Writable{
    private double sumTemp;
    private int days;
    public SumAndDay(){

    }
    public SumAndDay(double sumTemp, int days) {
        this.sumTemp = sumTemp;
        this.days = days;
    }

    public double getSumTemp() {
        return sumTemp;
    }

    public void setSumTemp(double sumTemp) {
        this.sumTemp = sumTemp;
    }

    public int getDays() {
        return days;
    }

    public void setDays(int days) {
        this.days = days;
    }

    @Override
    public void write(DataOutput dataOutput) throws IOException {
        dataOutput.writeDouble(sumTemp);
        dataOutput.writeInt(days);
    }

    @Override
    public void readFields(DataInput dataInput) throws IOException {
        sumTemp=dataInput.readDouble();
        days=dataInput.readInt();
    }
}

Compute the average temperature:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.DoubleWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;
import java.text.DecimalFormat;

public class ForAverTemp {
    public static class ForMapper extends Mapper<LongWritable,Text,Text,SumAndDay>{
        private Text oKey=new Text();
        private SumAndDay oValue=new SumAndDay();
        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            String line=value.toString();
            String []strs=line.split(" ");
            oKey.set(strs[0]);
            oValue.setDays(1);
            oValue.setSumTemp(Integer.parseInt(strs[1]));
            context.write(oKey,oValue);
        }
    }
    public static class ForCombiner extends Reducer<Text,SumAndDay,Text,SumAndDay>{
        private SumAndDay oValue=new SumAndDay();
        @Override
        protected void reduce(Text key, Iterable<SumAndDay> values, Context context) throws IOException, InterruptedException {
            int day=0;
            double sum=0;//sumTemp is a double, so accumulate in a double
            for(SumAndDay sad:values){
                day+=sad.getDays();
                sum+=sad.getSumTemp();
            }
            oValue.setDays(day);
            oValue.setSumTemp(sum);
            context.write(key,oValue);
        }
    }
    public static class ForReducer extends Reducer<Text,SumAndDay,Text,DoubleWritable>{
        @Override
        protected void reduce(Text key, Iterable<SumAndDay> values, Context context) throws IOException, InterruptedException {
            int day=0;
            double sum=0;//sumTemp is a double, so accumulate in a double
            for(SumAndDay sad:values){
                day+=sad.getDays();
                sum+=sad.getSumTemp();
            }
            //keep two decimal places
            DecimalFormat  df = new DecimalFormat("#.00");
            double averTemp=Double.parseDouble(df.format(sum/day));
            context.write(key,new DoubleWritable(averTemp));
        }
    }

    public static void main(String[] args) throws IOException, URISyntaxException, ClassNotFoundException, InterruptedException {
        Job job=Job.getInstance();
        job.setMapperClass(ForMapper.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(SumAndDay.class);

        job.setReducerClass(ForReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(DoubleWritable.class);

        job.setCombinerClass(ForCombiner.class);

        FileSystem fileSystem=FileSystem.get(new URI("file:E://output"),new Configuration());
        Path path=new Path("E://output");
        if(fileSystem.exists(path)){
            fileSystem.delete(path,true);
        }

        FileInputFormat.addInputPath(job,new Path("E:\\forTestData\\forCombiner"));
        FileOutputFormat.setOutputPath(job,path);

        job.waitForCompletion(true);
    }
}