MapReduce Secondary Sort - 夜幕思年华

Sample input data (two integers per line):
20 21
50 51
50 52
50 53
50 54
60 51
60 53
60 52
60 56
60 57
70 58
60 61
70 54
70 55
70 56
70 57
70 58
1 2
3 4
5 6
7 82
203 21
50 512
50 522
50 53
530 54
40 511
20 53
20 522
60 56
60 57
740 58
63 61
730 54
71 55
71 56
73 57
74 58
12 211
31 42
50 6
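
The implementation below follows the standard secondary-sort recipe: a composite key (IntPair) carries both fields and defines the full sort order, a custom partitioner (FirstPartitioner) routes records by the first field alone, and a grouping comparator (GroupingComparator) ensures that every pair sharing a first value arrives in a single reduce() call.
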
package com.doop.gw.hadoop.reducer.secondOrder;

import java.io.ByteArrayOutputStream;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;
import java.util.StringTokenizer;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Partitioner;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.junit.Test;


public class SecondarySort {

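  /**
   * Composite key holding both fields of a record. MapReduce sorts map
   * output by key, so packing (first, second) into the key lets the
   * framework perform the secondary sort for us.
   */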
  public static class IntPair implements WritableComparable<IntPair> {
    int first;
    int second;
    
    public void set(int left, int right) {
      first = left;
      second = right;
    }

    public int getFirst() {
      return first;
    }
    
    @Override
    // Deserialization: reconstruct an IntPair from the binary stream
    public void readFields(DataInput in) throws IOException {
      first = in.readInt();
      second = in.readInt();
    }

    @Override
    // Serialization: write the IntPair as binary onto the stream
    public void write(DataOutput out) throws IOException {
      out.writeInt(first);
      out.writeInt(second);
    }

    @Override
    // Key comparison: this method drives the secondary sort.
    public int compareTo(IntPair o) {
      if (first != o.first) {
        // Different first values: order by first
        return first < o.first ? -1 : 1;
      } else if (second != o.second) {
        // Equal first values: order by second; swap the return values
        // if you want a descending sort instead
        return second < o.second ? -1 : 1;
      } else {
        return 0;
      }
    }
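
    // Example of the resulting order, taken from the sample data above:
    // (50, 53) sorts before (50, 512) because the first values tie and
    // 53 < 512, while (50, 522) sorts before (60, 51) because 50 < 60.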

    // Two methods every custom key class should also override
    @Override
    //The hashCode() method is used by the HashPartitioner (the default partitioner in MapReduce)
    public int hashCode() {
      return first * 157 + second;
    }

    @Override
    public boolean equals(Object right) {
      if (right == null) {
        return false;
      }
      if (this == right) {
        return true;
      }
      if (right instanceof IntPair) {
        IntPair r = (IntPair) right;
        return r.first == first && r.second == second;
      } else {
        return false;
      }
    }
  }

  /**
   * Partitioner class. The partition is determined solely by first, so all
   * records sharing a first value are sent to the same reducer.
   */
  public static class FirstPartitioner extends Partitioner<IntPair, IntWritable> {

    @Override
    public int getPartition(IntPair key, IntWritable value, int numPartitions) {
      // Mask off the sign bit rather than calling Math.abs(), which can still
      // return a negative value when first * 127 overflows to Integer.MIN_VALUE.
      // Example: with 2 reduce tasks, first == 50 lands in (50 * 127) % 2 = 0.
      return (key.getFirst() * 127 & Integer.MAX_VALUE) % numPartitions;
    }
  }
  
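  /**
   * Grouping comparator. During the reduce phase it decides which keys are
   * grouped into one reduce() call; comparing only first means every IntPair
   * with the same first value is handed to a single call, with its values
   * already ordered by second thanks to IntPair.compareTo.
   */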
  public static class GroupingComparator extends WritableComparator {
    protected GroupingComparator() {
      super(IntPair.class, true);
    }

    @Override
    //Compare two WritableComparables.
    public int compare(WritableComparable w1, WritableComparable w2) {
      IntPair ip1 = (IntPair) w1;
      IntPair ip2 = (IntPair) w2;
      int l = ip1.getFirst();
      int r = ip2.getFirst();
      return l == r ? 0 : (l < r ? -1 : 1);
    }
  }


  // Custom Mapper: parses two integers from each line and emits
  // (IntPair(first, second), second) so the framework sorts on both fields
  public static class Map extends Mapper<LongWritable, Text, IntPair, IntWritable> {
    private final IntPair intkey = new IntPair();
    private final IntWritable intvalue = new IntWritable();

    @Override
    public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
      String line = value.toString();
      StringTokenizer tokenizer = new StringTokenizer(line);
      int left = 0;
      int right = 0;
      if (tokenizer.hasMoreTokens()) {
        left = Integer.parseInt(tokenizer.nextToken());
        if (tokenizer.hasMoreTokens()) {
          right = Integer.parseInt(tokenizer.nextToken());
        }
        intkey.set(left, right);
        intvalue.set(right);
        context.write(intkey, intvalue);
      }
    }
  }

  // Custom Reducer: each reduce() call receives one group of pairs sharing a
  // first value; it writes that first value next to each sorted second value
  public static class Reduce extends Reducer<IntPair, IntWritable, Text, IntWritable> {
    private final Text left = new Text();
    private static final Text SEPARATOR = new Text(
        "------------------------------------------------");

    @Override
    public void reduce(IntPair key, Iterable<IntWritable> values, Context context)
        throws IOException, InterruptedException {
      // A separator line before each group; TextOutputFormat prints only the
      // key when the value is null
      context.write(SEPARATOR, null);
      left.set(Integer.toString(key.getFirst()));
      for (IntWritable val : values) {
        context.write(left, val);
      }
    }
  }

  static final String INPUT_PATH = "hdfs://192.168.33.104:8020/input/secondSort";
  static final String OUT_PATH = "hdfs://192.168.33.104:8020/output";

  public static void main(String[] args)
      throws IOException, InterruptedException, ClassNotFoundException, URISyntaxException {
    // Load the Hadoop configuration
    Configuration conf = new Configuration();
    FileSystem fileSystem = FileSystem.get(new URI(INPUT_PATH), conf);
    Path outPath = new Path(OUT_PATH);
    if (fileSystem.exists(outPath)) {
      fileSystem.delete(outPath, true);
    }

    // Instantiate the job; Job.getInstance replaces the deprecated Job constructor
    Job job = Job.getInstance(conf, "secondarysort");
    job.setJarByClass(SecondarySort.class);
    // Mapper class
    job.setMapperClass(Map.class);
    // No Combiner: a Combiner's output type <Text, IntWritable> would not match
    // the Reducer's input type <IntPair, IntWritable>
    //job.setCombinerClass(Reduce.class);
    // Reducer class
    job.setReducerClass(Reduce.class);
    // Partitioner
    job.setPartitionerClass(FirstPartitioner.class);
    // Grouping comparator
    job.setGroupingComparatorClass(GroupingComparator.class);

    // Map output key type
    job.setMapOutputKeyClass(IntPair.class);
    // Map output value type
    job.setMapOutputValueClass(IntWritable.class);
    // Reduce output key type: Text, because the OutputFormat is TextOutputFormat
    job.setOutputKeyClass(Text.class);
    // Reduce output value type
    job.setOutputValueClass(IntWritable.class);

    // Split the input data set into input splits and provide a RecordReader implementation
    job.setInputFormatClass(TextInputFormat.class);
    // Provide a RecordWriter implementation responsible for writing the output
    job.setOutputFormatClass(TextOutputFormat.class);
    // Input HDFS path
    FileInputFormat.setInputPaths(job, new Path(INPUT_PATH));
    // Output HDFS path
    FileOutputFormat.setOutputPath(job, new Path(OUT_PATH));
    // Submit the job and wait for completion
    System.exit(job.waitForCompletion(true) ? 0 : 1);
  }


  /**
   * Read the job output back from HDFS and print it.
   */
  @Test
  public void testRead() {
    Configuration conf = new Configuration();
    try {
      // Create an HDFS FileSystem object from the configuration
      conf.set("fs.defaultFS", "hdfs://192.168.33.104:8020"); // required when connecting to a remote cluster
      FileSystem fs = FileSystem.get(conf);
      Path path = new Path("hdfs://192.168.33.104:8020/output/part-r-00000");
      FSDataInputStream fis = fs.open(path);
      ByteArrayOutputStream baos = new ByteArrayOutputStream();
      IOUtils.copyBytes(fis, baos, 1024);
      fis.close();
      System.out.println(new String(baos.toByteArray()));
    } catch (IOException e) {
      e.printStackTrace();
    }
  }
}
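
As a sanity check (my own derivation from the code and sample data, not output reproduced from the original post): with a single reduce task, which is the default, part-r-00000 should start with the smallest first values, each group preceded by the separator line and its values sorted ascending:

------------------------------------------------
1	2
------------------------------------------------
3	4
------------------------------------------------
5	6
------------------------------------------------
7	82
------------------------------------------------
12	211
------------------------------------------------
20	21
20	53
20	522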