1. Overall Project Architecture
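The project is a plain Maven project; assuming the standard Maven source layout (only the class names are given by the code below), the tree looks roughly like this:

SecondSort
├── pom.xml
└── src/main/java/org/example
    ├── MyKeyPair.java          // custom composite key
    ├── MyPartitioner.java      // custom partitioner
    ├── MyGroupComparator.java  // custom grouping comparator
    ├── MyMapper.java           // Mapper
    ├── MyReducer.java          // Reducer
    └── MySecondSort.java       // job driver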
2. The Project's pom.xml
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
  <modelVersion>4.0.0</modelVersion>

  <groupId>org.example</groupId>
  <artifactId>SecondSort</artifactId>
  <version>1.0-SNAPSHOT</version>

  <name>SecondSort</name>
  <!-- FIXME change it to the project's website -->
  <url>http://www.example.com</url>

  <properties>
    <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
    <maven.compiler.source>1.7</maven.compiler.source>
    <maven.compiler.target>1.7</maven.compiler.target>
  </properties>

  <dependencies>
    <dependency>
      <groupId>junit</groupId>
      <artifactId>junit</artifactId>
      <version>4.11</version>
      <scope>test</scope>
    </dependency>
    <dependency>
      <groupId>org.apache.hadoop</groupId>
      <artifactId>hadoop-client</artifactId>
      <version>2.10.0</version>
    </dependency>
  </dependencies>

  <build>
    <pluginManagement><!-- lock down plugins versions to avoid using Maven defaults (may be moved to parent pom) -->
      <plugins>
        <!-- clean lifecycle, see https://maven.apache.org/ref/current/maven-core/lifecycles.html#clean_Lifecycle -->
        <plugin>
          <artifactId>maven-clean-plugin</artifactId>
          <version>3.1.0</version>
        </plugin>
        <!-- default lifecycle, jar packaging: see https://maven.apache.org/ref/current/maven-core/default-bindings.html#Plugin_bindings_for_jar_packaging -->
        <plugin>
          <artifactId>maven-resources-plugin</artifactId>
          <version>3.0.2</version>
        </plugin>
        <plugin>
          <artifactId>maven-compiler-plugin</artifactId>
          <version>3.8.0</version>
        </plugin>
        <plugin>
          <artifactId>maven-surefire-plugin</artifactId>
          <version>2.22.1</version>
        </plugin>
        <plugin>
          <artifactId>maven-jar-plugin</artifactId>
          <version>3.0.2</version>
        </plugin>
        <plugin>
          <artifactId>maven-install-plugin</artifactId>
          <version>2.5.2</version>
        </plugin>
        <plugin>
          <artifactId>maven-deploy-plugin</artifactId>
          <version>2.8.2</version>
        </plugin>
        <!-- site lifecycle, see https://maven.apache.org/ref/current/maven-core/lifecycles.html#site_Lifecycle -->
        <plugin>
          <artifactId>maven-site-plugin</artifactId>
          <version>3.7.1</version>
        </plugin>
        <plugin>
          <artifactId>maven-project-info-reports-plugin</artifactId>
          <version>3.0.0</version>
        </plugin>
      </plugins>
    </pluginManagement>
  </build>
</project>
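A note on the dependencies: hadoop-client pulls in everything the job needs to compile and to run locally. When submitting to a cluster that already provides the Hadoop jars on its classpath, it is common to mark this dependency <scope>provided</scope> so it is not packaged into the artifact.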
3. The Code for Each Class
The custom composite key class
package org.example;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import org.apache.hadoop.io.WritableComparable;

public class MyKeyPair implements WritableComparable<MyKeyPair> {
    // The composite key's fields
    private String first;  // first sort field
    private int second;    // second sort field

    // Deserialization: read the fields back from the input stream
    @Override
    public void readFields(DataInput dataInput) throws IOException {
        this.first = dataInput.readUTF();
        this.second = dataInput.readInt();
    }

    // Serialization: write the fields to the output stream
    @Override
    public void write(DataOutput dataOutput) throws IOException {
        dataOutput.writeUTF(first);
        dataOutput.writeInt(second);
    }

    // Comparator: defines the sort order of the composite keys.
    // The first field sorts ascending (the default); the second field
    // sorts descending.
    @Override
    public int compareTo(MyKeyPair k) {
        int res = this.first.compareTo(k.first);
        if (res != 0) { // if the first fields differ, they decide the order
            return res;
        } else {
            // negate the ascending comparison of the second fields
            // to get a descending order
            return -Integer.compare(this.second, k.second);
        }
    }

    // Getters and setters for the fields
    public int getSecond() {
        return second;
    }
    public void setSecond(int second) {
        this.second = second;
    }
    public String getFirst() {
        return this.first;
    }
    public void setFirst(String first) {
        this.first = first;
    }
}
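To see the ordering the composite key produces, here is a minimal local check (not part of the job; the class name and sample values are made up for illustration):

package org.example;

import java.util.ArrayList;
import java.util.Collections;
import java.util.List;

// Sorts a few sample keys with MyKeyPair.compareTo to show the order:
// first field ascending, second field descending.
public class KeyOrderDemo {
    public static void main(String[] args) {
        List<MyKeyPair> keys = new ArrayList<>();
        keys.add(pair("a", 3));
        keys.add(pair("b", 1));
        keys.add(pair("a", 7));
        Collections.sort(keys);
        // Expected output: a,7  a,3  b,1
        for (MyKeyPair k : keys) {
            System.out.println(k.getFirst() + "," + k.getSecond());
        }
    }

    private static MyKeyPair pair(String first, int second) {
        MyKeyPair k = new MyKeyPair();
        k.setFirst(first);
        k.setSecond(second);
        return k;
    }
}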
The custom partitioner class
package org.example;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.mapreduce.Partitioner;

public class MyPartitioner extends Partitioner<MyKeyPair, IntWritable> {
    // Custom partitioning:
    // myKeyPair is the map output key, intWritable the map output value,
    // and i the number of partitions (i.e. the number of reducers).
    // Returns the partition number for this key.
    @Override
    public int getPartition(MyKeyPair myKeyPair, IntWritable intWritable, int i) {
        // Partition on the first field only, so all records with the same
        // first field reach the same reducer. Masking with Integer.MAX_VALUE
        // clears the sign bit, keeping the result non-negative.
        return (myKeyPair.getFirst().hashCode() & Integer.MAX_VALUE) % i;
    }
}
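A quick way to convince yourself that records sharing a first field land in the same partition is a small local check (a hypothetical demo class; numPartitions = 3 is arbitrary):

package org.example;

import org.apache.hadoop.io.IntWritable;

// Calls getPartition directly: the two "a" keys print the same
// partition number even though their second fields differ.
public class PartitionDemo {
    public static void main(String[] args) {
        MyPartitioner partitioner = new MyPartitioner();
        IntWritable dummyValue = new IntWritable(0);
        for (String first : new String[]{"a", "a", "b"}) {
            MyKeyPair key = new MyKeyPair();
            key.setFirst(first);
            key.setSecond(42);
            System.out.println(first + " -> partition "
                    + partitioner.getPartition(key, dummyValue, 3));
        }
    }
}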
The custom grouping comparator class
package org.example;

import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;

public class MyGroupComparator extends WritableComparator {
    protected MyGroupComparator() {
        // Register the key type; true means instances of it will be created.
        // Without this, the comparator fails with a null-value error.
        super(MyKeyPair.class, true);
    }

    // Group on the first field only. Note the signature: the framework calls
    // compare(WritableComparable, WritableComparable), so this method must be
    // overridden; a compare(MyKeyPair, MyKeyPair) overload would never be
    // invoked and the grouping would silently not take effect.
    @Override
    public int compare(WritableComparable a, WritableComparable b) {
        MyKeyPair o1 = (MyKeyPair) a;
        MyKeyPair o2 = (MyKeyPair) b;
        return o1.getFirst().compareTo(o2.getFirst());
    }
}
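The effect is that two keys with the same first field compare as equal, so the framework feeds their values to a single reduce() call. A minimal sketch (demo class and sample values are made up):

package org.example;

// Two keys with the same first field but different second fields
// compare as equal under the grouping comparator.
public class GroupDemo {
    public static void main(String[] args) {
        MyGroupComparator comparator = new MyGroupComparator();
        MyKeyPair k1 = new MyKeyPair();
        k1.setFirst("a");
        k1.setSecond(3);
        MyKeyPair k2 = new MyKeyPair();
        k2.setFirst("a");
        k2.setSecond(7);
        System.out.println(comparator.compare(k1, k2)); // prints 0: same group
    }
}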
The Mapper class
package org.example;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import java.io.IOException;

public class MyMapper extends
        Mapper<LongWritable, Text, MyKeyPair, IntWritable> {
    @Override
    public void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        // Each input line holds two comma-separated fields
        String line = value.toString();
        String[] ss = line.split(",");
        // Build the MyKeyPair output key
        MyKeyPair outKey = new MyKeyPair();
        String first = ss[0];
        String second = ss[1];
        outKey.setFirst(first);
        outKey.setSecond(Integer.parseInt(second));
        // Build the IntWritable output value
        IntWritable outValue = new IntWritable();
        outValue.set(Integer.parseInt(second));
        // Emit the <key, value> pair
        context.write(outKey, outValue);
    }
}
The Reducer class
package org.example;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
import java.io.IOException;

public class MyReducer extends
        Reducer<MyKeyPair, IntWritable, Text, IntWritable> {
    @Override
    public void reduce(MyKeyPair key, Iterable<IntWritable> values, Context context)
            throws IOException, InterruptedException {
        // Each group holds all values for one first field, already sorted
        // descending by the second field; write one line per value.
        Text outKey = new Text();
        for (IntWritable value : values) {
            outKey.set(key.getFirst());
            context.write(outKey, value);
        }
    }
}
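One subtlety worth knowing: as the loop iterates over values, Hadoop advances the shared key object in step with them, so key.getSecond() changes on every iteration. key.getFirst() stays constant within the group precisely because the grouping comparator groups on it, which is why reading it inside the loop is safe here.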
The driver (main) class
package org.example;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import java.io.IOException;

public class MySecondSort {
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        // Initialize the Configuration
        Configuration conf = new Configuration();
        // Set the HDFS address (uncomment when running against a remote cluster)
        //conf.set("fs.default.name", "hdfs://192.168.170.133:9000");
        // Build the job
        Job myJob = Job.getInstance(conf, MySecondSort.class.getName());
        myJob.setJarByClass(MySecondSort.class);
        // Mapper class
        myJob.setMapperClass(MyMapper.class);
        // Custom partitioner
        myJob.setPartitionerClass(MyPartitioner.class);
        // Custom grouping comparator
        myJob.setGroupingComparatorClass(MyGroupComparator.class);
        // Reducer class
        myJob.setReducerClass(MyReducer.class);
        // Map output types; must match the map method's output types
        myJob.setMapOutputKeyClass(MyKeyPair.class);
        myJob.setMapOutputValueClass(IntWritable.class);
        // Reduce output types; must match the reduce method's output types
        myJob.setOutputKeyClass(Text.class);
        myJob.setOutputValueClass(IntWritable.class);
        // Input format: splits the input into line-oriented splits
        myJob.setInputFormatClass(TextInputFormat.class);
        // Output format: provides the RecordWriter that writes the results
        myJob.setOutputFormatClass(TextOutputFormat.class);
        // Input path
        FileInputFormat.addInputPath(myJob, new Path("/wc/input"));
        // Output path (must not already exist)
        FileOutputFormat.setOutputPath(myJob, new Path("/output"));
        // Submit the job to the Hadoop cluster and exit with its status
        System.exit(myJob.waitForCompletion(true) ? 0 : 1);
    }
}
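To try the job end to end: build the jar with mvn package, put a comma-separated input file under /wc/input on HDFS, and submit with hadoop jar target/SecondSort-1.0-SNAPSHOT.jar org.example.MySecondSort (the jar name follows the pom above; adjust the paths to your environment). With a made-up input file such as

a,3
b,1
a,7

the job writes the records grouped by the first field, with the second field in descending order (TextOutputFormat separates key and value with a tab):

a	7
a	3
b	1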