MapReduce二次排序案例,并且在Reduce输出时进行压缩

该博客详细介绍了如何在MapReduce中实现二次排序,通过自定义Bean对象封装数据,编写Map和Reduce类,以及驱动程序。在Reduce输出阶段,还进行了数据压缩,提高了存储效率。最终展示了执行结果及耗时情况。
摘要由CSDN通过智能技术生成

一、原始数据

文件secondsort.txt,内容如下:
在这里插入图片描述

二、代码展示

1、自定义Bean对象封装数据

package com.writablesort2;

import lombok.Getter;
import lombok.Setter;
import org.apache.hadoop.io.WritableComparable;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

@Getter
@Setter
@Getter
@Setter
public class FlowBean implements WritableComparable<FlowBean> {

    private String id;
    private double price;

    /**
     * No-arg constructor required by Hadoop, which instantiates the key
     * reflectively during deserialization.
     */
    public FlowBean() {

    }

    /**
     * Orders beans by numeric id ascending, then by price descending.
     *
     * <p>Bug fix: the original tie-break ({@code this.price > o.price ? -1 : 1})
     * never returned 0, so two beans with equal id and equal price compared
     * as both "greater" and "less" depending on argument order. That violates
     * the {@link Comparable} contract (sgn(a.compareTo(b)) must equal
     * -sgn(b.compareTo(a))) and can make sorts fail with
     * "Comparison method violates its general contract!".
     *
     * <p>NOTE(review): with this fix, records with identical id AND price are
     * equal keys and will be grouped into a single reduce call — confirm that
     * collapsing exact duplicates is acceptable for this job's output.
     *
     * @param o the bean to compare against
     * @return negative, zero, or positive per the Comparable contract
     */
    @Override
    public int compareTo(FlowBean o) {
        // Primary key: id compared numerically, ascending.
        int byId = Integer.compare(Integer.parseInt(this.id), Integer.parseInt(o.id));
        if (byId != 0) {
            return byId;
        }
        // Secondary key: price descending — operands reversed on purpose.
        return Double.compare(o.price, this.price);
    }

    /**
     * Serializes the fields to the wire format used by Hadoop's shuffle.
     *
     * @param dataOutput sink supplied by the framework
     * @throws IOException on write failure
     */
    @Override
    public void write(DataOutput dataOutput) throws IOException {
        dataOutput.writeUTF(this.id);
        dataOutput.writeDouble(this.price);
    }

    /**
     * Deserializes the fields; the read order must match {@link #write}.
     *
     * @param dataInput source supplied by the framework
     * @throws IOException on read failure
     */
    @Override
    public void readFields(DataInput dataInput) throws IOException {
        this.id = dataInput.readUTF();
        this.price = dataInput.readDouble();
    }

    /** Tab-separated form; this is exactly what the job writes per output line. */
    @Override
    public String toString() {
        return this.id + "\t" + this.price;
    }
}

2、Map类

package com.writablesort2;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;

public class SortMap extends Mapper<LongWritable, Text, FlowBean, NullWritable> {

    // Single instance reused across map() calls — the standard Hadoop
    // object-reuse idiom; context.write() serializes the current field
    // values immediately, so reuse is safe.
    FlowBean flowBean = new FlowBean();

    /**
     * Parses one tab-separated input line and emits a FlowBean as the key
     * (the value is a NullWritable placeholder). Column 0 is the id and
     * column 2 is the price; column 1 is not used by this job.
     */
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        String[] fields = value.toString().split("\t");

        flowBean.setId(fields[0]);
        flowBean.setPrice(Double.parseDouble(fields[2]));

        context.write(flowBean, NullWritable.get());
    }
}

3、Reduce类

package com.writablesort2;

import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;

public class SortReduce extends Reducer<FlowBean, NullWritable, FlowBean, NullWritable> {

    /**
     * Identity reduce: writes each incoming key straight through, once per
     * group. All the ordering work was already done during the shuffle via
     * FlowBean.compareTo, so nothing else is needed here.
     */
    @Override
    protected void reduce(FlowBean key, Iterable<NullWritable> values, Context context)
            throws IOException, InterruptedException {
        context.write(key, NullWritable.get());
    }
}

4、驱动程序

package com.writablesort2;

import com.utils.Driver;
import org.apache.hadoop.io.NullWritable;
import java.io.IOException;

public class SortDriver {

    // Default local paths used when the caller supplies no arguments.
    private static final String DEFAULT_INPUT = "D:\\Bigdata\\secondsort.txt";
    private static final String DEFAULT_OUTPUT = "D:\\Bigdata\\SecondSortResult";

    /**
     * Entry point: wires the secondary-sort job together via the shared
     * Driver helper.
     *
     * <p>Bug fix: the original unconditionally overwrote {@code args}, so any
     * paths passed on the command line were silently ignored. Defaults are
     * now used only when both paths are not provided.
     */
    public static void main(String[] args) throws InterruptedException, IOException, ClassNotFoundException {
        String inputPath = args.length >= 2 ? args[0] : DEFAULT_INPUT;
        String outputPath = args.length >= 2 ? args[1] : DEFAULT_OUTPUT;
        Driver.run(SortDriver.class, SortMap.class, FlowBean.class, NullWritable.class,
                SortReduce.class, FlowBean.class, NullWritable.class, inputPath, outputPath);
    }
}

5、run方法展示

package com.utils;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.compress.BZip2Codec;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;

public class Driver {
    /**
     * Builds, configures, and synchronously runs one MapReduce job, then
     * prints the wall-clock time taken. Compression is enabled in two
     * places: the intermediate map output (to cut shuffle traffic) and the
     * final reduce output, both with BZip2.
     *
     * @param driverClass       main class, used to locate the job jar
     * @param mapClass          Mapper implementation
     * @param mapKeyClass       map output key class
     * @param mapValueClass     map output value class
     * @param reduceClass       Reducer implementation
     * @param reduceKeyClass    final output key class
     * @param reduceValueClass  final output value class
     * @param inputPath         input file path
     * @param outputPath        output directory path; deleted first if it exists
     * @throws IOException            on filesystem or job-submission errors
     * @throws ClassNotFoundException if a job class cannot be resolved
     * @throws InterruptedException   if the wait for completion is interrupted
     */
    public static void run(Class<?> driverClass,
                           Class<? extends Mapper> mapClass,
                           Class<?> mapKeyClass,
                           Class<?> mapValueClass,
                           Class<? extends Reducer> reduceClass,
                           Class<?> reduceKeyClass,
                           Class<?> reduceValueClass,
                           String inputPath,
                           String outputPath) throws IOException, ClassNotFoundException, InterruptedException {
        long beginTime = System.currentTimeMillis();

        Configuration configuration = new Configuration();
        // Compress intermediate map output to reduce shuffle I/O.
        configuration.setBoolean("mapreduce.map.output.compress", true);
        configuration.setClass("mapreduce.map.output.compress.codec", BZip2Codec.class, CompressionCodec.class);

        Job job = Job.getInstance(configuration);
        job.setJarByClass(driverClass);

        // Mapper and its output <key, value> classes.
        job.setMapperClass(mapClass);
        job.setMapOutputKeyClass(mapKeyClass);
        job.setMapOutputValueClass(mapValueClass);

        // Reducer and the job's final output <key, value> classes.
        job.setReducerClass(reduceClass);
        job.setOutputKeyClass(reduceKeyClass);
        job.setOutputValueClass(reduceValueClass);

        // MapReduce refuses to start if the output directory already exists,
        // so remove any stale result from a previous run. Note: FileSystem
        // instances from FileSystem.get() are cached/shared, so we do not
        // close it here.
        Path path = new Path(outputPath);
        FileSystem fileSystem = FileSystem.get(configuration);
        if (fileSystem.exists(path)) {
            fileSystem.delete(path, true);
        }

        FileInputFormat.setInputPaths(job, new Path(inputPath));
        FileOutputFormat.setOutputPath(job, new Path(outputPath));

        // Compress the final reduce output as well.
        FileOutputFormat.setCompressOutput(job, true);
        FileOutputFormat.setOutputCompressorClass(job, BZip2Codec.class);

        // Bug fix: the original discarded the completion status, making a
        // failed job indistinguishable from a successful one.
        boolean succeeded = job.waitForCompletion(true);

        long endTime = System.currentTimeMillis();
        System.out.println("耗时:" + (endTime - beginTime) / 1000 + "秒");
        if (!succeeded) {
            System.err.println("MapReduce job failed; see job logs for details.");
        }
    }
}

三、执行结果

1、结果展示

在这里插入图片描述
在这里插入图片描述

2、耗时

在这里插入图片描述

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值