Classifying JSON Data with MapReduce


Data
(image: the sample student data in JSON format)


Requirement: store the data classified by gender and by arts (文科) / science (理科) class.
The JSON strings have to be converted into objects, so the fastjson jar is needed.


Custom Object

Define a custom class to hold each record.

import org.apache.hadoop.io.WritableComparable;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

public class Students implements WritableComparable<Students> {
    // Fields: id, name, age, gender (sex), and class name
    private long id;
    private String name;
    private int age;
    private String sex;
    private String className;

    // Sort by id in descending order; Long.compare avoids the overflow a long subtraction cast to int can cause
    @Override
    public int compareTo(Students o) {
        return Long.compare(o.id, this.id);
    }

    // Serialization
    @Override
    public void write(DataOutput dataOutput) throws IOException {
        dataOutput.writeLong(id);
        dataOutput.writeUTF(name);
        dataOutput.writeInt(age);
        dataOutput.writeUTF(sex);
        dataOutput.writeUTF(className);
    }

    
    // Deserialization: fields must be read in the same order they were written
    @Override
    public void readFields(DataInput dataInput) throws IOException {
        id = dataInput.readLong();
        name = dataInput.readUTF();
        age = dataInput.readInt();
        sex = dataInput.readUTF();
        className = dataInput.readUTF();
    }
    
    // Convenience setter that assigns all fields at once
    public void set(long id, String name, int age, String sex, String className) {
        this.id = id;
        this.name = name;
        this.age = age;
        this.sex = sex;
        this.className = className;
    }

    @Override
    public String toString() {
        return id + "," + name + "," + age + "," + sex + "," + className;
    }

    public long getId() {
        return id;
    }

    public void setId(long id) {
        this.id = id;
    }

    public String getName() {
        return name;
    }

    public void setName(String name) {
        this.name = name;
    }

    public int getAge() {
        return age;
    }

    public void setAge(int age) {
        this.age = age;
    }

    public String getSex() {
        return sex;
    }

    public void setSex(String sex) {
        this.sex = sex;
    }

    public String getClassName() {
        return className;
    }

    public void setClassName(String className) {
        this.className = className;
    }
}

The custom key class must implement WritableComparable, override compareTo, and serialize/deserialize its fields in write and readFields. Hadoop also needs a public no-argument constructor to instantiate the key during deserialization; the implicit default constructor is enough here.


Mapper Stage

import com.alibaba.fastjson.JSON;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;

public class MapTest extends Mapper<LongWritable, Text, Students, NullWritable> {
    // Reusable output key, so a new object is not allocated for every record
    private final Students k = new Students();

    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        // Parse one JSON line into a Students object
        Students students = JSON.parseObject(value.toString(), Students.class);
        // Copy the parsed fields into the output key (the name field, not the class name, goes second)
        k.set(students.getId(), students.getName(), students.getAge(), students.getSex(), students.getClassName());
        // Emit the key; the value carries no information
        context.write(k, NullWritable.get());
    }
}

fastjson's JSON.parseObject can turn a JSON string into an object. To do that conversion inside the Mapper, though, the input file must contain exactly one JSON object per line, because the default TextInputFormat feeds the Mapper one line at a time.
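As a rough illustration (the record values below are made up, and the JSON keys are assumed to match the Students property names), parsing one such line looks like this:

import com.alibaba.fastjson.JSON;

public class ParseDemo {
    public static void main(String[] args) {
        // One hypothetical input line; keys must match the Students getters/setters
        String line = "{\"id\":1001,\"name\":\"张三\",\"age\":18,\"sex\":\"男\",\"className\":\"理科一班\"}";
        Students s = JSON.parseObject(line, Students.class);
        System.out.println(s); // prints: 1001,张三,18,男,理科一班
    }
}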


Custom Partitioner

import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.Partitioner;

public class Partition extends Partitioner<Students, NullWritable> {
    @Override
    public int getPartition(Students students, NullWritable nullWritable, int numPartitions) {
        if ("男".equals(students.getSex())) {
            // Male students: partition 0 for arts (文科) classes, partition 1 otherwise
            if (students.getClassName().startsWith("文")) {
                return 0;
            }
            return 1;
        } else {
            // Female students: partition 2 for science (理科) classes, partition 3 otherwise
            if (students.getClassName().startsWith("理")) {
                return 2;
            }
            return 3;
        }
    }
}

To write a custom partitioner, extend Partitioner and override getPartition.
Partitioning happens during the shuffle, which sits between the map and reduce phases, so the partitioner's type parameters must match the Mapper's output key/value types.
Partition numbers are counted from 0, so partition 0 is the first partition and ends up in part-r-00000, partition 1 in part-r-00001, and so on.


Reducer Stage

import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;

public class RedTest extends Reducer<Students, NullWritable, Students, NullWritable> {

    @Override
    protected void reduce(Students key, Iterable<NullWritable> values, Context context) throws IOException, InterruptedException {
        // Keys that compare as equal are grouped into one reduce call; write the key once per value so no record is lost
        for (NullWritable value : values) {
            context.write(key,NullWritable.get());
        }
    }
}

This part needs no extra logic; simply write out every record.


Driver Stage

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.File;

public class DriTest {
    public static void main(String[] args) throws Exception {
        // The job fails if the output directory already exists, so clear it first
        File file = new File("D:\\MP\\students\\output");
        if (file.exists()) {
            delFile(file);
            driver();
        } else {
            driver();
        }
    }

    // Recursively delete a directory and everything inside it
    public static void delFile(File file) {
        File[] files = file.listFiles();
        if (files != null && files.length != 0) {
            for (int i = 0; i < files.length; i++) {
                delFile(files[i]);
            }
        }
        file.delete();
    }

    public static void driver() throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);

        job.setMapperClass(MapTest.class);
        job.setJarByClass(DriTest.class);
        job.setReducerClass(RedTest.class);

        job.setMapOutputKeyClass(Students.class);
        job.setMapOutputValueClass(NullWritable.class);
        job.setOutputKeyClass(Students.class);
        job.setOutputValueClass(NullWritable.class);

        // Use the custom partitioner; the number of reduce tasks must equal the number of partitions
        job.setPartitionerClass(Partition.class);
        job.setNumReduceTasks(4);
        // job.setOutputFormatClass(MyOutputFormat.class);

        FileInputFormat.setInputPaths(job, "D:\\MP\\students\\input\\students.json");
        FileOutputFormat.setOutputPath(job, new Path("D:\\MP\\students\\output"));
        boolean b = job.waitForCompletion(true);
        System.exit(b ? 0 : 1);
    }
}

The output directory must not exist when the job starts, otherwise Hadoop refuses to run the job.
The number of reduce tasks must match the number of partitions the partitioner can return.
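The driver above clears the output directory with java.io.File, which only works for a local path. A minimal alternative sketch using Hadoop's own FileSystem API (same hypothetical local path as above; with a cluster configuration the same calls would target HDFS) could look like this:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class OutputCleaner {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Path output = new Path("D:\\MP\\students\\output");
        // Resolves to the local file system here; with HDFS configured it resolves to HDFS
        FileSystem fs = FileSystem.get(conf);
        if (fs.exists(output)) {
            fs.delete(output, true); // true = delete recursively
        }
    }
}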


Results
(images: the four partitioned output files produced by the job)
The data is now stored classified by gender and by arts/science class.


Going Further


The data above is stored by category, but the default output file names reveal nothing about which category each file holds, so let's customize the output file names.


Custom OutputFormat

import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;


public class MyOutputFormat extends FileOutputFormat<Students, NullWritable> {
    @Override
    public RecordWriter<Students, NullWritable> getRecordWriter(TaskAttemptContext job) throws IOException {
        // Every record the reducer emits is handed to our custom RecordWriter
        return new MyRecordWriter(job);
    }
}

A custom output format must extend OutputFormat; here we extend FileOutputFormat, which already provides checkOutputSpecs and the output committer, leaving only getRecordWriter to implement.


Custom RecordWriter

import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hadoop.mapreduce.TaskAttemptContext;

import java.io.IOException;

public class MyRecordWriter extends RecordWriter<Students, NullWritable> {
    // One output stream per category file
    FSDataOutputStream stream1;
    FSDataOutputStream stream2;
    FSDataOutputStream stream3;
    FSDataOutputStream stream4;

    public MyRecordWriter(TaskAttemptContext taskAttemptContext) throws IOException {
        // The TaskAttemptContext provides the job configuration needed to open the file system
        FileSystem fileSystem = FileSystem.get(taskAttemptContext.getConfiguration());
        // Create one output file per gender / class-type combination
        stream1 = fileSystem.create(new Path("D:\\MP\\students\\output\\理科_男.txt"));
        stream2 = fileSystem.create(new Path("D:\\MP\\students\\output\\理科_女.txt"));
        stream3 = fileSystem.create(new Path("D:\\MP\\students\\output\\文科_男.txt"));
        stream4 = fileSystem.create(new Path("D:\\MP\\students\\output\\文科_女.txt"));
    }

    /**
     * Routing logic: decide which output stream each record goes to.
     */
    @Override
    public void write(Students key, NullWritable value) throws IOException, InterruptedException {
        if ("男".equals(key.getSex())) {
            if (key.getClassName().startsWith("理")) {
                stream1.write(key.toString().getBytes());
                stream1.write("\n".getBytes());
            } else {
                stream3.write(key.toString().getBytes());
                stream3.write("\n".getBytes());
            }
        } else {
            if (key.getClassName().startsWith("理")) {
                stream2.write(key.toString().getBytes());
                stream2.write("\n".getBytes());
            } else {
                stream4.write(key.toString().getBytes());
                stream4.write("\n".getBytes());
            }
        }
    }

    /**
     * Close all four streams.
     */
    @Override
    public void close(TaskAttemptContext context) throws IOException, InterruptedException {
        stream1.close();
        stream2.close();
        stream3.close();
        stream4.close();
    }
}

Modify the Driver

(image: the modified Driver code)
Comment out the custom partitioner and switch to the custom OutputFormat, as sketched below.
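Since the screenshot of the modified driver isn't reproduced here, this is a sketch of how DriTest.driver() might look after the change (assuming a single default reduce task, because the fixed file paths in MyRecordWriter would collide if several reducers opened them at the same time):

    public static void driver() throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);

        job.setJarByClass(DriTest.class);
        job.setMapperClass(MapTest.class);
        job.setReducerClass(RedTest.class);

        job.setMapOutputKeyClass(Students.class);
        job.setMapOutputValueClass(NullWritable.class);
        job.setOutputKeyClass(Students.class);
        job.setOutputValueClass(NullWritable.class);

        // Custom partitioner commented out; the single default reduce task handles all records
        // job.setPartitionerClass(Partition.class);
        // job.setNumReduceTasks(4);

        // Route all output through the custom OutputFormat, which writes the four named files
        job.setOutputFormatClass(MyOutputFormat.class);

        FileInputFormat.setInputPaths(job, "D:\\MP\\students\\input\\students.json");
        FileOutputFormat.setOutputPath(job, new Path("D:\\MP\\students\\output"));
        boolean b = job.waitForCompletion(true);
        System.exit(b ? 0 : 1);
    }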


Result
(images: the renamed output files and their contents)
The output files now carry the custom names, and the data inside each file is classified correctly.