Classifying JSON Data with MapReduce
Data
Requirement: store the data grouped by gender and by arts/science track.
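A hypothetical sample of the input, one JSON record per line (the exact field names are assumed from the Students class defined below):

{"id":1,"name":"Tom","age":18,"sex":"男","className":"理科一班"}
{"id":2,"name":"Lily","age":17,"sex":"女","className":"文科三班"}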
Because the JSON strings need to be converted into objects, the fastjson jar is required.
Custom object
Define a custom class to hold each record.
import org.apache.hadoop.io.WritableComparable;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

public class Students implements WritableComparable<Students> {
    // id, name, age, sex, and class-name fields
    private long id;
    private String name;
    private int age;
    private String sex;
    private String className;

    // sort by id in descending order; Long.compare avoids the overflow
    // that casting (o.id - this.id) to int can produce for large ids
    @Override
    public int compareTo(Students o) {
        return Long.compare(o.id, this.id);
    }

    // serialization
    @Override
    public void write(DataOutput dataOutput) throws IOException {
        dataOutput.writeLong(id);
        dataOutput.writeUTF(name);
        dataOutput.writeInt(age);
        dataOutput.writeUTF(sex);
        dataOutput.writeUTF(className);
    }

    // deserialization: fields must be read in the same order they were written
    @Override
    public void readFields(DataInput dataInput) throws IOException {
        id = dataInput.readLong();
        name = dataInput.readUTF();
        age = dataInput.readInt();
        sex = dataInput.readUTF();
        className = dataInput.readUTF();
    }

    // convenience setter that assigns all fields at once
    public void set(long id, String name, int age, String sex, String className) {
        this.id = id;
        this.name = name;
        this.age = age;
        this.sex = sex;
        this.className = className;
    }

    @Override
    public String toString() {
        return id + "," + name + "," + age + "," + sex + "," + className;
    }

    public long getId() { return id; }

    public void setId(long id) { this.id = id; }

    public String getName() { return name; }

    public void setName(String name) { this.name = name; }

    public int getAge() { return age; }

    public void setAge(int age) { this.age = age; }

    public String getSex() { return sex; }

    public void setSex(String sex) { this.sex = sex; }

    public String getClassName() { return className; }

    public void setClassName(String className) { this.className = className; }
}
The custom key class must implement WritableComparable, override compareTo, and serialize and deserialize every field in write and readFields.
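To see that contract in action, here is a minimal round-trip sketch using plain java.io streams (the sample values are made up):

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.IOException;

public class StudentsRoundTrip {
    public static void main(String[] args) throws IOException {
        Students in = new Students();
        in.set(1L, "Tom", 18, "男", "理科一班"); // hypothetical record

        // serialize the object the same way Hadoop would
        ByteArrayOutputStream bytes = new ByteArrayOutputStream();
        in.write(new DataOutputStream(bytes));

        // deserialize into a fresh object and print it
        Students out = new Students();
        out.readFields(new DataInputStream(new ByteArrayInputStream(bytes.toByteArray())));
        System.out.println(out); // 1,Tom,18,男,理科一班
    }
}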
Mapper stage
import com.alibaba.fastjson.JSON;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import java.io.IOException;

public class MapTest extends Mapper<LongWritable, Text, Students, NullWritable> {
    Students k = new Students();
    Students students = new Students();

    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        // convert the JSON string into a Students object
        students = JSON.parseObject(value.toString(), Students.class);
        // copy the parsed fields into the output key
        // (note: the second argument is getName(), not getClassName())
        k.set(students.getId(), students.getName(), students.getAge(), students.getSex(), students.getClassName());
        // emit the key with a NullWritable placeholder value
        context.write(k, NullWritable.get());
    }
}
fastjson's parseObject converts a JSON string into an object. To do this inside the Mapper, however, each JSON record must sit on its own line, because the Mapper receives its input line by line.
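As a standalone illustration of the call (the sample line is hypothetical):

import com.alibaba.fastjson.JSON;

public class ParseDemo {
    public static void main(String[] args) {
        // one complete JSON record on a single line, field names matching Students
        String line = "{\"id\":1,\"name\":\"Tom\",\"age\":18,\"sex\":\"男\",\"className\":\"理科一班\"}";
        Students s = JSON.parseObject(line, Students.class);
        System.out.println(s); // 1,Tom,18,男,理科一班
    }
}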
Custom partitioner
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.Partitioner;

public class Partition extends Partitioner<Students, NullWritable> {
    // 0 = arts/male, 1 = science/male, 2 = science/female, 3 = arts/female
    @Override
    public int getPartition(Students students, NullWritable nullWritable, int numPartitions) {
        if ("男".equals(students.getSex())) {
            if (students.getClassName().startsWith("文")) {
                return 0;
            }
            return 1;
        } else {
            if (students.getClassName().startsWith("理")) {
                return 2;
            }
            return 3;
        }
    }
}
To implement a custom partitioner, extend Partitioner and implement getPartition.
Partitioning happens during the shuffle, which sits between the map and reduce stages, so the partitioner's generic types are the same as the Mapper's output types.
Partitions are numbered from 0, so partition 0 is the first partition; see the sketch below.
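A quick sanity check of the routing (a sketch; the records are made up):

import org.apache.hadoop.io.NullWritable;

public class PartitionDemo {
    public static void main(String[] args) {
        Partition p = new Partition();
        Students s = new Students();

        s.set(1L, "Tom", 18, "男", "文科一班");
        System.out.println(p.getPartition(s, NullWritable.get(), 4)); // 0 -> part-r-00000

        s.set(2L, "Lily", 17, "女", "理科二班");
        System.out.println(p.getPartition(s, NullWritable.get(), 4)); // 2 -> part-r-00002
    }
}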
Reduce stage
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.Reducer;
import java.io.IOException;

public class RedTest extends Reducer<Students, NullWritable, Students, NullWritable> {
    @Override
    protected void reduce(Students key, Iterable<NullWritable> values, Context context) throws IOException, InterruptedException {
        // write the key once per value so duplicate records are preserved
        for (NullWritable value : values) {
            context.write(key, NullWritable.get());
        }
    }
}
This part needs no extra logic; it simply writes out every record.
Driver stage
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import java.io.File;

public class DriTest {
    public static void main(String[] args) throws Exception {
        // delete the output directory if it already exists, then run the job
        File file = new File("D:\\MP\\students\\output");
        if (file.exists()) {
            delFile(file);
        }
        driver();
    }

    // recursively delete a directory and its contents
    public static void delFile(File file) {
        File[] files = file.listFiles();
        if (files != null && files.length != 0) {
            for (int i = 0; i < files.length; i++) {
                delFile(files[i]);
            }
        }
        file.delete();
    }

    public static void driver() throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);
        job.setJarByClass(DriTest.class);
        job.setMapperClass(MapTest.class);
        job.setReducerClass(RedTest.class);
        job.setMapOutputKeyClass(Students.class);
        job.setMapOutputValueClass(NullWritable.class);
        job.setOutputKeyClass(Students.class);
        job.setOutputValueClass(NullWritable.class);
        // one reduce task per partition
        job.setPartitionerClass(Partition.class);
        job.setNumReduceTasks(4);
        // job.setOutputFormatClass(MyOutputFormat.class);
        FileInputFormat.setInputPaths(job, "D:\\MP\\students\\input\\students.json");
        FileOutputFormat.setOutputPath(job, new Path("D:\\MP\\students\\output"));
        boolean b = job.waitForCompletion(true);
        System.exit(b ? 0 : 1);
    }
}
The output directory must not already exist when the job starts, otherwise it throws an error (which is why the driver deletes it first).
The number of reduce tasks must match the number of partitions.
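With four reduce tasks, the output directory should contain one file per partition, along these lines (standard Hadoop naming):

part-r-00000   (partition 0: arts/male)
part-r-00001   (partition 1: science/male)
part-r-00002   (partition 2: science/female)
part-r-00003   (partition 3: arts/female)
_SUCCESS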
Result
The data is now stored grouped by gender and by arts/science track.
Going further
The data above is stored into separate files, but the file names alone give no hint of what each file contains. Can we customize the names of the output files?
Custom OutputFormat
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import java.io.IOException;

public class MyOutputFormat extends FileOutputFormat<Students, NullWritable> {
    @Override
    public RecordWriter<Students, NullWritable> getRecordWriter(TaskAttemptContext job) throws IOException {
        return new MyRecordWriter(job);
    }
}
A custom output format must extend OutputFormat; here we extend its FileOutputFormat subclass.
Custom RecordWriter
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import java.io.IOException;

public class MyRecordWriter extends RecordWriter<Students, NullWritable> {
    // one output stream per target file
    FSDataOutputStream stream1;
    FSDataOutputStream stream2;
    FSDataOutputStream stream3;
    FSDataOutputStream stream4;

    public MyRecordWriter(TaskAttemptContext taskAttemptContext) throws IOException {
        // the TaskAttemptContext carries the job configuration
        FileSystem fileSystem = FileSystem.get(taskAttemptContext.getConfiguration());
        // open the four output files
        stream1 = fileSystem.create(new Path("D:\\MP\\students\\output\\理科_男.txt"));
        stream2 = fileSystem.create(new Path("D:\\MP\\students\\output\\理科_女.txt"));
        stream3 = fileSystem.create(new Path("D:\\MP\\students\\output\\文科_男.txt"));
        stream4 = fileSystem.create(new Path("D:\\MP\\students\\output\\文科_女.txt"));
    }

    /**
     * Routing logic: decide which stream each record is written to.
     */
    @Override
    public void write(Students key, NullWritable value) throws IOException, InterruptedException {
        if ("男".equals(key.getSex())) {
            if (key.getClassName().startsWith("理")) {
                stream1.write(key.toString().getBytes());
                stream1.write("\n".getBytes());
            } else {
                stream3.write(key.toString().getBytes());
                stream3.write("\n".getBytes());
            }
        } else {
            if (key.getClassName().startsWith("理")) {
                stream2.write(key.toString().getBytes());
                stream2.write("\n".getBytes());
            } else {
                stream4.write(key.toString().getBytes());
                stream4.write("\n".getBytes());
            }
        }
    }

    /**
     * Close all four streams when the task finishes.
     */
    @Override
    public void close(TaskAttemptContext context) throws IOException, InterruptedException {
        stream1.close();
        stream2.close();
        stream3.close();
        stream4.close();
    }
}
Modifying the Driver
Comment out the custom partitioner and switch the job to the custom OutputFormat.
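The changed lines in DriTest.driver() would look roughly like this (a sketch; note that the fixed file paths in MyRecordWriter suggest running a single reduce task, so multiple writers don't collide on the same files):

// enable the custom output format
job.setOutputFormatClass(MyOutputFormat.class);
// the partitioner (and the matching reduce-task count) are no longer needed;
// with the default single reduce task, one MyRecordWriter writes all four files
// job.setPartitionerClass(Partition.class);
// job.setNumReduceTasks(4);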
Result
The output files now carry the custom names.
The data inside each file is classified correctly as well.