数据:
班级|姓名|学科|成绩(存在重复)
ClassA|Jack|Math|95
ClassA|Jack|Math|95
ClassA|Jack|English|95
ClassA|Jack|English|95
ClassA|Tom|Math|85
ClassA|Tom|Math|85
ClassA|Tom|English|85
ClassA|Tom|English|85
ClassA|Jerry|Math|80
ClassA|Jerry|Math|80
ClassA|Jerry|English|80
ClassA|Jerry|English|80
ClassA|Bob|Math|60
ClassA|Bob|Math|60
ClassA|Bob|English|60
ClassA|Bob|English|60
ClassA|Alice|Math|90
ClassA|Alice|Math|90
ClassA|Alice|English|90
ClassA|Alice|English|90
ClassA|Rose|Math|90
ClassA|Rose|Math|90
ClassA|Rose|English|90
ClassA|Rose|English|90
ClassB|Jack|Math|90
ClassB|Jack|Math|90
ClassB|Jack|English|95
ClassB|Jack|English|95
ClassB|Tom|Math|85
ClassB|Tom|Math|85
ClassB|Tom|English|85
ClassB|Tom|English|85
ClassB|Jerry|Math|80
ClassB|Jerry|Math|80
ClassB|Jerry|English|80
ClassB|Jerry|English|80
ClassB|Bob|Math|60
ClassB|Bob|Math|60
ClassB|Bob|English|60
ClassB|Bob|English|60
ClassB|Alice|Math|90
ClassB|Alice|Math|90
ClassB|Alice|English|90
ClassB|Alice|English|90
ClassB|Rose|Math|99
ClassB|Rose|Math|99
ClassB|Rose|English|98
ClassB|Rose|English|98
需求:
将每个班级分别写出到不同的文件,内容是每个学科的前两名学生记录
处理结果1:
ClassB|Rose|English|98
ClassB|Jack|English|95
ClassB|Rose|Math|99
ClassB|Jack|Math|90
处理结果2:
ClassA|Jack|English|95
ClassA|Rose|English|90
ClassA|Jack|Math|95
ClassA|Alice|Math|90
分析:
- 不同的班级写出到不同的文件需要自定义Partitioner来实现自定义分区
- 数据存在重复,为了减轻reducer端压力,需要自定义combiner在map端先行去重(注意combiner只是优化手段,框架不保证一定执行)
- 为了保证相同学科分到一组,则需要自定义WritableComparator用来分组(当然这里是为了用而用,自定义类本身就已经实现了比较功能)
- 最后就是自定义类需要实现序列化和比较功能
代码实现:
package mr;
import org.apache.commons.beanutils.BeanUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.*;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Partitioner;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.lang.reflect.InvocationTargetException;
import java.util.StringTokenizer;
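/**
* @ClassName: TopNDriver
* @Description: MapReduce driver that partitions student score records by class and writes the top two records of every course
* @Author: xuezhouyi
* @Version: V1.0
**/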
public class TopNDriver {
/**
 * Map-output key holding one score record: class, student name, course and score.
 *
 * <p>Implements {@link WritableComparable} so it can be serialized and sorted as a job key.
 * Natural ordering is course ascending, then score descending, then class and name ascending.
 * Class and name are tie-breakers: two different students with the same course and score must
 * not compare equal, otherwise any de-duplication based on key equality would drop one of them.
 * Only exact duplicate records compare equal.
 */
public static class Student implements WritableComparable<Student> {
    private String stuClass;
    private String stuName;
    private String stuCourse;
    private Integer stuScore;

    public String getStuClass() {
        return stuClass;
    }

    public void setStuClass(String stuClass) {
        this.stuClass = stuClass;
    }

    public String getStuName() {
        return stuName;
    }

    public void setStuName(String stuName) {
        this.stuName = stuName;
    }

    public String getStuCourse() {
        return stuCourse;
    }

    public void setStuCourse(String stuCourse) {
        this.stuCourse = stuCourse;
    }

    public Integer getStuScore() {
        return stuScore;
    }

    public void setStuScore(Integer stuScore) {
        this.stuScore = stuScore;
    }

    /** Renders the record in the same pipe-delimited layout as the input lines. */
    @Override
    public String toString() {
        return this.stuClass + "|" + this.stuName + "|" + this.stuCourse + "|" + this.stuScore;
    }

    /**
     * Orders by course, then by score from high to low, then by class and name.
     *
     * @param o the record to compare with; all four fields of both records must be set
     * @return negative, zero or positive following the {@link Comparable} contract
     */
    @Override
    public int compareTo(Student o) {
        int byCourse = this.stuCourse.compareTo(o.stuCourse);
        if (byCourse != 0) {
            return byCourse;
        }
        /* Arguments are swapped on purpose: higher scores sort first. */
        int byScoreDescending = o.stuScore.compareTo(this.stuScore);
        if (byScoreDescending != 0) {
            return byScoreDescending;
        }
        /* Tie-breakers keep distinct students distinct and make the order deterministic. */
        int byClass = this.stuClass.compareTo(o.stuClass);
        if (byClass != 0) {
            return byClass;
        }
        return this.stuName.compareTo(o.stuName);
    }

    /** Two records are equal when all four fields are equal, consistent with {@link #compareTo}. */
    @Override
    public boolean equals(Object obj) {
        if (this == obj) {
            return true;
        }
        if (!(obj instanceof Student)) {
            return false;
        }
        Student other = (Student) obj;
        return same(this.stuClass, other.stuClass)
                && same(this.stuName, other.stuName)
                && same(this.stuCourse, other.stuCourse)
                && same(this.stuScore, other.stuScore);
    }

    /** Hash derived from the textual form, which contains all four fields. */
    @Override
    public int hashCode() {
        return toString().hashCode();
    }

    /** Null-safe equality for a single field. */
    private static boolean same(Object left, Object right) {
        return left == null ? right == null : left.equals(right);
    }

    /* Serialization: field order must match readFields. */
    @Override
    public void write(DataOutput out) throws IOException {
        out.writeUTF(this.stuClass);
        out.writeUTF(this.stuName);
        out.writeUTF(this.stuCourse);
        out.writeInt(this.stuScore);
    }

    /* Deserialization: reads the fields in the order written by write. */
    @Override
    public void readFields(DataInput in) throws IOException {
        this.stuClass = in.readUTF();
        this.stuName = in.readUTF();
        this.stuCourse = in.readUTF();
        this.stuScore = in.readInt();
    }
}
/**
 * Configures and submits the job, then exits with 0 on success and 1 on failure.
 *
 * @param args {@code args[0]} is the input path and {@code args[1]} is the output path;
 *             an existing output path is deleted recursively before the job runs
 * @throws IOException            if the job cannot be configured or the output path cannot be cleaned
 * @throws ClassNotFoundException propagated from {@code Job.waitForCompletion}
 * @throws InterruptedException   propagated from {@code Job.waitForCompletion}
 */
public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
    /* Fail with a usage message instead of an ArrayIndexOutOfBoundsException. */
    if (args.length < 2) {
        System.err.println("Usage: TopNDriver <input path> <output path>");
        System.exit(2);
    }
    Path inputPath = new Path(args[0]);
    Path outputPath = new Path(args[1]);

    Configuration conf = new Configuration();
    Job job = Job.getInstance(conf);
    job.setJarByClass(TopNDriver.class);
    job.setMapperClass(MyMapper.class);
    job.setReducerClass(MyReducer.class);
    /* Map output types */
    job.setMapOutputKeyClass(Student.class);
    job.setMapOutputValueClass(NullWritable.class);
    /* Reduce output types */
    job.setOutputKeyClass(Student.class);
    job.setOutputValueClass(NullWritable.class);
    /* Partitioning: two partitions, hence two reduce tasks */
    job.setPartitionerClass(MyPartitioner.class);
    job.setNumReduceTasks(2);
    /* Combiner */
    job.setCombinerClass(MyCombiner.class);
    /* Grouping comparator */
    job.setGroupingComparatorClass(MyGroup.class);
    /* Input and output paths */
    FileInputFormat.setInputPaths(job, inputPath);
    FileOutputFormat.setOutputPath(job, outputPath);
    /* Clean the output path first; resolve the file system from the path itself so a
       fully-qualified URI on a non-default file system is handled as well. */
    FileSystem fileSystem = outputPath.getFileSystem(conf);
    if (fileSystem.exists(outputPath)) {
        fileSystem.delete(outputPath, true);
    }
    /* Exit with the job status */
    System.exit(job.waitForCompletion(true) ? 0 : 1);
}
/**
 * Map task: parses one pipe-delimited line ({@code class|name|course|score}) into a
 * {@link Student} key and emits it with a {@link NullWritable} value.
 *
 * <p>Lines that do not have exactly four non-empty fields, or whose score is not an integer,
 * are dropped as part of data cleansing instead of failing the task.
 */
private static class MyMapper extends Mapper<LongWritable, Text, Student, NullWritable> {
    /* Reused across calls; context.write serializes it before the next record overwrites it. */
    private final Student k = new Student();

    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        /* Split the line */
        StringTokenizer tokenizer = new StringTokenizer(value.toString(), "|");
        /* Cleansing: wrong field count */
        if (tokenizer.countTokens() != 4) {
            return;
        }
        String stuClass = tokenizer.nextToken();
        String stuName = tokenizer.nextToken();
        String stuCourse = tokenizer.nextToken();
        /* Cleansing: non-numeric score (for example a header line with four columns) */
        int score;
        try {
            score = Integer.parseInt(tokenizer.nextToken().trim());
        } catch (NumberFormatException ignored) {
            return;
        }
        /* Populate the key only after the whole line has been validated */
        k.setStuClass(stuClass);
        k.setStuName(stuName);
        k.setStuCourse(stuCourse);
        k.setStuScore(score);
        /* Emit */
        context.write(k, NullWritable.get());
    }
}
/**
 * Partitioner that routes records by class name, so all records of one class reach the same
 * reduce task.
 */
private static class MyPartitioner extends Partitioner<Student, NullWritable> {
    /**
     * @param student       key whose class name selects the partition
     * @param nullWritable  unused value
     * @param numPartitions number of partitions configured for the job
     * @return the class-name hash, with the sign bit cleared, modulo {@code numPartitions}
     */
    @Override
    public int getPartition(Student student, NullWritable nullWritable, int numPartitions) {
        int classHash = student.getStuClass().hashCode();
        /* Clear the sign bit so the remainder is never negative. */
        int nonNegativeHash = classHash & Integer.MAX_VALUE;
        return nonNegativeHash % numPartitions;
    }
}
/**
 * Combiner used for de-duplication: for every group of keys handed to one {@code reduce} call
 * it emits the group's key exactly once and ignores the values.
 *
 * <p>The incoming key is written directly. A reflective bean copy is unnecessary because
 * {@code context.write} consumes the key immediately, and swallowing a copy failure would
 * emit a stale or half-populated record.
 */
private static class MyCombiner extends Reducer<Student, NullWritable, Student, NullWritable> {
    @Override
    protected void reduce(Student key, Iterable<NullWritable> values, Context context) throws IOException, InterruptedException {
        /* One record per group, regardless of how many duplicates the group contains. */
        context.write(key, NullWritable.get());
    }
}
/**
 * Grouping comparator: two keys belong to the same group when their course is equal.
 */
private static class MyGroup extends WritableComparator {
    public MyGroup() {
        /* Register Student as the key class and ask the base class to create key instances. */
        super(Student.class, true);
    }

    /**
     * @param a first key, expected to be a {@link Student}
     * @param b second key, expected to be a {@link Student}
     * @return the result of comparing the two course names
     */
    @Override
    public int compare(WritableComparable a, WritableComparable b) {
        String courseA = ((Student) a).getStuCourse();
        String courseB = ((Student) b).getStuCourse();
        return courseA.compareTo(courseB);
    }
}
/**
 * Reduce task: for each group (one course) writes the first {@link #TOP_N} distinct records
 * in the order they arrive.
 *
 * <p>Duplicates are filtered here as well as in the combiner. A combiner is only an
 * optimization that the framework may skip, and it sees the output of one map task at a time,
 * so identical records can still arrive at the reducer.
 */
private static class MyReducer extends Reducer<Student, NullWritable, Student, NullWritable> {
    /** Number of records kept per course. */
    private static final int TOP_N = 2;

    @Override
    protected void reduce(Student key, Iterable<NullWritable> values, Context context) throws IOException, InterruptedException {
        /* Textual form of the records already written for this group. */
        String[] emitted = new String[TOP_N];
        int emittedCount = 0;
        /* This loop relies on the framework refreshing the contents of key each time the
           value iterator advances, which is how the original loop obtained distinct records. */
        for (NullWritable ignored : values) {
            String record = key.toString();
            if (alreadyEmitted(emitted, emittedCount, record)) {
                continue;
            }
            /* Write to the job output */
            context.write(key, NullWritable.get());
            emitted[emittedCount++] = record;
            /* Stop once the top records of the course have been written */
            if (emittedCount == TOP_N) {
                return;
            }
        }
    }

    /** Returns whether {@code record} is among the first {@code count} entries of {@code emitted}. */
    private static boolean alreadyEmitted(String[] emitted, int count, String record) {
        for (int i = 0; i < count; i++) {
            if (emitted[i].equals(record)) {
                return true;
            }
        }
        return false;
    }
}
}