求每个课程(course)中分数最高的学生的信息 Student(course, score, name):
package mapreduce.exercise.score;
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
/**
* 描述: 求每个班级中的分数最高的学生的信息Student(course,score,name)
*
*/
/**
 * MapReduce driver: for each course, output the student with the highest
 * average score.
 *
 * Input lines have the shape: course,name,score1,score2,...
 *
 * The Student key's compareTo() sorts by course and then by score
 * descending; the custom grouping comparator groups reduce input by
 * course only, so the first key of every reduce group is that course's
 * top student.
 */
public class CSMR {

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        conf.set("fs.defaultFS", "hdfs://hadoop01:9000");
        System.setProperty("HADOOP_USER_NAME", "hadoop");

        Job job = Job.getInstance(conf);
        job.setJarByClass(CSMR.class);
        job.setMapperClass(CSMRMapper.class);
        job.setReducerClass(CSMRReducer.class);
        job.setMapOutputKeyClass(Student.class);
        job.setMapOutputValueClass(NullWritable.class);
        job.setOutputKeyClass(Student.class);
        job.setOutputValueClass(NullWritable.class);

        // Group reduce input by course only, overriding the default
        // "grouping == sort order" behavior.
        job.setGroupingComparatorClass(CSGroupComparator.class);

        Path inputPath = new Path(args[0]);
        Path outputPath = new Path(args[1]);
        FileInputFormat.setInputPaths(job, inputPath);

        // Remove a stale output directory so the job can be re-run.
        FileSystem fs = FileSystem.get(conf);
        if (fs.exists(outputPath)) {
            fs.delete(outputPath, true);
        }
        FileOutputFormat.setOutputPath(job, outputPath);

        boolean isDone = job.waitForCompletion(true);
        System.exit(isDone ? 0 : 1);
    }

    /**
     * Mapper: parses one CSV line and emits a Student key carrying the
     * course, the student's name and the average of all listed scores.
     *
     * Example input value: computer,huangxiaoming,85,86,41,75,93,42,85
     */
    private static class CSMRMapper extends Mapper<LongWritable, Text, Student, NullWritable> {

        // Reused across map() calls; Hadoop serializes the key on write,
        // so mutating the same instance is safe here.
        private Student student = new Student();

        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            String[] split = value.toString().split(",");
            String course = split[0];
            String name = split[1];

            // Average every score field after course and name.
            double sumScore = 0;
            int countScore = 0;
            for (int i = 2; i < split.length; i++) {
                sumScore += Integer.parseInt(split[i]);
                countScore++;
            }
            double avgScore = sumScore / countScore;

            student.setCourse(course);
            student.setName(name);
            student.setScore(avgScore);
            context.write(student, NullWritable.get());
        }
    }

    /**
     * Reducer: each reduce group holds all students of one course,
     * already sorted by score descending, so the first key in the group
     * is the course's highest-scoring student.
     */
    private static class CSMRReducer extends Reducer<Student, NullWritable, Student, NullWritable> {

        @Override
        protected void reduce(Student key, Iterable<NullWritable> values, Context context)
                throws IOException, InterruptedException {
            // Emit only the first (highest-scoring) record of the group.
            // BUG FIX: the original loop emitted the top TWO records
            // (count == top2), contradicting both the requirement and
            // its own "only output the highest score" comment.
            for (NullWritable nvl : values) {
                context.write(key, NullWritable.get());
                break;
            }
        }
    }
}
默认情况下排序规则就是分组规则;当两者需要不一致时,就要自定义分组规则:
package mapreduce.exercise.score;
/**
* 描述: WritableComparator抽象类的作用 : 自定义分组规则
*
*/
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;
/**
 * Custom grouping comparator for the CSMR job.
 *
 * Groups reduce input keys by course only, so all Student keys of the
 * same course reach a single reduce() call even though the shuffle sort
 * order also takes the score into account.
 */
public class CSGroupComparator extends WritableComparator {

    public CSGroupComparator() {
        // true: create Student instances so compare() receives fully
        // deserialized keys rather than raw bytes.
        super(Student.class, true);
    }

    @Override
    public int compare(WritableComparable a, WritableComparable b) {
        // NOTE: the debug System.out.println in the original ran once
        // per key comparison during the shuffle and was removed.
        Student studentA = (Student) a;
        Student studentB = (Student) b;
        // Keys with the same course belong to the same reduce group.
        return studentA.getCourse().compareTo(studentB.getCourse());
    }
}
普通的用来作为CSMR程序中的key的用户自定义对象:
package mapreduce.exercise.score;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import org.apache.hadoop.io.WritableComparable;
/**
*描述 : 普通的用来作为CSMR程序中的key的用户自定义对象
*
* 一定要实现WritableComparable接口
*/
/**
 * Composite key for the CSMR job: (course, score, name).
 *
 * Implements WritableComparable so Hadoop can serialize it and use it as
 * a map-output key. compareTo() defines the shuffle sort order: course,
 * then score descending. Returning 0 from compareTo() normally marks two
 * keys as "the same key" for grouping as well; since this job needs
 * per-course groups instead, CSGroupComparator overrides grouping to
 * compare the course only. (A custom grouping rule must always use a
 * leading prefix of the sort fields.)
 */
public class Student implements WritableComparable<Student> {

    private String name;
    private String course;
    private double score;

    /** Required no-arg constructor for Hadoop deserialization. */
    public Student() {
        super();
    }

    public Student(String name, String course, double score) {
        super();
        this.name = name;
        this.course = course;
        this.score = score;
    }

    public String getName() {
        return name;
    }

    public void setName(String name) {
        this.name = name;
    }

    public String getCourse() {
        return course;
    }

    public void setCourse(String course) {
        this.course = course;
    }

    public double getScore() {
        return score;
    }

    public void setScore(double score) {
        this.score = score;
    }

    @Override
    public String toString() {
        return name + "\t" + course + "\t" + score;
    }

    /** Serializes the fields in a fixed order; must match readFields(). */
    @Override
    public void write(DataOutput out) throws IOException {
        out.writeUTF(name);
        out.writeUTF(course);
        out.writeDouble(score);
    }

    /** Deserializes the fields in the same order write() produced them. */
    @Override
    public void readFields(DataInput in) throws IOException {
        this.name = in.readUTF();
        this.course = in.readUTF();
        this.score = in.readDouble();
    }

    /**
     * Sort order for the shuffle: first by course, then by score
     * descending (higher scores come first), so the first key of each
     * course group is the top student.
     */
    @Override
    public int compareTo(Student o) {
        // Same course <=> compareTo of the courses is 0.
        int byCourse = o.getCourse().compareTo(this.getCourse());
        if (byCourse != 0) {
            return byCourse;
        }
        // Same course: order by score descending. Double.compare replaces
        // the original subtraction idiom, which mishandles NaN and is a
        // well-known comparator anti-pattern.
        return Double.compare(o.getScore(), this.getScore());
    }
}