求每个课程(course)中分数最高的学生的信息 Student(course, score, name):
package mapreduce.exercise.score;
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
/**
* 描述: 求每个班级中的分数最高的学生的信息Student(course,score,name)
*
*/
/**
 * MapReduce driver: for each course, output the student with the highest
 * average score.
 *
 * Input lines have the shape: course,name,score1,score2,...
 *
 * The Student key's compareTo() sorts by course and then by score
 * descending; the custom grouping comparator groups reduce input by
 * course only, so the first key of every reduce group is that course's
 * top student.
 */
public class CSMR {

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        conf.set("fs.defaultFS", "hdfs://hadoop01:9000");
        System.setProperty("HADOOP_USER_NAME", "hadoop");

        Job job = Job.getInstance(conf);
        job.setJarByClass(CSMR.class);
        job.setMapperClass(CSMRMapper.class);
        job.setReducerClass(CSMRReducer.class);
        job.setMapOutputKeyClass(Student.class);
        job.setMapOutputValueClass(NullWritable.class);
        job.setOutputKeyClass(Student.class);
        job.setOutputValueClass(NullWritable.class);

        // Group reduce input by course only, overriding the default
        // "grouping == sort order" behavior.
        job.setGroupingComparatorClass(CSGroupComparator.class);

        Path inputPath = new Path(args[0]);
        Path outputPath = new Path(args[1]);
        FileInputFormat.setInputPaths(job, inputPath);

        // Remove a stale output directory so the job can be re-run.
        FileSystem fs = FileSystem.get(conf);
        if (fs.exists(outputPath)) {
            fs.delete(outputPath, true);
        }
        FileOutputFormat.setOutputPath(job, outputPath);

        boolean isDone = job.waitForCompletion(true);
        System.exit(isDone ? 0 : 1);
    }

    /**
     * Mapper: parses one CSV line and emits a Student key carrying the
     * course, the student's name and the average of all listed scores.
     *
     * Example input value: computer,huangxiaoming,85,86,41,75,93,42,85
     */
    private static class CSMRMapper extends Mapper<LongWritable, Text, Student, NullWritable> {

        // Reused across map() calls; Hadoop serializes the key on write,
        // so mutating the same instance is safe here.
        private Student student = new Student();

        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            String[] split = value.toString().split(",");
            String course = split[0];
            String name = split[1];

            // Average every score field after course and name.
            double sumScore = 0;
            int countScore = 0;
            for (int i = 2; i < split.length; i++) {
                sumScore += Integer.parseInt(split[i]);
                countScore++;
            }
            double avgScore = sumScore / countScore;

            student.setCourse(course);
            student.setName(name);
            student.setScore(avgScore);
            context.write(student, NullWritable.get());
        }
    }

    /**
     * Reducer: each reduce group holds all students of one course,
     * already sorted by score descending, so the first key in the group
     * is the course's highest-scoring student.
     */
    private static class CSMRReducer extends Reducer<Student, NullWritable, Student, NullWritable> {

        @Override
        protected void reduce(Student key, Iterable<NullWritable> values, Context context)
                throws IOException, InterruptedException {
            // Emit only the first (highest-scoring) record of the group.
            // BUG FIX: the original loop emitted the top TWO records
            // (count == top2), contradicting both the requirement and
            // its own "only output the highest score" comment.
            for (NullWritable nvl : values) {
                context.write(key, NullWritable.get());
                break;
            }
        }
    }
}
默认情况下排序规则就是分组规则;当两者需要不一致时,就要自定义分组规则:
package mapreduce.exercise.score;
/**
* 描述: WritableComparator抽象类的作用 : 自定义分组规则
*
*/
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;
/**
 * Custom grouping comparator for the CSMR job.
 *
 * Groups reduce input keys by course only, so all Student keys of the
 * same course reach a single reduce() call even though the shuffle sort
 * order also takes the score into account.
 */
public class CSGroupComparator extends WritableComparator {

    public CSGroupComparator() {
        // true: create Student instances so compare() receives fully
        // deserialized keys rather than raw bytes.
        super(Student.class, true);
    }

    @Override
    public int compare(WritableComparable a, WritableComparable b) {
        // NOTE: the debug System.out.println in the original ran once
        // per key comparison during the shuffle and was removed.
        Student studentA = (Student) a;
        Student studentB = (Student) b;
        // Keys with the same course belong to the same reduce group.
        return studentA.getCourse().compareTo(studentB.getCourse());
    }
}
普通的用来作为CSMR程序中的key的用户自定义对象:
package mapreduce.exercise.score;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import org.apache.hadoop.io.WritableComparable;
/**
*描述 : 普通的用来作为CSMR程序中的key的用户自定义对象
*
* 一定要实现WritableComparable接口
*/
/**
 * Composite key for the CSMR job: (course, score, name).
 *
 * Implements WritableComparable so Hadoop can serialize it and use it as
 * a map-output key. compareTo() defines the shuffle sort order: course,
 * then score descending. Returning 0 from compareTo() normally marks two
 * keys as "the same key" for grouping as well; since this job needs
 * per-course groups instead, CSGroupComparator overrides grouping to
 * compare the course only. (A custom grouping rule must always use a
 * leading prefix of the sort fields.)
 */
public class Student implements WritableComparable<Student> {

    private String name;
    private String course;
    private double score;

    /** Required no-arg constructor for Hadoop deserialization. */
    public Student() {
        super();
    }

    public Student(String name, String course, double score) {
        super();
        this.name = name;
        this.course = course;
        this.score = score;
    }

    public String getName() {
        return name;
    }

    public void setName(String name) {
        this.name = name;
    }

    public String getCourse() {
        return course;
    }

    public void setCourse(String course) {
        this.course = course;
    }

    public double getScore() {
        return score;
    }

    public void setScore(double score) {
        this.score = score;
    }

    @Override
    public String toString() {
        return name + "\t" + course + "\t" + score;
    }

    /** Serializes the fields in a fixed order; must match readFields(). */
    @Override
    public void write(DataOutput out) throws IOException {
        out.writeUTF(name);
        out.writeUTF(course);
        out.writeDouble(score);
    }

    /** Deserializes the fields in the same order write() produced them. */
    @Override
    public void readFields(DataInput in) throws IOException {
        this.name = in.readUTF();
        this.course = in.readUTF();
        this.score = in.readDouble();
    }

    /**
     * Sort order for the shuffle: first by course, then by score
     * descending (higher scores come first), so the first key of each
     * course group is the top student.
     */
    @Override
    public int compareTo(Student o) {
        // Same course <=> compareTo of the courses is 0.
        int byCourse = o.getCourse().compareTo(this.getCourse());
        if (byCourse != 0) {
            return byCourse;
        }
        // Same course: order by score descending. Double.compare replaces
        // the original subtraction idiom, which mishandles NaN and is a
        // well-known comparator anti-pattern.
        return Double.compare(o.getScore(), this.getScore());
    }
}