In Hadoop, sorting is the soul of MapReduce: both MapTask and ReduceTask sort their data by key. This is the default behavior of the MR framework, whether or not your business logic actually needs it.
The MapReduce framework mainly relies on two kinds of sorting: quicksort and a heap-based priority queue (PriorityQueue).
Map phase:
Data written from map() into the circular memory buffer is sorted with a quicksort variant tuned for the MR framework; the sort considers the partition first and the key second. When the buffer reaches 80% of its capacity, the data is spilled to disk as an IFile. After the map finishes, the spilled IFiles are merge-sorted (via a heap-based priority queue) into one large file, from which each reducer pulls its own partition.
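Both the buffer size and the 80% threshold are configurable. A minimal sketch, assuming the Hadoop 2.x+ property names; these lines would go in the job driver before the Job is created:

import org.apache.hadoop.conf.Configuration;

Configuration conf = new Configuration();
// Size of the map-side circular sort buffer, in MB (default 100).
conf.setInt("mapreduce.task.io.sort.mb", 200);
// Buffer fill ratio that triggers a spill to disk (default 0.80, the 80% mentioned above).
conf.setFloat("mapreduce.map.sort.spill.percent", 0.80f);
// Number of spill segments merged in one pass during the final map-side merge (default 10).
conf.setInt("mapreduce.task.io.sort.factor", 10);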
Reduce phase:
The data fetched from the mapper side is already partially sorted, so the Reduce Task only needs a single merge sort to make it globally ordered. For efficiency, Hadoop overlaps the sort and reduce phases: during the sort phase, the Reduce Task builds a min-heap over the in-memory and on-disk segments and keeps an iterator pointed at the heap root. Advancing the iterator is really just repeatedly adjusting the heap (take the root element, re-heapify, take the root again, ...), which hands records with equal keys to the reduce() function one after another. This way, sort and reduce run in parallel.
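Conceptually this is a k-way merge of sorted runs driven by a min-heap. The following is a standalone sketch in plain Java, an illustration of the take-root/re-heapify loop described above, not Hadoop's actual merger code:

import java.util.Arrays;
import java.util.Comparator;
import java.util.List;
import java.util.PriorityQueue;

public class KWayMergeSketch {
    public static void main(String[] args) {
        // Each inner list stands for one already-sorted map-output segment.
        List<List<Integer>> segments = Arrays.asList(
                Arrays.asList(1, 4, 9),
                Arrays.asList(2, 4, 7),
                Arrays.asList(3, 5, 8));
        // Min-heap of {segmentIndex, offset} pairs, ordered by the record they point at.
        PriorityQueue<int[]> heap =
                new PriorityQueue<>(Comparator.comparingInt((int[] e) -> segments.get(e[0]).get(e[1])));
        for (int i = 0; i < segments.size(); i++) {
            heap.add(new int[]{i, 0}); // seed the heap with the head of each segment
        }
        StringBuilder out = new StringBuilder();
        while (!heap.isEmpty()) {
            int[] top = heap.poll();               // take the root: the globally smallest record
            int seg = top[0], off = top[1];
            out.append(segments.get(seg).get(off)).append(' ');
            if (off + 1 < segments.get(seg).size()) {
                heap.add(new int[]{seg, off + 1}); // re-heapify with that segment's next record
            }
        }
        System.out.println(out.toString().trim()); // prints: 1 2 3 4 4 5 7 8 9
    }
}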
Grouped processing
A common scenario in data processing is to group table rows by some field and then pick the top few records within each group. Below we solve this grouped Top-N problem with MapReduce (the same problem can also be tackled with tools such as Hive).
Input data
The number of fields per line is not fixed: the first field is the course name (four courses in total: computer, math, english, algorithm), the second is the student name, and the remaining fields are the student's scores from each exam.
algorithm,liuyifei,75,85,62,48,54,96,15
computer,huangjiaju,85,75,86,85,85
english,liuyifei,76,95,86,74,68,74,48
english,huangdatou,48,58,67,86,15,33,85
algorithm,huanglei,76,95,86,74,68,74,48
algorithm,huangjiaju,85,75,86,85,85,74,86
computer,huangdatou,48,58,67,86,15,33,85
english,zhouqi,85,86,41,75,93,42,85,75,55,47,22
english,huangbo,85,42,96,38,55,47,22
algorithm,liutao,85,75,85,99,66
computer,huangzitao,85,86,41,75,93,42,85
math,wangbaoqiang,85,86,41,75,93,42,85
computer,liujialing,85,41,75,21,85,96,14,74,86
computer,liuyifei,75,85,62,48,54,96,15
computer,liutao,85,75,85,99,66,88,75,91
computer,huanglei,76,95,86,74,68,74,48
english,liujialing,75,85,62,48,54,96,15
math,huanglei,76,95,86,74,68,74,48
math,huangjiaju,85,75,86,85,85,74,86
math,liutao,48,58,67,86,15,33,85
english,huanglei,85,75,85,99,66,88,75,91
math,xuzheng,54,52,86,91,42,85,75
math,huangxiaoming,85,75,85,99,66,88,75,91
math,liujialing,85,86,41,75,93,42,85,75
english,huangxiaoming,85,86,41,75,93,42,85
Requirements:
1. For each course, count the number of students who took the exam and the course average score.
2. For each course, compute the average score of every student who took it, and write the results into separate files by course (one result file per course), sorted by average score from high to low, with scores kept to one decimal place.
3. For each course, find the students with the highest average scores: course, name, and average score.
Solution for requirement 1:
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.DoubleWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
public class CourseScoreMR1 {

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf);
        Job job = Job.getInstance(conf);
        job.setJarByClass(CourseScoreMR1.class);
        job.setMapperClass(CourseScoreMR1Mapper.class);
        job.setReducerClass(CourseScoreMR1Reducer.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(DoubleWritable.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);

        Path inputPath = new Path("E:\\bigdata\\cs\\input");
        Path outputPath = new Path("E:\\bigdata\\cs\\output_1");
        FileInputFormat.setInputPaths(job, inputPath);
        // Delete the output directory if it already exists, so the job can be rerun.
        if (fs.exists(outputPath)) {
            fs.delete(outputPath, true);
        }
        FileOutputFormat.setOutputPath(job, outputPath);

        boolean isDone = job.waitForCompletion(true);
        System.exit(isDone ? 0 : 1);
    }

    public static class CourseScoreMR1Mapper extends Mapper<LongWritable, Text, Text, DoubleWritable> {

        /**
         * Input fields: course, name, score...
         *
         * value == algorithm,huangzitao,85,86,41,75,93,42,85,75
         *
         * Output key   : course
         * Output value : avgScore (this student's average over all exams)
         *
         * For formatting numbers, see the NumberFormat API
         * (and SimpleDateFormat for dates).
         */
        Text outKey = new Text();
        DoubleWritable outValue = new DoubleWritable();

        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            String[] split = value.toString().split(",");
            String course = split[0];
            // Sum this student's exam scores (fields 2..n) and average them.
            int sum = 0;
            int count = 0;
            for (int i = 2; i < split.length; i++) {
                int tempScore = Integer.parseInt(split[i]);
                sum += tempScore;
                count++;
            }
            double avgScore = 1D * sum / count;

            outKey.set(course);
            outValue.set(avgScore);
            context.write(outKey, outValue);
        }
    }

    public static class CourseScoreMR1Reducer extends Reducer<Text, DoubleWritable, Text, Text> {

        Text outValue = new Text();

        /**
         * key    : course
         * values : the per-student average scores of that course, e.g. 98.7, 87.6
         */
        @Override
        protected void reduce(Text key, Iterable<DoubleWritable> values, Context context) throws IOException, InterruptedException {
            double sum = 0;
            int count = 0;
            for (DoubleWritable dw : values) {
                sum += dw.get();
                count++;
            }
            // count = number of students who took the exam; lastAvgScore = course average.
            double lastAvgScore = sum / count;

            outValue.set(count + "\t" + lastAvgScore);
            context.write(key, outValue);
        }
    }
}
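The two solutions below import com.ghgj.mr.exercise.pojo.CourseScore, a custom key type whose source is not shown in the original. For the jobs to behave as described, it must be a WritableComparable whose compareTo sorts by course and, within a course, by average score descending. A minimal sketch under those assumptions (field names match the getters/setters used below):

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import org.apache.hadoop.io.WritableComparable;

public class CourseScore implements WritableComparable<CourseScore> {

    private String course;
    private String name;
    private double score;

    public String getCourse() { return course; }
    public void setCourse(String course) { this.course = course; }
    public String getName() { return name; }
    public void setName(String name) { this.name = name; }
    public double getScore() { return score; }
    public void setScore(double score) { this.score = score; }

    @Override
    public void write(DataOutput out) throws IOException {
        out.writeUTF(course);
        out.writeUTF(name);
        out.writeDouble(score);
    }

    @Override
    public void readFields(DataInput in) throws IOException {
        course = in.readUTF();
        name = in.readUTF();
        score = in.readDouble();
    }

    @Override
    public int compareTo(CourseScore other) {
        // Sort by course first, then by score descending within the same course.
        int c = this.course.compareTo(other.course);
        if (c != 0) {
            return c;
        }
        return Double.compare(other.score, this.score);
    }

    @Override
    public String toString() {
        // One decimal place, as requirement 2 asks.
        return course + "\t" + name + "\t" + String.format("%.1f", score);
    }
}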
Solution for requirement 2:
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import com.ghgj.mr.exercise.pojo.CourseScore;
import com.ghgj.mr.exercise.ptn.CSPartitioner;
public class CourseScoreMR2 {

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf);
        Job job = Job.getInstance(conf);
        job.setJarByClass(CourseScoreMR2.class);
        job.setMapperClass(CourseScoreMR2Mapper.class);
        // No custom reducer is set: the default (identity) reducer simply writes out
        // the CourseScore keys, which arrive already sorted by compareTo.
        // job.setReducerClass(CourseScoreMR2Reducer.class);
        job.setMapOutputKeyClass(CourseScore.class);
        job.setMapOutputValueClass(NullWritable.class);
        // job.setOutputKeyClass(CourseScore.class);
        // job.setOutputValueClass(NullWritable.class);

        // One partition (and hence one result file) per course.
        job.setPartitionerClass(CSPartitioner.class);
        job.setNumReduceTasks(4);

        Path inputPath = new Path("E:\\bigdata\\cs\\input");
        Path outputPath = new Path("E:\\bigdata\\cs\\output_2");
        FileInputFormat.setInputPaths(job, inputPath);
        if (fs.exists(outputPath)) {
            fs.delete(outputPath, true);
        }
        FileOutputFormat.setOutputPath(job, outputPath);

        boolean isDone = job.waitForCompletion(true);
        System.exit(isDone ? 0 : 1);
    }

    public static class CourseScoreMR2Mapper extends Mapper<LongWritable, Text, CourseScore, NullWritable> {

        CourseScore cs = new CourseScore();

        /**
         * value = math,huangxiaoming,85,75,85,99,66,88,75,91
         */
        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            String[] split = value.toString().split(",");
            String course = split[0];
            String name = split[1];
            int sum = 0;
            int count = 0;
            for (int i = 2; i < split.length; i++) {
                int tempScore = Integer.parseInt(split[i]);
                sum += tempScore;
                count++;
            }
            double avgScore = 1D * sum / count;

            cs.setCourse(course);
            cs.setName(name);
            cs.setScore(avgScore);
            context.write(cs, NullWritable.get());
        }
    }

    public static class CourseScoreMR2Reducer extends Reducer<CourseScore, NullWritable, CourseScore, NullWritable> {
        // Unused; kept only because the original code declared it.
        @Override
        protected void reduce(CourseScore key, Iterable<NullWritable> values, Context context) throws IOException, InterruptedException {
        }
    }
}
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.Partitioner;
import com.ghgj.mr.exercise.pojo.CourseScore;
public class CSPartitioner extends Partitioner<CourseScore, NullWritable> {

    /**
     * Route each course to its own reduce task, so that with
     * job.setNumReduceTasks(4) every course ends up in its own output file.
     */
    @Override
    public int getPartition(CourseScore key, NullWritable value, int numPartitions) {
        String course = key.getCourse();
        if (course.equals("math")) {
            return 0;
        } else if (course.equals("english")) {
            return 1;
        } else if (course.equals("computer")) {
            return 2;
        } else {
            return 3;
        }
    }
}
Solution for requirement 3:
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import com.ghgj.mr.exercise.gc.CourseScoreGC;
import com.ghgj.mr.exercise.pojo.CourseScore;
public class CourseScoreMR3 {

    private static final int TOPN = 3;

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf);
        Job job = Job.getInstance(conf);
        job.setJarByClass(CourseScoreMR3.class);
        job.setMapperClass(CourseScoreMR2Mapper.class);
        job.setReducerClass(CourseScoreMR2Reducer.class);
        job.setMapOutputKeyClass(CourseScore.class);
        job.setMapOutputValueClass(NullWritable.class);
        job.setOutputKeyClass(CourseScore.class);
        job.setOutputValueClass(NullWritable.class);
        // job.setPartitionerClass(CSPartitioner.class);
        // job.setNumReduceTasks(4);

        // Specify the grouping rule: group records by course only,
        // so one reduce() call sees a whole course.
        job.setGroupingComparatorClass(CourseScoreGC.class);

        Path inputPath = new Path("E:\\bigdata\\cs\\input");
        Path outputPath = new Path("E:\\bigdata\\cs\\output_3_last");
        FileInputFormat.setInputPaths(job, inputPath);
        if (fs.exists(outputPath)) {
            fs.delete(outputPath, true);
        }
        FileOutputFormat.setOutputPath(job, outputPath);

        boolean isDone = job.waitForCompletion(true);
        System.exit(isDone ? 0 : 1);
    }

    public static class CourseScoreMR2Mapper extends Mapper<LongWritable, Text, CourseScore, NullWritable> {

        CourseScore cs = new CourseScore();

        /**
         * value = math,huangxiaoming,85,75,85,99,66,88,75,91
         */
        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            String[] split = value.toString().split(",");
            String course = split[0];
            String name = split[1];
            int sum = 0;
            int count = 0;
            for (int i = 2; i < split.length; i++) {
                int tempScore = Integer.parseInt(split[i]);
                sum += tempScore;
                count++;
            }
            double avgScore = 1D * sum / count;

            cs.setCourse(course);
            cs.setName(name);
            cs.setScore(avgScore);
            context.write(cs, NullWritable.get());
        }
    }

    public static class CourseScoreMR2Reducer extends Reducer<CourseScore, NullWritable, CourseScore, NullWritable> {

        /**
         * reduce() is called once per group of key-value pairs whose keys compare
         * equal under the grouping comparator, i.e. once per course here.
         *
         * The values iterator can only be traversed once. As it advances, the value
         * changes and so does the key object it belongs to: within one group, key
         * successively exposes each record of that course, in sorted order (score
         * descending). Writing the key during the first TOPN iterations therefore
         * emits the top N students of the course.
         */
        @Override
        protected void reduce(CourseScore key, Iterable<NullWritable> values, Context context) throws IOException, InterruptedException {
            int count = 0;
            for (NullWritable nvl : values) {
                // key now holds the record of the current iteration.
                context.write(key, nvl);
                if (++count == TOPN) {
                    break;
                }
            }
        }
    }
}
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;
import com.ghgj.mr.exercise.pojo.CourseScore;
/**
 * Grouping comparator: defines which map-output keys belong to the same reduce group.
 */
public class CourseScoreGC extends WritableComparator {

    public CourseScoreGC() {
        // true: instantiate CourseScore objects so compare() receives real keys.
        super(CourseScore.class, true);
    }

    /**
     * This method decides whether two keys from the MR program belong to the
     * same group. Its int return value follows the usual comparator contract:
     * when it returns 0, the two keys are considered equal and their values are
     * fed to the same reduce() call.
     *
     * For this requirement, the grouping rule is the course field only.
     */
    @Override
    public int compare(WritableComparable a, WritableComparable b) {
        CourseScore cs1 = (CourseScore) a;
        CourseScore cs2 = (CourseScore) b;
        return cs1.getCourse().compareTo(cs2.getCourse());
    }
}
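Design note: CourseScoreGC only draws the group boundaries (one group per course); the order in which records appear inside a group still comes from the key's compareTo (assumed above to sort by score descending within a course). Combining a fine-grained sort comparator with a coarser grouping comparator in this way is the standard MapReduce secondary-sort pattern behind grouped Top-N.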