案例
student A 06140412 05 08 102 110 106
student B 06140407 02 06 60 98 80
student C 06140404 10 07 98 31 63
student D 06140403 07 10 105 109 107
student E 06140406 03 03 57 87 92
student F 06140408 10 06 102 102 50
student G 06140402 03 07 54 61 64
student H 06140401 05 03 83 76 111
student I 06140409 05 10 70 56 91
student J 06140411 07 09 22 119 112
student K 06140410 02 01 45 65 80
student L 06140405 03 02 79 20 26
一、需求:计算每个学生的总成绩
GradeWritable类
package com.GradeCount.util;
import org.apache.hadoop.io.Writable;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
/*
* @author Administrator
* @version 1.0
*/
public class GradeWritable implements Writable {
    // BUG FIX: these were declared `static`, so every GradeWritable instance
    // shared one copy of the data. Hadoop deserializes values by calling
    // readFields() on reused objects, so static fields meant any read
    // clobbered every other instance's scores. They must be instance fields.
    private int chinese;  // Chinese score
    private int math;     // Math score
    private int english;  // English score
    private int sum;      // total = chinese + math + english

    /** No-arg constructor required by Hadoop for deserialization. */
    public GradeWritable() {}

    /** Builds a record and pre-computes the total score. */
    public GradeWritable(int chinese, int math, int english) {
        this.chinese = chinese;
        this.math = math;
        this.english = english;
        this.sum = chinese + math + english;
    }

    /** Serializes the fields; order must match readFields(). */
    @Override
    public void write(DataOutput dataOutput) throws IOException {
        dataOutput.writeInt(chinese);
        dataOutput.writeInt(math);
        dataOutput.writeInt(english);
        dataOutput.writeInt(sum);
    }

    /** Deserializes the fields in the same order as write(). */
    @Override
    public void readFields(DataInput dataInput) throws IOException {
        chinese = dataInput.readInt();
        math = dataInput.readInt();
        english = dataInput.readInt();
        sum = dataInput.readInt();
    }

    /** Tab-separated rendering used by TextOutputFormat. */
    @Override
    public String toString() {
        return chinese + "\t" + math + "\t" + english + "\t" + sum;
    }
}
GradeMapper类
package com.GradeCount.util;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import java.io.IOException;
/*
* @author Administrator
* @version
* @task 计算每个学生的总成绩
*/
public class GradeMapper extends Mapper<LongWritable, Text, Text, GradeWritable> {
    /** Minimum number of tab-separated columns a valid record needs. */
    private static final int MIN_FIELDS = 7;

    // Reused output key — avoids allocating a new Text per input record.
    private final Text studentName = new Text();

    /**
     * Emits (studentName, GradeWritable) for each input line.
     * Expected tab-separated layout (from the sample data):
     * [0]=name [1]=student id [2]=room id [3]=seat? (TODO confirm)
     * [4]=chinese [5]=math [6]=english — columns 1-3 are unused here.
     * ROBUSTNESS FIX: blank or short lines (e.g. a trailing newline) used to
     * kill the task with ArrayIndexOutOfBoundsException; they are now skipped.
     */
    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        String[] field = value.toString().split("\t");
        if (field.length < MIN_FIELDS) {
            return; // skip malformed line instead of crashing the mapper
        }
        int chinese = Integer.parseInt(field[4]);
        int math = Integer.parseInt(field[5]);
        int english = Integer.parseInt(field[6]);
        studentName.set(field[0]);
        context.write(studentName, new GradeWritable(chinese, math, english));
    }
}
GradeReducer类
package com.GradeCount.util;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
import java.io.IOException;
/*
* @author Administrator
* @version
* @task 计算每个学生的总成绩
*/
public class GradeReducer extends Reducer<Text, GradeWritable, Text, GradeWritable> {
    /**
     * Identity-style reduce: forwards every GradeWritable received for a
     * student name straight to the output. The total score was already
     * computed on the map side, in the GradeWritable constructor.
     */
    @Override
    protected void reduce(Text key, Iterable<GradeWritable> values, Context context)
            throws IOException, InterruptedException {
        for (GradeWritable record : values) {
            context.write(key, record);
        }
    }
}
GradeDriver类
package com.GradeCount.util;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import java.io.IOException;
/*
* @author Administrator
* @version
* @task 计算每个学生的总成绩
*/
public class GradeDriver {
    /**
     * Configures and submits the total-score job.
     * args[0] = input path, args[1] = output path (must not exist).
     */
    public static void main(String[] args)
            throws IOException, ClassNotFoundException, InterruptedException {
        Job job = Job.getInstance(new Configuration());

        // Locate the jar and wire up the map/reduce classes.
        job.setJarByClass(GradeDriver.class);
        job.setMapperClass(GradeMapper.class);
        job.setReducerClass(GradeReducer.class);

        // Map-side output types.
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(GradeWritable.class);
        // Final (reduce-side) output types.
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(GradeWritable.class);

        // I/O paths come from the command line.
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        // Block until the job finishes; exit 0 on success, 1 on failure.
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
二、优化:多个小文件情况下进行优化
GradeDriver类
// Small-file optimization: pack many small input files into fewer splits
job.setInputFormatClass(CombineTextInputFormat.class);
// Maximum split size: 15 MB (15728640 bytes)
CombineTextInputFormat.setMaxInputSplitSize(job,15728640);
// Minimum split size per node: 8 MB (8388608 bytes)
CombineTextInputFormat.setMinInputSplitSize(job,8388608);
三、需求:按学号正序排序,计算每个学生的总成绩
GradeSort类
package com.SortCount.util;
import org.apache.hadoop.io.WritableComparable;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
/*
* @author Administrator
* @version 1.0
* @task 按学号排序计算每个学生的总成绩
*/
public class GradeSort implements WritableComparable<GradeSort> {
    private String id;   // student id, zero-padded digits, e.g. "06140412"
    private int chinese; // Chinese score
    private int math;    // Math score
    private int english; // English score
    private int sum;     // total, computed in the parameterized constructor

    /** No-arg constructor required by Hadoop for deserialization. */
    public GradeSort() {}

    /** Builds a key and pre-computes the total score. */
    public GradeSort(String id, int chinese, int math, int english) {
        this.id = id;
        this.chinese = chinese;
        this.math = math;
        this.english = english;
        this.sum = chinese + math + english;
    }

    public String getId() {
        return id;
    }

    public void setId(String id) {
        this.id = id;
    }

    public int getChinese() {
        return chinese;
    }

    public void setChinese(int chinese) {
        this.chinese = chinese;
    }

    public int getMath() {
        return math;
    }

    public void setMath(int math) {
        this.math = math;
    }

    public int getEnglish() {
        return english;
    }

    public void setEnglish(int english) {
        this.english = english;
    }

    public int getSum() {
        return sum;
    }

    public void setSum(int sum) {
        this.sum = sum;
    }

    /** Tab-separated rendering used by TextOutputFormat. */
    @Override
    public String toString() {
        return id + "\t" + chinese + "\t" + math + "\t" + english + "\t" + sum;
    }

    /**
     * Global sort order for the shuffle: ascending by numeric student id.
     * Uses Integer.compare instead of a hand-written >/</== ladder — same
     * ordering, less code, and no risk of breaking the compareTo contract.
     */
    @Override
    public int compareTo(GradeSort other) {
        return Integer.compare(Integer.parseInt(this.id), Integer.parseInt(other.id));
    }

    /** Serializes the fields; order must match readFields(). */
    @Override
    public void write(DataOutput dataOutput) throws IOException {
        dataOutput.writeUTF(id);
        dataOutput.writeInt(chinese);
        dataOutput.writeInt(math);
        dataOutput.writeInt(english);
        dataOutput.writeInt(sum);
    }

    /** Deserializes the fields in the same order as write(). */
    @Override
    public void readFields(DataInput dataInput) throws IOException {
        id = dataInput.readUTF();
        chinese = dataInput.readInt();
        math = dataInput.readInt();
        english = dataInput.readInt();
        sum = dataInput.readInt();
    }
}
SortMapper类
package com.SortCount.util;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import java.io.IOException;
/*
* @author Administrator
* @version 1.0
* @task 按学号排序计算每个学生的总成绩
*/
public class SortMapper extends Mapper<LongWritable, Text, GradeSort, Text> {
    /**
     * Emits (GradeSort key, studentName) so the shuffle sorts records by
     * student id (GradeSort.compareTo). Tab-separated input layout:
     * [0]=name [1]=student id [4]=chinese [5]=math [6]=english.
     */
    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        String[] columns = value.toString().split("\t");
        GradeSort sortKey = new GradeSort(
                columns[1],                       // student id
                Integer.parseInt(columns[4]),     // chinese
                Integer.parseInt(columns[5]),     // math
                Integer.parseInt(columns[6]));    // english
        context.write(sortKey, new Text(columns[0]));
    }
}
SortReducer类
package com.SortCount.util;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
import java.io.IOException;
/*
* @author Administrator
* @version 1.0
* @task 按学号排序计算每个学生的总成绩
*/
public class SortReducer extends Reducer<GradeSort, Text, Text, GradeSort> {
    /**
     * Swaps key and value so the output line starts with the student name.
     * ROBUSTNESS FIX: the original wrote only values.iterator().next(),
     * which silently drops names when two records group under an equal key.
     * Each key normally carries exactly one name, so for well-formed input
     * the output is unchanged; for duplicates nothing is lost anymore.
     */
    @Override
    protected void reduce(GradeSort key, Iterable<Text> values, Context context)
            throws IOException, InterruptedException {
        for (Text name : values) {
            context.write(name, key);
        }
    }
}
SortDriver类
import java.io.IOException;
/*
* @author Administrator
* @version 1.0
* @task 按学号排序计算每个学生的总成绩
*/
public class SortDriver {
    /**
     * Configures and submits the sort-by-student-id job.
     * args[0] = input path, args[1] = output path (must not exist).
     */
    public static void main(String[] args)
            throws IOException, ClassNotFoundException, InterruptedException {
        Job job = Job.getInstance(new Configuration());

        // Locate the jar and wire up the map/reduce classes.
        job.setJarByClass(SortDriver.class);
        job.setMapperClass(SortMapper.class);
        job.setReducerClass(SortReducer.class);

        // Map-side output types: the composite key drives the sort.
        job.setMapOutputKeyClass(GradeSort.class);
        job.setMapOutputValueClass(Text.class);
        // Final (reduce-side) output types.
        job.setOutputKeyClass(GradeSort.class);
        job.setOutputValueClass(Text.class);

        // I/O paths come from the command line.
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        // Block until the job finishes; exit 0 on success, 1 on failure.
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
四、需求:按学号正序排序,按考场号分区,计算每个学生的总成绩
GradeSort类
package com.PartitionSort.util;
import org.apache.hadoop.io.WritableComparable;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
/*
* @author Administrator
* @version 1.0
* @task 按学号正序排序,并根据考场进行分区,计算每个学生的总成绩
*/
public class GradeSort implements WritableComparable<GradeSort> {
    private String id;   // student id, zero-padded digits, e.g. "06140412"
    private int roomID;  // exam-room number, used by RoomIDPartition
    private int chinese; // Chinese score
    private int math;    // Math score
    private int english; // English score
    private int sum;     // total, computed in the parameterized constructor

    /** No-arg constructor required by Hadoop for deserialization. */
    public GradeSort() {}

    /** Builds a key and pre-computes the total score. */
    public GradeSort(String id, int roomID, int chinese, int math, int english) {
        this.id = id;
        this.roomID = roomID;
        this.chinese = chinese;
        this.math = math;
        this.english = english;
        this.sum = chinese + math + english;
    }

    public String getId() {
        return id;
    }

    public void setId(String id) {
        this.id = id;
    }

    public int getRoomID() {
        return roomID;
    }

    public void setRoomID(int roomID) {
        this.roomID = roomID;
    }

    public int getChinese() {
        return chinese;
    }

    public void setChinese(int chinese) {
        this.chinese = chinese;
    }

    public int getMath() {
        return math;
    }

    public void setMath(int math) {
        this.math = math;
    }

    public int getEnglish() {
        return english;
    }

    public void setEnglish(int english) {
        this.english = english;
    }

    public int getSum() {
        return sum;
    }

    public void setSum(int sum) {
        this.sum = sum;
    }

    /**
     * Ascending order by numeric student id.
     * BUG FIX: the original `a > b ? 1 : -1` never returned 0, so a key
     * compared unequal to itself — violating the compareTo contract and
     * risking inconsistent sort/merge behavior. Integer.compare restores it.
     */
    @Override
    public int compareTo(GradeSort other) {
        return Integer.compare(Integer.parseInt(this.id), Integer.parseInt(other.id));
    }

    /** Serializes the fields; order must match readFields(). */
    @Override
    public void write(DataOutput dataOutput) throws IOException {
        dataOutput.writeUTF(id);
        dataOutput.writeInt(roomID);
        dataOutput.writeInt(chinese);
        dataOutput.writeInt(math);
        dataOutput.writeInt(english);
        dataOutput.writeInt(sum);
    }

    /** Deserializes the fields in the same order as write(). */
    @Override
    public void readFields(DataInput dataInput) throws IOException {
        id = dataInput.readUTF();
        roomID = dataInput.readInt();
        chinese = dataInput.readInt();
        math = dataInput.readInt();
        english = dataInput.readInt();
        sum = dataInput.readInt();
    }

    /** Tab-separated rendering used by TextOutputFormat. */
    @Override
    public String toString() {
        return id + "\t" + roomID + "\t" + chinese + "\t" + math + "\t" + english + "\t" + sum;
    }
}
RoomIDPartition类
package com.PartitionSort.util;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.Partitioner;
/*
* @author Administrator
* @version 1.0
* @task 按学号正序排序,并根据考场进行分区,计算每个学生的总成绩
*/
public class RoomIDPartition extends Partitioner<GradeSort, NullWritable> {
    /**
     * Routes each record to a reducer by exam-room id. Rooms 2, 3, 5 and 7
     * get dedicated partitions 0-3; every other room falls into partition 4.
     * Requires exactly 5 reduce tasks (job.setNumReduceTasks(5)).
     */
    @Override
    public int getPartition(GradeSort gradeSort, NullWritable nullWritable, int i) {
        switch (gradeSort.getRoomID()) {
            case 2:
                return 0;
            case 3:
                return 1;
            case 5:
                return 2;
            case 7:
                return 3;
            default:
                return 4;
        }
    }
}
PartitionMapper类
package com.PartitionSort.util;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import java.io.IOException;
/*
* @author Administrator
* @version 1.0
* @task 按学号正序排序,并根据考场进行分区,计算每个学生的总成绩
*/
public class PartitionMapper extends Mapper<LongWritable, Text, GradeSort, NullWritable> {
    /**
     * Emits the full GradeSort record as the key (value is a NullWritable
     * placeholder) so the shuffle can both sort by student id and partition
     * by exam room. Tab-separated input layout:
     * [1]=student id [2]=room id [4]=chinese [5]=math [6]=english.
     */
    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        String[] columns = value.toString().split("\t");
        GradeSort record = new GradeSort(
                columns[1],                       // student id
                Integer.parseInt(columns[2]),     // exam-room id
                Integer.parseInt(columns[4]),     // chinese
                Integer.parseInt(columns[5]),     // math
                Integer.parseInt(columns[6]));    // english
        context.write(record, NullWritable.get());
    }
}
PartitionReducer类
package com.PartitionSort.util;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.Reducer;
import java.io.IOException;
/*
* @author Administrator
* @version 1.0
* @task 按学号正序排序,并根据考场进行分区,计算每个学生的总成绩
*/
public class PartitionReducer extends Reducer<GradeSort, NullWritable, GradeSort, NullWritable> {
    /**
     * Emits each key once. The values are NullWritable placeholders
     * (NullWritable is a singleton), so writing NullWritable.get() is
     * equivalent to writing the first element of the iterable.
     */
    @Override
    protected void reduce(GradeSort key, Iterable<NullWritable> values, Context context)
            throws IOException, InterruptedException {
        context.write(key, NullWritable.get());
    }
}
PartitionDriver类
package com.PartitionSort.util;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import java.io.IOException;
/*
* @author Administrator
* @version 1.0
* @task 按学号正序排序,并根据考场进行分区,计算每个学生的总成绩
*/
public class PartitionDriver {
    /**
     * Configures and submits the sort-and-partition job: sorted by student
     * id, partitioned into 5 outputs by exam room.
     * args[0] = input path, args[1] = output path (must not exist).
     */
    public static void main(String[] args)
            throws IOException, ClassNotFoundException, InterruptedException {
        Job job = Job.getInstance(new Configuration());

        // Locate the jar and wire up the map/reduce classes.
        job.setJarByClass(PartitionDriver.class);
        job.setMapperClass(PartitionMapper.class);
        job.setReducerClass(PartitionReducer.class);

        // Map-side output types: the composite key drives sort + partition.
        job.setMapOutputKeyClass(GradeSort.class);
        job.setMapOutputValueClass(NullWritable.class);
        // Final (reduce-side) output types.
        job.setOutputKeyClass(GradeSort.class);
        job.setOutputValueClass(NullWritable.class);

        // Custom partitioner needs one reduce task per partition it returns.
        job.setPartitionerClass(RoomIDPartition.class);
        job.setNumReduceTasks(5);

        // I/O paths come from the command line.
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        // Block until the job finishes; exit 0 on success, 1 on failure.
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
五、需求:按考场号正序排序,按总成绩倒序排序,根据考场号进行分区,计算每个考场里成绩最高的学生
RoomIDSort类
package com.SecondSort.util;
import org.apache.hadoop.io.WritableComparable;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
/*
* @author Administrator
* @version 1.0
* @task 按考场号正序排序,按总成绩倒序排序,根据考场号进行分区,计算每个考场里成绩最高的学生
*/
public class RoomIDSort implements WritableComparable<RoomIDSort> {
    private String id;   // student id, zero-padded digits, e.g. "06140412"
    private int roomID;  // exam-room number: primary sort key and partition key
    private int chinese; // Chinese score
    private int math;    // Math score
    private int english; // English score
    private int sum;     // total, computed in the parameterized constructor

    /** No-arg constructor required by Hadoop for deserialization. */
    public RoomIDSort() {}

    /** Builds a key and pre-computes the total score. */
    public RoomIDSort(String id, int roomID, int chinese, int math, int english) {
        this.id = id;
        this.roomID = roomID;
        this.chinese = chinese;
        this.math = math;
        this.english = english;
        this.sum = chinese + math + english;
    }

    public String getId() {
        return id;
    }

    public void setId(String id) {
        this.id = id;
    }

    public int getRoomID() {
        return roomID;
    }

    public void setRoomID(int roomID) {
        this.roomID = roomID;
    }

    public int getChinese() {
        return chinese;
    }

    public void setChinese(int chinese) {
        this.chinese = chinese;
    }

    public int getMath() {
        return math;
    }

    public void setMath(int math) {
        this.math = math;
    }

    public int getEnglish() {
        return english;
    }

    public void setEnglish(int english) {
        this.english = english;
    }

    public int getSum() {
        return sum;
    }

    public void setSum(int sum) {
        this.sum = sum;
    }

    /**
     * Secondary sort: exam room ascending, then total score descending, so
     * the first key of each room group is the room's top student.
     * BUG FIX: the original tie-break `sum > other.sum ? -1 : 1` never
     * returned 0 for equal sums, so a key compared unequal to itself —
     * violating the compareTo contract. Integer.compare restores it.
     */
    @Override
    public int compareTo(RoomIDSort other) {
        int byRoom = Integer.compare(this.roomID, other.roomID);
        if (byRoom != 0) {
            return byRoom;
        }
        // Arguments swapped on purpose: higher totals sort first.
        return Integer.compare(other.sum, this.sum);
    }

    /** Serializes the fields; order must match readFields(). */
    @Override
    public void write(DataOutput dataOutput) throws IOException {
        dataOutput.writeUTF(id);
        dataOutput.writeInt(roomID);
        dataOutput.writeInt(chinese);
        dataOutput.writeInt(math);
        dataOutput.writeInt(english);
        dataOutput.writeInt(sum);
    }

    /** Deserializes the fields in the same order as write(). */
    @Override
    public void readFields(DataInput dataInput) throws IOException {
        id = dataInput.readUTF();
        roomID = dataInput.readInt();
        chinese = dataInput.readInt();
        math = dataInput.readInt();
        english = dataInput.readInt();
        sum = dataInput.readInt();
    }

    /** Tab-separated rendering used by TextOutputFormat. */
    @Override
    public String toString() {
        return id + "\t" + roomID + "\t" + chinese + "\t" + math + "\t" + english + "\t" + sum;
    }
}
RoomIDPartition类
package com.SecondSort.util;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.Partitioner;
/*
* @author Administrator
* @version 1.0
* @task 按考场号正序排序,按总成绩倒序排序,根据考场号进行分区,计算每个考场里成绩最高的学生
*/
public class RoomIDPartition extends Partitioner<RoomIDSort, NullWritable> {
@Override
public int getPartition(RoomIDSort roomIDSort, NullWritable nullWritable, int i) {
if (roomIDSort.getRoomID() == 2){
return 0;
}else if (roomIDSort.getRoomID() == 3){
return 1;
}else if (roomIDSort.getRoomID() == 5){
return 2;
}else if (roomIDSort.getRoomID() == 7){
return 3;
}else {
return 4;
}
// return (roomIDSort.getRoomID() & Integer.MAX_VALUE) % i;
}
}
SumSort类
package com.SecondSort.util;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;
/*
* @author Administrator
* @version 1.0
* @task 按考场号正序排序,按总成绩倒序排序,根据考场号进行分区,计算每个考场里成绩最高的学生
*/
public class SumSort extends WritableComparator {
    /** Registers RoomIDSort and tells WritableComparator to instantiate keys. */
    protected SumSort() {
        super(RoomIDSort.class, true);
    }

    /**
     * Grouping comparator for the reduce phase: two keys fall into the same
     * reduce() call iff they share an exam-room id. Within a group the keys
     * arrive ordered by total score (see RoomIDSort.compareTo), so the key
     * handed to reduce() is that room's top student.
     */
    @Override
    public int compare(WritableComparable a, WritableComparable b) {
        RoomIDSort left = (RoomIDSort) a;
        RoomIDSort right = (RoomIDSort) b;
        return Integer.compare(left.getRoomID(), right.getRoomID());
    }
}
SecondMapper类
package com.SecondSort.util;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import java.io.IOException;
/*
* @author Administrator
* @version 1.0
* @task 按考场号正序排序,按总成绩倒序排序,根据考场号进行分区,计算每个考场号成绩最高的学生
*/
public class SecondMapper extends Mapper<LongWritable, Text, RoomIDSort, NullWritable> {
    /**
     * Emits the full RoomIDSort record as the key (value is a NullWritable
     * placeholder) so the shuffle performs the secondary sort: room
     * ascending, total score descending. Tab-separated input layout:
     * [1]=student id [2]=room id [4]=chinese [5]=math [6]=english.
     */
    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        String[] columns = value.toString().split("\t");
        RoomIDSort record = new RoomIDSort(
                columns[1],                       // student id
                Integer.parseInt(columns[2]),     // exam-room id
                Integer.parseInt(columns[4]),     // chinese
                Integer.parseInt(columns[5]),     // math
                Integer.parseInt(columns[6]));    // english
        context.write(record, NullWritable.get());
    }
}
SecondReducer类
package com.SecondSort.util;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.Reducer;
import java.io.IOException;
/*
* @author Administrator
* @version 1.0
* @task 按考场号正序排序,按总成绩倒序排序,根据考场号进行分区,计算每个考场里成绩最高的学生
*/
public class SecondReducer extends Reducer<RoomIDSort, NullWritable,RoomIDSort,NullWritable> {
// reduce() runs once per exam-room group (grouping is by roomID via the
// SumSort comparator set in the driver); the incoming key is the group's
// first key in sort order, i.e. the student with the highest total in that
// room, so writing only the key emits exactly the per-room top student.
@Override
protected void reduce(RoomIDSort key, Iterable<NullWritable> values, Context context) throws IOException, InterruptedException {
context.write(key,NullWritable.get());
}
}
SecondDriver类
package com.SecondSort.util;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import java.io.IOException;
/*
* @author Administrator
* @version 1.0
* @task 按考场号正序排序,按总成绩倒序排序,根据考场号进行分区,计算每个考场里成绩最高的学生
*/
public class SecondDriver {
    /**
     * Configures and submits the secondary-sort job: per exam room, emit the
     * student with the highest total score.
     * args[0] = input path, args[1] = output path (must not exist).
     */
    public static void main(String[] args)
            throws IOException, ClassNotFoundException, InterruptedException {
        Job job = Job.getInstance(new Configuration());

        // Locate the jar and wire up the map/reduce classes.
        job.setJarByClass(SecondDriver.class);
        job.setMapperClass(SecondMapper.class);
        job.setReducerClass(SecondReducer.class);

        // Map-side output types: the composite key drives the secondary sort.
        job.setMapOutputKeyClass(RoomIDSort.class);
        job.setMapOutputValueClass(NullWritable.class);
        // Final (reduce-side) output types.
        job.setOutputKeyClass(RoomIDSort.class);
        job.setOutputValueClass(NullWritable.class);

        // Group keys by exam room so each reduce() call sees one room.
        job.setGroupingComparatorClass(SumSort.class);
        // Custom partitioner needs one reduce task per partition it returns.
        job.setPartitionerClass(RoomIDPartition.class);
        job.setNumReduceTasks(5);

        // I/O paths come from the command line.
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        // Block until the job finishes; exit 0 on success, 1 on failure.
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}