1、准备数据
科目,姓名,成绩
computer,huangxiaoming,85
computer,xuzheng,54
computer,huangbo,86
computer,liutao,85
computer,huanglei,99
computer,huangxiaoming,85
computer,xuzheng,54
computer,huangbo,86
computer,liujialing,85
computer,liuyifei,75
computer,huangdatou,48
computer,huangjiaju,88
computer,huangzitao,85
english,zhaobenshan,57
english,liuyifei,85
english,liuyifei,76
english,huangdatou,48
english,zhouqi,85
english,huangbo,85
english,huangxiaoming,96
english,huanglei,85
english,liujialing,75
algorithm,liuyifei,75
english,huangxiaoming,96
english,huanglei,85
english,liujialing,75
algorithm,liuyifei,75
algorithm,huanglei,76
algorithm,huangjiaju,85
algorithm,liutao,85
algorithm,huangzitao,81
math,wangbaoqiang,85
algorithm,42
algorithm,huangzitao,81
math,wangbaoqiang,85
math,huanglei,76
math,huangjiaju,85
math,liutao,48
math,huangjiaju,85
math,xuzheng,54
math,huangxiaoming,85
math,liujialing,85
math,huanglei,76
math,huangjiaju,85
math,liutao,48
以上所有的是数据,该数据每行有三个字段值,分别是course,name,score
2、需求分析1
(1)对以上的数据去重 分组
相同的数据 只取一个
思路1:
map端一行读一次
全部发给reduce
key:写死 “”
value:一整行内容
reduce端去重
将value进行放在set集合中
思路2:
运用mr 分组
相同的key分到一组
map端:
key:整条数据
value:Text | NullWritable
reduce端:
相同整条数据 分到一组
reduce(一组中的一个key,values,context){
key输出
}
3、实现1
package com.zc.hadoop.mapreduce.demo.studentscore;
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
/*
* 1、去重
*/
public class DuplicateRemoval {

	/*
	 * Mapper: emits the entire input line as the key so that identical
	 * lines land in the same reduce group after the shuffle. The value
	 * is an empty Text and carries no information.
	 */
	static class DuplicateRemovalMapper extends Mapper<LongWritable, Text, Text, Text>{
		Text mk = new Text();
		Text mv = new Text();

		@Override
		protected void map(LongWritable key, Text value, Context context)
				throws IOException, InterruptedException {
			// Key = whole line; duplicates collapse into a single group.
			mk.set(value);
			mv.set("");
			context.write(mk, mv);
		}
	}

	/*
	 * Reducer: each distinct line arrives exactly once as a key, so
	 * writing the key once per group performs the de-duplication.
	 */
	static class DuplicateRemovalReducer extends Reducer<Text, Text, Text, Text>{
		Text rv = new Text();

		@Override
		protected void reduce(Text key, Iterable<Text> values, Context context)
				throws IOException, InterruptedException {
			rv.set("");
			context.write(key, rv);
		}
	}

	/*
	 * Driver. args[0] = input path, args[1] = output path.
	 */
	public static void main(String[] args) throws Exception {
		if (args.length < 2) {
			System.err.println("Usage: DuplicateRemoval <input path> <output path>");
			System.exit(1);
		}
		// Load configuration and create the job
		Configuration conf = new Configuration();
		Job job = Job.getInstance(conf);
		// Main class for jar discovery
		job.setJarByClass(DuplicateRemoval.class);
		// Mapper and reducer classes
		job.setMapperClass(DuplicateRemovalMapper.class);
		job.setReducerClass(DuplicateRemovalReducer.class);
		// Mapper and reducer output key/value types
		job.setMapOutputKeyClass(Text.class);
		job.setMapOutputValueClass(Text.class);
		job.setOutputKeyClass(Text.class);
		job.setOutputValueClass(Text.class);
		// HDFS connection
		FileSystem fs = FileSystem.get(conf);
		// Input path must exist before the job is submitted
		Path inpath = new Path(args[0]);
		if (!fs.exists(inpath)) {
			// BUG FIX: previously the job was still submitted (without any
			// input/output path) when the input was missing; fail fast instead.
			System.out.println("指定输入文件路径不存在");
			System.exit(1);
		}
		FileInputFormat.addInputPath(job, inpath);
		// Output path: Hadoop refuses to overwrite, so delete a stale one
		Path outpath = new Path(args[1]);
		if (fs.exists(outpath)) {
			fs.delete(outpath, true);
		}
		FileOutputFormat.setOutputPath(job, outpath);
		System.exit(job.waitForCompletion(true) ? 0 : 1);
	}
}
3、需求分析2
每一个course的最高分,最低分,平均分 作业****
map key 分组
分组: 每一个 course
map端:
key:course Text
value:分数 IntWritable
reduce端:
相同course的所有数据分到一组
reduce(科目,所有分数,context){
循环遍历所有分数 求最大 最小 平均
}
输出
key: 科目 Text
value:最大值+最小值+平均值 Text
4、实现2
package com.zc.hadoop.mapreduce.demo.studentscore;
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import com.zc.hadoop.mapreduce.demo.studentscore.DuplicateRemoval.DuplicateRemovalMapper;
import com.zc.hadoop.mapreduce.demo.studentscore.DuplicateRemoval.DuplicateRemovalReducer;
/*
* 2、每一个course的最高分,最低分,平均分
*/
public class CourseMaxMinAndAvgScore {
/*
* mapper
*/
static class CourseMaxMinAndAvgScoreMapper extends Mapper<LongWritable, Text, Text, IntWritable>{
Text mk = new Text();
IntWritable mv = new IntWritable();
@Override
protected void map(LongWritable key, Text value, Context context)
throws IOException, InterruptedException {
// 拿到每一行数据进行分割
String[] info = value.toString().split(",");
// 去除不规则数据
if (info.length == 3) {
// [course,name,score]
// key --> course, value ---> score
mk.set(info[0].trim());
mv.set(Integer.parseInt(info[2].trim()));
context.write(mk, mv);
}
}
}
/*
* reducer
*/
static class CourseMaxMinAndAvgScoreReducer extends Reducer<Text, IntWritable, Text, Text>{
Text rk = new Text();
Text rv = new Text();
@Override
protected void reduce(Text key, Iterable<IntWritable> values,Context context)
throws IOException, InterruptedException {
// 按课程分组,成绩为迭代器
int max = 0, min = 0, count = 0, sum = 0;
for (IntWritable value : values) {
int score = value.get();
count++;
if (count == 1) {
max = min = score;
}
// 求最大值
if (max < score) {
max = score;
}
// 求最小值
if (min > score) {
min = score;
}
// 求和
sum += score;
}
// 求平均分
double avg = 1.0 * sum / count;
rk.set(key);
rv.set("max=" + max + "; min=" + min + "; avg=" + avg);
context.write(rk, rv);
}
}
public static void main(String[] args) throws Exception {
// 加载配置文件
Configuration conf = new Configuration();
// 启动一个 job
Job job = Job.getInstance(conf);
// 设置主类入口
job.setJarByClass(CourseMaxMinAndAvgScore.class);
// 指定 MR 中 mapper 和 reducer 类
job.setMapperClass(CourseMaxMinAndAvgScoreMapper.class);
job.setReducerClass(CourseMaxMinAndAvgScoreReducer.class);
// 指定 mapper 和 reducer 输出泛型
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(IntWritable.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(Text.class);
// 获取 HDFS 文件系统连接
FileSystem fs = FileSystem.get(conf);
// 指定测试文件路径
Path inpath = new Path(args[0]);
if (fs.exists(inpath)) {
FileInputFormat.addInputPath(job, inpath);
// 指定结果输出路径
Path outpath = new Path(args[1]);
if (fs.exists(outpath)) {
fs.delete(outpath, true);
}
FileOutputFormat.setOutputPath(job, outpath);
} else {
System.out.println("指定输入文件路径不存在");
}
System.exit(job.waitForCompletion(true) ? 0 : 1);
}
}
5、需求分析3
求该成绩表中相同科目、相同分数出现的次数,以及取得该分数的所有人
即:求相同科目相同分数的有多少个人,并且都是谁
返回结果的格式:
科目 分数 次数 该分数的人
例子:
computer 85 3 huangzitao,liujialing,huangxiaoming
分组: 科目 分数
map端:
key:科目+分数 Text
value:姓名 Text
reduce端:
接受的相同科目和分数的所有人的姓名
reduce(科目+分数,所有的姓名,context){
计数迭代器
迭代器姓名拼接
}
key:Text
value:人数+拼接人名 Text
6、实现3
package com.zc.hadoop.mapreduce.demo.studentscore;
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
/*
* 3、求相同科目相同分数的有多少个人 并且都是谁
*/
public class IdenticalCourseAndScorePeopleCount {

	/*
	 * Mapper: for each well-formed record "course,name,score" emits
	 * key = course + "\t" + score and value = student name, so that
	 * all students sharing a (course, score) pair reach one reducer.
	 */
	static class IdenticalCourseAndScorePeopleCountMapper extends Mapper<LongWritable, Text, Text, Text>{
		Text mk = new Text();
		Text mv = new Text();

		@Override
		protected void map(LongWritable key, Text value, Context context)
				throws IOException, InterruptedException {
			String[] fields = value.toString().split(",");
			// Guard clause: skip rows without exactly three fields
			if (fields.length != 3) {
				return;
			}
			// fields = [course, name, score]
			// Composite key: course + "\t" + score
			mk.set(fields[0].trim() + "\t" + fields[fields.length - 1].trim());
			// Value: student name
			mv.set(fields[1].trim());
			context.write(mk, mv);
		}
	}

	/*
	 * Reducer: counts how many students share the (course, score) key
	 * and joins their names with commas.
	 */
	static class IdenticalCourseAndScorePeopleCountReducer extends Reducer<Text, Text, Text, Text>{
		Text rv = new Text();

		@Override
		protected void reduce(Text key, Iterable<Text> values, Context context)
				throws IOException, InterruptedException {
			int people = 0;
			StringBuilder names = new StringBuilder();
			for (Text name : values) {
				// Prefix a separator before every name except the first
				if (people > 0) {
					names.append(",");
				}
				names.append(name.toString());
				people++;
			}
			// Output key: course + score
			// Output value: count + "\t" + joined names
			rv.set(people + "\t" + names);
			context.write(key, rv);
		}
	}

	/*
	 * Driver. args[0] = input path, args[1] = output path.
	 */
	public static void main(String[] args) throws Exception {
		// Load configuration and create the job
		Configuration conf = new Configuration();
		Job job = Job.getInstance(conf);
		// Main class for jar discovery
		job.setJarByClass(IdenticalCourseAndScorePeopleCount.class);
		// Mapper and reducer classes
		job.setMapperClass(IdenticalCourseAndScorePeopleCountMapper.class);
		job.setReducerClass(IdenticalCourseAndScorePeopleCountReducer.class);
		// Mapper and reducer output key/value types
		job.setMapOutputKeyClass(Text.class);
		job.setMapOutputValueClass(Text.class);
		job.setOutputKeyClass(Text.class);
		job.setOutputValueClass(Text.class);
		// HDFS connection and path setup
		FileSystem fileSystem = FileSystem.get(conf);
		Path input = new Path(args[0]);
		if (fileSystem.exists(input)) {
			FileInputFormat.addInputPath(job, input);
			// Remove a stale output directory before writing results
			Path output = new Path(args[1]);
			if (fileSystem.exists(output)) {
				fileSystem.delete(output, true);
			}
			FileOutputFormat.setOutputPath(job, output);
		} else {
			// NOTE(review): the job is still submitted below even when the
			// input path is missing — preserved as-is from the original.
			System.out.println("指定输入文件路径不存在");
		}
		System.exit(job.waitForCompletion(true) ? 0 : 1);
	}
}
(4)将上面的统计结果按照科目不同 输出到不同的文件中
7、学习内容
上节学习内容:MapReduce 程序的核心运行机制
下节学习内容:MapReduce 常见的编程场景 1