1、准备数据
科目,姓名,成绩
computer,huangxiaoming,85
computer,xuzheng,54
computer,huangbo,86
computer,liutao,85
computer,huanglei,99
computer,huangxiaoming,85
computer,xuzheng,54
computer,huangbo,86
computer,liujialing,85
computer,liuyifei,75
computer,huangdatou,48
computer,huangjiaju,88
computer,huangzitao,85
english,zhaobenshan,57
english,liuyifei,85
english,liuyifei,76
english,huangdatou,48
english,zhouqi,85
english,huangbo,85
english,huangxiaoming,96
english,huanglei,85
english,liujialing,75
algorithm,liuyifei,75
english,huangxiaoming,96
english,huanglei,85
english,liujialing,75
algorithm,liuyifei,75
algorithm,huanglei,76
algorithm,huangjiaju,85
algorithm,liutao,85
algorithm,huangzitao,81
math,wangbaoqiang,85
algorithm,42
algorithm,huangzitao,81
math,wangbaoqiang,85
math,huanglei,76
math,huangjiaju,85
math,liutao,48
math,huangjiaju,85
math,xuzheng,54
math,huangxiaoming,85
math,liujialing,85
math,huanglei,76
math,huangjiaju,85
math,liutao,48
以上所有的是数据,该数据每行有三个字段值,分别是course,name,score
2、需求分析1
(1)对以上的数据去重 分组
相同的数据 只取一个
思路1:
map端一行读一次
全部发给reduce
key:写死 “”
value:一整行内容
reduce端去重
将value进行放在set集合中
思路2:
运用mr 分组
相同的key分到一组
map端:
key:整条数据
value:Text | NullWritable
reduce端:
相同整条数据 分到一组
reduce(一组中的一个key,values,context){
key输出
}
3、实现1
package com.zc.hadoop.mapreduce.demo.studentscore;
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
/*
* 1、去重
*/
public class DuplicateRemoval {

	/*
	 * Mapper: emits the entire input line as the key so that identical
	 * lines land in the same reduce group after the shuffle. The value
	 * is an empty Text and carries no information.
	 */
	static class DuplicateRemovalMapper extends Mapper<LongWritable, Text, Text, Text>{
		Text mk = new Text();
		Text mv = new Text();

		@Override
		protected void map(LongWritable key, Text value, Context context)
				throws IOException, InterruptedException {
			// Key = whole line; duplicates collapse into a single group.
			mk.set(value);
			mv.set("");
			context.write(mk, mv);
		}
	}

	/*
	 * Reducer: each distinct line arrives exactly once as a key, so
	 * writing the key once per group performs the de-duplication.
	 */
	static class DuplicateRemovalReducer extends Reducer<Text, Text, Text, Text>{
		Text rv = new Text();

		@Override
		protected void reduce(Text key, Iterable<Text> values, Context context)
				throws IOException, InterruptedException {
			rv.set("");
			context.write(key, rv);
		}
	}

	/*
	 * Driver. args[0] = input path, args[1] = output path.
	 */
	public static void main(String[] args) throws Exception {
		if (args.length < 2) {
			System.err.println("Usage: DuplicateRemoval <input path> <output path>");
			System.exit(1);
		}
		// Load configuration and create the job
		Configuration conf = new Configuration();
		Job job = Job.getInstance(conf);
		// Main class for jar discovery
		job.setJarByClass(DuplicateRemoval.class);
		// Mapper and reducer classes
		job.setMapperClass(DuplicateRemovalMapper.class);
		job.setReducerClass(DuplicateRemovalReducer.class);
		// Mapper and reducer output key/value types
		job.setMapOutputKeyClass(Text.class);
		job.setMapOutputValueClass(Text.class);
		job.setOutputKeyClass(Text.class);
		job.setOutputValueClass(Text.class);
		// HDFS connection
		FileSystem fs = FileSystem.get(conf);
		// Input path must exist before the job is submitted
		Path inpath = new Path(args[0]);
		if (!fs.exists(inpath)) {
			// BUG FIX: previously the job was still submitted (without any
			// input/output path) when the input was missing; fail fast instead.
			System.out.println("指定输入文件路径不存在");
			System.exit(1);
		}
		FileInputFormat.addInputPath(job, inpath);
		// Output path: Hadoop refuses to overwrite, so delete a stale one
		Path outpath = new Path(args[1]);
		if (fs.exists(outpath)) {
			fs.delete(outpath, true);
		}
		FileOutputFormat.setOutputPath(job, outpath);
		System.exit(job.waitForCompletion(true) ? 0 : 1);
	}
}
3、需求分析2
每一个course的最高分,最低分,平均分 作业****
map key 分组
分组: 每一个 course
map端:
key:course Text
value:分数 IntWritable
reduce端:
相同course的所有数据分到一组
reduce(科目,所有分数,context){
循环遍历所有分数 求最大 最小 平均
}
输出
key: 科目 Text
value:最大值+最小值+平均值 Text
4、实现2
package com.zc.hadoop.mapreduce.demo.studentscore;
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import com.zc.hadoop.mapreduce.demo.studentscore.DuplicateRemoval.DuplicateRemovalMapper;
import com.zc.hadoop.mapreduce.demo.studentscore.DuplicateRemoval.DuplicateRemovalReducer;
/*
* 2、每一个course的最高分,最低分,平均分
*/
public class CourseMaxMinAndAvgScore {
/*
* mapper
*/
static class CourseMaxMinAndAvgScoreMapper extends Mapper<LongWritable, Text, Text, IntWritable>{
Text mk = new Text();
IntWritable mv = new IntWritable();
@Override
protected void map(LongWritable key, Text value, Context context)
throws IOException, InterruptedException {
// 拿到每一行数据进行分割
String[] info = value.toString().split(",");
// 去除不规则数据
if (info.length == 3) {
// [course,name,score]
// key --> course, value ---> score
mk.set(info[0].trim());
mv.set(Integer.parseInt(info[2].trim()));
context.write(mk, mv);
}
}
}
/*
* reducer
*/
static class CourseMaxMinAndAvgScoreReducer extends Reducer<Text, IntWritable, Text, Text>{
Text rk = new Text();
Text rv = new Text();
@Override
protected void reduce(Text key, Iterable<IntWritable> values,Context context)
throws IOException, InterruptedException {
// 按课程分组,成绩为迭代器
int max = 0, min = 0, count = 0, sum = 0;
for (IntWritable value : values) {
int score = value.get();
count++;
if (count == 1) {
max = min = score;
}
// 求最大值
if (max < score) {
max = score;
}
// 求最小值
if (min > score) {
min = score;
}
// 求和
sum += score;
}
// 求平均分
double avg = 1.0 * sum / count;
rk.set(key);
rv.set("max=" + max + "; min=" + min + "; avg=" + avg);
context.write(rk, rv);
}
}
public static void main(String[] args) throws Exception {
// 加载配置文件
Configuration conf = new Configuration();
// 启动一个 job
Job job = Job.getInstance(conf);
// 设置主类入口
job.setJarByClass(CourseMaxMinAndAvgScore.class);
// 指定 MR 中 mapper 和 reducer 类
job.setMapperClass(CourseMaxMinAndAvgScoreMapper.class);
job.setReducerClass(CourseMaxMinAndAvgScoreReducer.class);
// 指定 mapper 和 reducer 输出泛型
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(IntWritable.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(Text.class);
// 获取 HDFS 文件系统连接
FileSystem fs = FileSystem.get(conf);
// 指定测试文件路径
Path inpath = new Path(args[0]);
if (fs.exists(inpath)) {
FileInputFormat.addInputPath(job, inpath);
// 指定结果输出路径
Path outpath = new Path(args[1]);
if (fs.exists(outpath)) {
fs.delete(outpath, true);
}
FileOutputFormat.setOutputPath(job, outpath);
} else {
System.out.println("指定输入文件路径不存在");
}
System.exit(job.waitForCompletion(true) ? 0 : 1);
}
}
5、需求分析3
求该成绩表中相同科目、相同分数出现的次数,以及取得该分数的所有人
即:求相同科目相同分数的有多少个人,并且都是谁
返回结果的格式:
科目 分数 次数 该分数的人
例子:
computer 85 3 huangzitao,liujialing,huangxiaoming
分组: 科目 分数
map端:
key:科目+分数 Text
value:姓名 Text
reduce端:
接受的相同科目和分数的所有人的姓名
reduce(科目+分数,所有的姓名,context){
计数迭代器
迭代器姓名拼接
}
key:Text
value:人数+拼接人名 Text
6、实现3
package com.zc.hadoop.mapreduce.demo.studentscore;
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
/*
* 3、求相同科目相同分数的有多少个人 并且都是谁
*/
public class IdenticalCourseAndScorePeopleCount {

	/*
	 * Mapper: for each well-formed record "course,name,score" emits
	 * key = course + "\t" + score and value = student name, so that
	 * all students sharing a (course, score) pair reach one reducer.
	 */
	static class IdenticalCourseAndScorePeopleCountMapper extends Mapper<LongWritable, Text, Text, Text>{
		Text mk = new Text();
		Text mv = new Text();

		@Override
		protected void map(LongWritable key, Text value, Context context)
				throws IOException, InterruptedException {
			String[] fields = value.toString().split(",");
			// Guard clause: skip rows without exactly three fields
			if (fields.length != 3) {
				return;
			}
			// fields = [course, name, score]
			// Composite key: course + "\t" + score
			mk.set(fields[0].trim() + "\t" + fields[fields.length - 1].trim());
			// Value: student name
			mv.set(fields[1].trim());
			context.write(mk, mv);
		}
	}

	/*
	 * Reducer: counts how many students share the (course, score) key
	 * and joins their names with commas.
	 */
	static class IdenticalCourseAndScorePeopleCountReducer extends Reducer<Text, Text, Text, Text>{
		Text rv = new Text();

		@Override
		protected void reduce(Text key, Iterable<Text> values, Context context)
				throws IOException, InterruptedException {
			int people = 0;
			StringBuilder names = new StringBuilder();
			for (Text name : values) {
				// Prefix a separator before every name except the first
				if (people > 0) {
					names.append(",");
				}
				names.append(name.toString());
				people++;
			}
			// Output key: course + score
			// Output value: count + "\t" + joined names
			rv.set(people + "\t" + names);
			context.write(key, rv);
		}
	}

	/*
	 * Driver. args[0] = input path, args[1] = output path.
	 */
	public static void main(String[] args) throws Exception {
		// Load configuration and create the job
		Configuration conf = new Configuration();
		Job job = Job.getInstance(conf);
		// Main class for jar discovery
		job.setJarByClass(IdenticalCourseAndScorePeopleCount.class);
		// Mapper and reducer classes
		job.setMapperClass(IdenticalCourseAndScorePeopleCountMapper.class);
		job.setReducerClass(IdenticalCourseAndScorePeopleCountReducer.class);
		// Mapper and reducer output key/value types
		job.setMapOutputKeyClass(Text.class);
		job.setMapOutputValueClass(Text.class);
		job.setOutputKeyClass(Text.class);
		job.setOutputValueClass(Text.class);
		// HDFS connection and path setup
		FileSystem fileSystem = FileSystem.get(conf);
		Path input = new Path(args[0]);
		if (fileSystem.exists(input)) {
			FileInputFormat.addInputPath(job, input);
			// Remove a stale output directory before writing results
			Path output = new Path(args[1]);
			if (fileSystem.exists(output)) {
				fileSystem.delete(output, true);
			}
			FileOutputFormat.setOutputPath(job, output);
		} else {
			// NOTE(review): the job is still submitted below even when the
			// input path is missing — preserved as-is from the original.
			System.out.println("指定输入文件路径不存在");
		}
		System.exit(job.waitForCompletion(true) ? 0 : 1);
	}
}
(4)将上面的统计结果按照科目不同 输出到不同的文件中
7、学习内容
上节学习内容:MapReduce 程序的核心运行机制
下节学习内容:MapReduce 常见的编程场景 1