MapReduce的几个企业级经典面试案例
一、官方统计案例:
- 要求:统计每个单词出现的次数
-
测试数据:
zhangyong zhangrui zhangqin zhangyong zhangrui zhangqin zhangyong zhangrui zhangqin zhangyong zhangrui zhangqin zhangyong zhangrui zhangqin zhangyong zhangrui zhangqin zhangyong zhangrui zhangqin zhangyong zhangrui zhangqin zhangyong zhangrui zhangqin zhangyong zhangrui zhangqin zhangyong zhangrui zhangqin zhangyong zhangrui zhangqin zhangyong zhangrui zhangqin zhangyong zhangrui zhangqin zhangyong zhangrui zhangqin zhangyong zhangrui zhangqin zhangyong zhangrui zhangqin zhangyong zhangrui zhangqin zhangyong zhangrui zhangqin zhangyong zhangrui zhangqin
-
编写代码:
-
mapper类
/** * @author 17616 */ public class WordCountMapper extends Mapper<LongWritable, Text, Text, LongWritable> { @Override public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException { // 首先获取一行数据 String line = value.toString (); // 将行内的单词进行切分,使用一个数组进行保存,切分数据时根据源数据得知可以使用空格的方式切分。 String[] arr = line.split (" "); for (String str : arr) { context.write (new Text (str), new LongWritable (1)); } } }
-
reducer类
/** * @author 17616 */ public class WordCountReducer extends Reducer<Text, LongWritable, Text, LongWritable> { @Override public void reduce(Text key, Iterable<LongWritable> values, Context context) throws IOException, InterruptedException { // 定义变量记录单词出现的次数 long sum = 0; for (LongWritable val : values) { // 记录总次数 sum += val.get (); } // 输出数据,key就是单词,value就是在map阶段这个单词出现的总次数 context.write (key, new LongWritable (sum)); } }
-
Driver类
/** * @author 17616 * 官方案例,计算统计 */ public class WordCountDriver { public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException { // 获取当前的默认配置 Configuration conf = new Configuration (); // 获取代表当前mr作业的job对象 Job job = Job.getInstance (conf); // 指定一下当前程序的入口类 job.setJarByClass (WordCountDriver.class); //指定当前Mapper、Reducer任务的类 job.setMapperClass (WordCountMapper.class); job.setReducerClass (WordCountReducer.class); //设置Mapper的结果类型 job.setMapOutputKeyClass (Text.class); job.setMapOutputValueClass (LongWritable.class); // 设置Reducer的结果类型 job.setOutputKeyClass (Text.class); job.setOutputValueClass (LongWritable.class); //设置待分析的文件夹路径(linux的路径地址) FileInputFormat.setInputPaths (job, new Path ("hdfs://anshun115:9000/mapreduce")); FileOutputFormat.setOutputPath (job, new Path ("hdfs://anshun115:9000/result/mapreduce")); if (!job.waitForCompletion (true)) { return; } } }
-
-
运行结果:
zhangqin 20 zhangrui 20 zhangyong 20
二、计算平均值:
-
要求:按姓名分组,计算每个人成绩的平均值
-
测试数据:
tom 69 tom 84 tom 68 jary 89 jary 90 jary 81 jary 35 alex 23 alex 100 alex 230
-
编写代码:
-
mapper类
/** * @Author zhangyong * @Date 2020/4/3 23:43 * @Version 1.0 */ public class AverageMapper extends Mapper<LongWritable, Text, Text, IntWritable> { @Override public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException { //获取每行的数据内容 String line = value.toString (); //按照空格去切会获取到多个数据,所以用数组的方式存储 String[] data = line.split (" "); String name = data[0]; //Integer做一个数据类型的强制转换。 int score = Integer.parseInt (data[1]); //输出数据 context.write (new Text (name), new IntWritable (score)); } }
-
reducer类
/**
 * Averages all scores received for one name.
 *
 * Note: the division is integer division, so the average is truncated
 * toward zero (e.g. 353 / 3 = 117), which matches the sample output.
 *
 * @Author zhangyong
 * @Date 2020/4/3 23:43
 * @Version 1.0
 */
public class AverageReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
    @Override
    public void reduce(Text name, Iterable<IntWritable> scores, Context context)
            throws IOException, InterruptedException {
        int total = 0;
        int count = 0;
        // Sum every score for this name while counting how many there are.
        for (IntWritable score : scores) {
            total += score.get();
            count++;
        }
        // Integer division: the fractional part is intentionally dropped.
        context.write(name, new IntWritable(total / count));
    }
}
-
Driver类
/** * @Author zhangyong * @Date 2020/4/3 23:41 * @Version 1.0 * 计算平均值 */ public class AverageDriver { public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException { Configuration conf = new Configuration (); Job job = Job.getInstance (conf); //驱动类,入口类 job.setJarByClass (AverageDriver.class); //设置Mapper和Reducer的类 job.setMapperClass (AverageMapper.class); job.setReducerClass (AverageReducer.class); //设置Mapper的结果类型 job.setMapOutputKeyClass (Text.class); job.setMapOutputValueClass (IntWritable.class); //设置Reduce的结果类型 job.setOutputKeyClass (Text.class); job.setOutputValueClass (IntWritable.class); //设置待分析的文件夹路径(linux的路径地址) FileInputFormat.setInputPaths (job, new Path ("hdfs://anshun115:9000/average")); FileOutputFormat.setOutputPath (job, new Path ("hdfs://anshun115:9000/result/average")); //提交到job job.waitForCompletion (true); } }
-
-
运行结果:
alex 117 jary 73 tom 73
-
三、求温度最高值:
- 要求:求出以下各年份数据中每年的最高温度
-
测试数据:
2329999919500515070000 9909999919500515120022 9909999919500515180011 9509999919490324120111 6509999919490324180078 9909999919370515070001 9909999919370515120002 9909999919450515180001 6509999919450324120002 8509999919450324180078
-
编写代码:
-
mapper类
/** * @author 17616 */ public class HeightMapper extends Mapper<LongWritable, Text, Text, LongWritable> { @Override public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException { //获取一段数据 String line = value.toString (); //获取年份 String year = line.substring (8, 12); //获取温度(强制转换一下) int t = Integer.parseInt (line.substring (18, 22)); context.write (new Text (year),new LongWritable (t)); } }
-
reducer类
/**
 * Emits the maximum temperature observed for each year.
 *
 * Fix: the running maximum was initialised to 0, which would wrongly
 * report 0 for a year whose readings are all negative (sub-zero
 * temperatures). It now starts at Long.MIN_VALUE so any real reading
 * replaces it.
 *
 * @author 17616
 */
public class HeightReducer extends Reducer<Text, LongWritable, Text, LongWritable> {
    @Override
    public void reduce(Text year, Iterable<LongWritable> temperatures, Context context)
            throws IOException, InterruptedException {
        // Start below any possible reading so negative maxima survive.
        long max = Long.MIN_VALUE;
        for (LongWritable t : temperatures) {
            max = Math.max(max, t.get());
        }
        context.write(year, new LongWritable(max));
    }
}
-
Driver类
/** * @author 17616 * -求最大值 */ public class HeightDriver { public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException { // 获取当前的默认配置 Configuration conf = new Configuration (); // 获取代表当前mr作业的job对象 Job job = Job.getInstance (conf); // 指定一下当前程序的入口类 job.setJarByClass (HeightDriver.class); //指定当前Mapper、Reducer任务的类 job.setMapperClass (HeightMapper.class); job.setReducerClass (HeightReducer.class); //设置Mapper的结果类型 job.setMapOutputKeyClass (Text.class); job.setMapOutputValueClass (LongWritable.class); // 设置Reducer的结果类型 job.setOutputKeyClass (Text.class); job.setOutputValueClass (LongWritable.class); //设置待分析的文件夹路径(linux的路径地址) FileInputFormat.setInputPaths (job, new Path ("hdfs://anshun115:9000/wendu/")); FileOutputFormat.setOutputPath (job, new Path ("hdfs://anshun115:9000/result/wendu")); job.waitForCompletion (true); } }
-
-
运行结果:
1937 2 1945 78 1949 111 1950 22
四、数据去重:
- 要求:对IP地址列表进行去重,输出不重复的IP地址
-
测试数据:
192.168.234.21 192.168.234.22 192.168.234.21 192.168.234.21 192.168.234.23 192.168.234.21 192.168.234.21 192.168.234.21 192.168.234.25 192.168.234.21 192.168.234.21 192.168.234.26 192.168.234.21 192.168.234.27 192.168.234.21 192.168.234.27 192.168.234.21 192.168.234.29 192.168.234.21 192.168.234.26 192.168.234.21 192.168.234.25 192.168.234.25 192.168.234.25 192.168.234.21 192.168.234.22 192.168.234.21
-
编写代码:
-
mapper类
/**
 * Identity-style mapper for IP deduplication: each whole input line
 * (an IP address) becomes the output key with a NullWritable value,
 * so the shuffle collapses duplicate lines into a single reduce group.
 *
 * @Author zhangyong
 * @Date 2020/4/7 19:53
 * @Version 1.0
 */
public class DisMapper extends Mapper<LongWritable, Text, Text, NullWritable> {
    @Override
    public void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        // The line itself is emitted as the key; grouping does the dedup.
        context.write(value, NullWritable.get());
    }
}
-
reducer类
/**
 * Writes each distinct key exactly once. The shuffle has already merged
 * all duplicate IPs into a single group, so emitting only the group key
 * (ignoring the NullWritable values) completes the deduplication.
 *
 * @Author zhangyong
 * @Date 2020/4/7 21:21
 * @Version 1.0
 */
public class DisReducer extends Reducer<Text, NullWritable, Text, NullWritable> {
    @Override
    public void reduce(Text key, Iterable<NullWritable> values, Context context)
            throws IOException, InterruptedException {
        context.write(key, NullWritable.get());
    }
}
-
Driver类
/** * @Author zhangyong * @Date 2020/4/7 21:32 * @Version 1.0 * 数据去重 */ public class DisDriver { public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException { Configuration conf = new Configuration (); Job job = Job.getInstance (conf); //设置Drive类 job.setJarByClass (DisReducer.class); //设置Mapper、Reduce类 job.setMapperClass (DisMapper.class); job.setReducerClass (DisReducer.class); //Mapper的输出 job.setMapOutputKeyClass (Text.class); job.setMapOutputValueClass (NullWritable.class); //地址 FileInputFormat.setInputPaths (job,new Path ("hdfs://anshun115:9000/distinct")); FileOutputFormat.setOutputPath (job,
-