Problem: find the maximum temperature for each month.
Contents of the input file:
2018010123.4
2018010133.4
2018010113.4
2018020426.8
2018050829.1
2018050713.4
2018070433.2
2018090123.4
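Each record is an 8-digit date (yyyyMMdd) immediately followed by the temperature, one record per line, so the first 6 characters give the month key. A minimal sketch of how the mapper below slices a record (RecordParseDemo is a hypothetical helper for illustration only; values are from the first sample line):

public class RecordParseDemo {
    public static void main(String[] args) {
        String record = "2018010123.4";            // yyyyMMdd + temperature
        String month  = record.substring(0, 6);    // "201801" -> the map output key
        double temp   = Double.parseDouble(record.substring(8)); // 23.4 -> the map output value
        System.out.println(month + "\t" + temp);   // prints: 201801	23.4
    }
}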
Note: for the detailed workflow, refer to the WordCount program; only the code is shown here.
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.DoubleWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;
import java.util.Iterator;

public class GetMaxTemperature {

    public static class GetMaxTemperatureMapper extends Mapper<LongWritable, Text, Text, DoubleWritable> {
        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            // Convert the line to a plain String
            String line = value.toString();
            // Slice out the month (yyyyMM) and the temperature (everything after the 8-digit date)
            String month = line.substring(0, 6);
            String temperature = line.substring(8);
            context.write(new Text(month), new DoubleWritable(Double.parseDouble(temperature)));
        }
    }

    public static class GetMaxTemperatureReduce extends Reducer<Text, DoubleWritable, Text, DoubleWritable> {
        @Override
        protected void reduce(Text key, Iterable<DoubleWritable> values, Context context) throws IOException, InterruptedException {
            // Iterate over the values and find the maximum
            double temp = Double.NEGATIVE_INFINITY; // start below any real temperature so negative values are handled too
            Iterator<DoubleWritable> iterator = values.iterator();
            while (iterator.hasNext()) {
                DoubleWritable doubleWritable = iterator.next();
                if (temp < doubleWritable.get()) {
                    temp = doubleWritable.get();
                }
            }
            // Emit the month and its maximum temperature
            context.write(key, new DoubleWritable(temp));
        }
    }

    // Set up the driver, build the job object, and submit it (standalone test)
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        // Configuration settings
        Configuration conf = new Configuration();
        // Create the job; naming it is optional
        Job job = Job.getInstance(conf, "getMaxTemperature");
        // Set the driver class -- this must be the main class
        job.setJarByClass(GetMaxTemperature.class);
        // Set the mapper -- this must be the corresponding mapper class
        job.setMapperClass(GetMaxTemperatureMapper.class);
        // These must match the map output types
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(DoubleWritable.class);
        // Set the reducer -- this must be the corresponding reducer class
        job.setReducerClass(GetMaxTemperatureReduce.class);
        // These must match the reducer's final output types
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(DoubleWritable.class);
        // Set the input and output paths
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        // Submit the job
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
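Run against the sample file above, the job should write one line per month to part-r-00000 (TextOutputFormat separates key and value with a tab):

201801	33.4
201802	26.8
201805	29.1
201807	33.2
201809	23.4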
Notes: 1. If you add the two config files (e.g. core-site.xml), the Linux cluster must be started before the job can run, because core-default.xml has lower priority than core-site.xml. The job will then run on the cluster: its input comes from HDFS, and its output is written back to HDFS.
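For reference, a minimal sketch of the kind of core-site.xml this note refers to; hdfs://master:8020 is taken from the paths in note 2 below and must match your own NameNode address:

<?xml version="1.0"?>
<configuration>
    <!-- overrides the file:/// default inherited from core-default.xml -->
    <property>
        <name>fs.defaultFS</name>
        <value>hdfs://master:8020</value>
    </property>
</configuration>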
2. Input and output path schemes:

// Local file system paths
file:///input  file:///output
// Distributed file system (HDFS) paths
hdfs://master:8020/input  hdfs://master:8020/output
// Error produced when the job points at the HDFS server but local input/output paths are passed in:
// hdfs://master:8020/F:\ideaUProgram\mapReduce1\src\data
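A sketch of the two launch styles, assuming the job is packaged as maxtemp.jar (the jar name is illustrative):

# run against the local file system
hadoop jar maxtemp.jar GetMaxTemperature file:///input file:///output
# run against HDFS on the cluster
hadoop jar maxtemp.jar GetMaxTemperature hdfs://master:8020/input hdfs://master:8020/output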
Optimization: use a bean to hold month and temperature together, so the reducer can collect every (month, max) pair in a list and then sort it to output the top 3. Storing and retrieving the values as one unit is more convenient than handling them separately.
package li;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.DoubleWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;
import java.util.*;

/*
Problem: find the maximum temperature for each month.
2018010123.4 2018010133.4 2018010113.4 2018020426.8
2018050829.1 2018050713.4 2018070433.2 2018090123.4
*/
public class GetMaxTemperature {

    public static class MyMapper extends Mapper<LongWritable, Text, Text, DoubleWritable> {
        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            // Convert the line to a plain String
            String line = value.toString();
            // Slice out the month and the temperature
            String month = line.substring(0, 6);
            String temp = line.substring(8);
            context.write(new Text(month), new DoubleWritable(Double.parseDouble(temp)));
        }
    }

    public static class MyReducer extends Reducer<Text, DoubleWritable, Text, DoubleWritable> {
        // Initialization hook; runs once before the reducer starts (shown here for illustration)
        @Override
        protected void setup(Context context) throws IOException, InterruptedException {
            super.setup(context);
        }

        // The list accumulates every month together with its maximum temperature
        ArrayList<Bean> list = new ArrayList<>();

        @Override
        protected void reduce(Text key, Iterable<DoubleWritable> values, Context context) throws IOException, InterruptedException {
            // Iterate over the values and find the maximum
            double temp = Double.NEGATIVE_INFINITY; // start below any real temperature so negative values are handled too
            for (DoubleWritable value : values) {
                if (temp < value.get()) {
                    temp = value.get();
                }
            }
            list.add(new Bean(key.toString(), temp));
            System.out.println("after reduce: " + list);
        }

        // Teardown hook; runs once after all reduce calls
        @Override
        protected void cleanup(Context context) throws IOException, InterruptedException {
            // Sort, take the top 3, and emit them
            list.sort(new Comparator<Bean>() {
                @Override
                public int compare(Bean o1, Bean o2) {
                    // Bean implements WritableComparable and therefore provides compareTo
                    // (descending by temperature), so it can be reused directly here
                    return o1.compareTo(o2);
                }
            });
            // Emit the first three entries (or fewer, if there are not enough months)
            for (int i = 0; i < Math.min(3, list.size()); i++) {
                Text text = new Text(list.get(i).getMonth());
                DoubleWritable d = new DoubleWritable(list.get(i).getTemp());
                context.write(text, d);
            }
        }
    }

    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        // Configuration settings
        Configuration configuration = new Configuration();
        // User name used when accessing the remote cluster
        //System.setProperty("HADOOP_USER_NAME", "root");
        // Force the local file system for standalone testing
        configuration.set("fs.defaultFS", "file:///");
        // Create the job
        Job job = Job.getInstance(configuration, "maxTemperature");
        // Set the main class
        job.setJarByClass(GetMaxTemperature.class);
        // Set the mapper
        job.setMapperClass(MyMapper.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(DoubleWritable.class);
        // Set the reducer
        job.setReducerClass(MyReducer.class);
        // Set the output types
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(DoubleWritable.class);
        // Set the input and output paths
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        // Submit the job
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
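With the sample data and the default single reduce task (the in-memory list only sees keys from its own reducer, so the top-3 logic assumes one reducer), the expected output is:

201801	33.4
201807	33.2
201805	29.1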
Bean.java
package li;

import org.apache.hadoop.io.WritableComparable;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

// Bean implements Hadoop serialization;
// every field inside Bean must itself support serialization and deserialization
public class Bean implements WritableComparable<Bean> {

    private String month;
    private double temp;

    public Bean() {}

    public Bean(String month, double temp) {
        this.month = month;
        this.temp = temp;
    }

    @Override
    public String toString() {
        return "Bean{" + "month='" + month + '\'' + ", temp=" + temp + '}';
    }

    public String getMonth() {
        return month;
    }

    public void setMonth(String month) {
        this.month = month;
    }

    public double getTemp() {
        return temp;
    }

    public void setTemp(double temp) {
        this.temp = temp;
    }

    // Serialization
    @Override
    public void write(DataOutput dataOutput) throws IOException {
        dataOutput.writeUTF(month);
        dataOutput.writeDouble(temp);
    }

    // Deserialization -- fields must be read in the same order they were written
    @Override
    public void readFields(DataInput dataInput) throws IOException {
        this.month = dataInput.readUTF();
        this.temp = dataInput.readDouble();
    }

    // Descending order by temperature. Double.compare avoids the bug in
    // (int) Math.ceil(o.getTemp() - this.getTemp()), where a small negative
    // difference rounds to 0 and distinct temperatures compare as equal.
    @Override
    public int compareTo(Bean o) {
        return Double.compare(o.getTemp(), this.getTemp());
    }
}
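A quick standalone sanity check of the descending order (BeanSortCheck is a hypothetical helper, not part of the job; values are from the sample data):

package li;

import java.util.*;

public class BeanSortCheck {
    public static void main(String[] args) {
        List<Bean> beans = new ArrayList<>(Arrays.asList(
                new Bean("201805", 29.1),
                new Bean("201801", 33.4),
                new Bean("201807", 33.2)));
        Collections.sort(beans); // descending by temp via Bean.compareTo
        System.out.println(beans); // 201801 (33.4) first, then 201807 (33.2), then 201805 (29.1)
    }
}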