Hadoop Example: Sorting Temperature Records by Year

 

1. Input

The input file contains the following records:

1949-10-01 14:21:02 34C

1949-10-02 14:21:12 36C

1950-02-02 11:21:12 32C

1950-05-02 11:31:12 37C

1951-12-02 11:31:12 23C

1950-12-02 11:31:12 47C

1950-12-02 11:31:12 27C

1951-06-02 11:31:12 48C

1951-07-02 11:31:12 45C

Each line is a record of a timestamp and a temperature reading.

The timestamp (yyyy-MM-dd HH:mm:ss) is separated from the temperature by a tab character.

 

2. Output

Compute:

For each year from 1949 to 1951, the K days with the highest temperatures (for example, K = 5).

 

3. Approach

1. Sort by year in ascending order; within the same year, sort by temperature in descending order.

2. Group by year, so that each year is handled by one reduce task.

Map output: the key is a composite object (the keyPair class) that wraps the year and the temperature together; the temperature string is emitted as the value. The worked example below shows how the sample data ends up arranged by the shuffle.
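For example, the four 1950 records in the sample input all reach one reducer as a single group, already arranged by the sort comparator:

(1950, 47), (1950, 37), (1950, 32), (1950, 27)

Because every group's values arrive in descending temperature order, selecting the top K readings per year amounts to taking the first K values of each group (a sketch of such a reducer follows the driver code in section 6.5).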

 

4. Goals

4.1 Practice writing a custom sort comparator

4.2 Practice writing a custom partitioner

4.3 Practice writing a custom grouping comparator

 

5. Software environment

See the guide on setting up a fully distributed Hadoop 2.5.2 environment:

http://blog.csdn.net/liushahe2012/article/details/53364449

 

 

6. Implementation

 

6.1 The keyPair class

package com.hadoop;

 

import java.io.DataInput;

import java.io.DataOutput;

import java.io.IOException;

 

import org.apache.hadoop.io.WritableComparable;

public class keyPair implements WritableComparable<keyPair> {

   private int year;
   private int temperature;

   public int getYear() {
      return year;
   }

   public void setYear(int year) {
      this.year = year;
   }

   public int getTemperature() {
      return temperature;
   }

   public void setTemperature(int temperature) {
      this.temperature = temperature;
   }

   // The following three methods are required by WritableComparable.

   @Override
   public void readFields(DataInput in) throws IOException {
      // Deserialization: read the fields back from the binary stream
      this.year = in.readInt();
      this.temperature = in.readInt();
   }

   @Override
   public void write(DataOutput out) throws IOException {
      // Serialization: write the fields to the binary stream
      out.writeInt(year);
      out.writeInt(temperature);
   }

   @Override
   public int compareTo(keyPair o) {
      int iRet = Integer.compare(year, o.getYear());
      if (iRet != 0) {
         return iRet;
      }
      return Integer.compare(temperature, o.getTemperature());
   }

   // Also override toString; it is used when the key is written to the output file.
   @Override
   public String toString() {
      return year + "\t" + temperature;
   }

   // Override hashCode as well.
   @Override
   public int hashCode() {
      return new Integer(year + temperature).hashCode();
   }
}
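One small point: the class overrides hashCode but not equals. The MapReduce framework relies on the comparators rather than on equals, but if you want the two to stay consistent, a matching override could look like this sketch (illustrative, not part of the original class):

// Hypothetical addition: keep equals consistent with hashCode and compareTo.
@Override
public boolean equals(Object obj) {
   if (this == obj) {
      return true;
   }
   if (!(obj instanceof keyPair)) {
      return false;
   }
   keyPair other = (keyPair) obj;
   return year == other.year && temperature == other.temperature;
}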

 

 

6.2 The custom sort comparator: Sort

 

package com.hadoop;

 

import org.apache.hadoop.io.WritableComparable;

import org.apache.hadoop.io.WritableComparator;

 

public class Sort extends WritableComparator {

   public Sort() {
      super(keyPair.class, true);
   }

   @Override
   public int compare(WritableComparable a, WritableComparable b) {
      keyPair k1 = (keyPair) a;
      keyPair k2 = (keyPair) b;

      // Integer.compare gives ascending order, so years sort ascending.
      int iRet = Integer.compare(k1.getYear(), k2.getYear());
      if (iRet != 0) {
         return iRet;
      }
      // Swap the arguments to sort temperatures in descending order.
      return Integer.compare(k2.getTemperature(), k1.getTemperature());
   }
}
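A quick way to see the ordering this comparator produces is to call it directly. A minimal sketch, assuming the keyPair and Sort classes above are on the classpath (the class name SortDemo is illustrative):

package com.hadoop;

// Illustrative check only; not part of the MapReduce job.
public class SortDemo {

   public static void main(String[] args) {
      keyPair a = new keyPair();
      a.setYear(1950);
      a.setTemperature(47);

      keyPair b = new keyPair();
      b.setYear(1950);
      b.setTemperature(37);

      // Prints a negative number: within the same year, the 47C key sorts before the 37C key.
      System.out.println(new Sort().compare(a, b));
   }
}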

 

 

6.3 The custom partitioner: Partition

 

package com.hadoop;

 

import org.apache.hadoop.io.Text;

import org.apache.hadoop.mapreduce.Partitioner;

 

public class Partition extends Partitioner<keyPair, Text> {

   // Custom partitioning method; num is the number of reduce tasks.
   @Override
   public int getPartition(keyPair key, Text value, int num) {
      // Partition by year; the year is carried in the composite key.
      // Multiply the year by a constant, then take the result modulo num.
      return (key.getYear() * 127) % num;
   }
}
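With the three reduce tasks configured in RunJob (job.setNumReduceTasks(3)), this formula happens to place each year in its own partition: (1949 × 127) mod 3 = 2, (1950 × 127) mod 3 = 0, and (1951 × 127) mod 3 = 1. Note that this is a property of these particular years and this reducer count, not a general guarantee; other combinations can map two years to the same reducer.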

6.4 The custom grouping comparator: Group

 

package com.hadoop;

 

import org.apache.hadoop.io.WritableComparable;

import org.apache.hadoop.io.WritableComparator;

public class Group extends WritableComparator {

   public Group() {
      super(keyPair.class, true);
   }

   // Secondary sort on the reduce side: group records by year only.
   @Override
   public int compare(WritableComparable a, WritableComparable b) {
      keyPair k1 = (keyPair) a;
      keyPair k2 = (keyPair) b;
      return Integer.compare(k1.getYear(), k2.getYear());
   }
}
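Because this comparator ignores the temperature, all keys with the same year compare as equal on the reduce side, so one reduce() call receives every value for that year. Combined with the Sort comparator, those values are iterated in descending temperature order.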

 

6.5 The driver class: RunJob

 

package com.hadoop;

 

import java.io.IOException;

import java.util.Calendar;

import java.util.Date;

import java.text.SimpleDateFormat;

 

import org.apache.hadoop.conf.Configuration;

import org.apache.hadoop.fs.Path;

import org.apache.hadoop.io.LongWritable;

import org.apache.hadoop.io.Text;

 

import org.apache.hadoop.mapreduce.Job;

import org.apache.hadoop.mapreduce.Mapper;

import org.apache.hadoop.mapreduce.Reducer;

import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;

import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

 

public class RunJob {

 

   static SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");

   static class MapperJob extends Mapper<LongWritable, Text, keyPair, Text> {

      @Override
      protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {

         // Process one input line.
         String line = value.toString();
         // Fields are separated by a tab, so split on the tab.
         String[] ss = line.split("\t");

         // Only process well-formed records.
         if (ss.length == 2) {
            try {
               // Parse the year from the timestamp.
               Date date = sdf.parse(ss[0]);
               Calendar c = Calendar.getInstance();
               c.setTime(date);
               int year = c.get(Calendar.YEAR);

               // Parse the temperature (strip the trailing "C").
               String t = ss[1].substring(0, ss[1].indexOf("C"));

               // Build the composite key.
               keyPair k = new keyPair();
               k.setYear(year);
               k.setTemperature(Integer.parseInt(t));

               // Emit the composite key with the temperature as the value.
               context.write(k, new Text(t));
            } catch (Exception e) {
               e.printStackTrace();
            }
         }
      }
   }

   static class ReducerJob extends Reducer<keyPair, Text, keyPair, Text> {

      @Override
      protected void reduce(keyPair key, Iterable<Text> value, Context context)
            throws IOException, InterruptedException {
         // Write every value straight through.
         for (Text v : value) {
            context.write(key, v);
         }
      }
   }

   public static void main(String[] args) {
      Configuration conf = new Configuration();
      try {
         Job job = new Job(conf);
         job.setJobName("year_temperature");
         job.setJarByClass(RunJob.class);
         job.setMapperClass(MapperJob.class);
         job.setReducerClass(ReducerJob.class);
         job.setMapOutputKeyClass(keyPair.class);
         job.setMapOutputValueClass(Text.class);

         job.setNumReduceTasks(3);
         job.setPartitionerClass(Partition.class);
         job.setSortComparatorClass(Sort.class);
         job.setGroupingComparatorClass(Group.class);

         // Input and output paths
         FileInputFormat.addInputPath(job, new Path("/usr/local/hadooptempdata/input/year-temp/"));
         FileOutputFormat.setOutputPath(job, new Path("/usr/local/hadooptempdata/output/year-temp/"));
         System.exit(job.waitForCompletion(true) ? 0 : 1);
      } catch (Exception e) {
         e.printStackTrace();
      }
   }
}
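Note that ReducerJob above writes every record straight through, so the output in section 7.5 lists all readings per year rather than only the top K described in section 2. Since each group's values arrive in descending temperature order, limiting the output only needs a counter in the reducer. A minimal sketch, assuming K = 5 (the class name TopKReducer and the constant K are illustrative, not part of the original code):

package com.hadoop;

import java.io.IOException;

import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

// Hypothetical variant of ReducerJob: emit only the first K values of each year group.
public class TopKReducer extends Reducer<keyPair, Text, keyPair, Text> {

   private static final int K = 5; // example value of K from section 2

   @Override
   protected void reduce(keyPair key, Iterable<Text> values, Context context)
         throws IOException, InterruptedException {
      int emitted = 0;
      // Values arrive temperature-descending thanks to the Sort comparator,
      // so the first K values are this year's K highest readings.
      for (Text v : values) {
         if (emitted++ >= K) {
            break;
         }
         context.write(key, v);
      }
   }
}

To use it, point the job at this class with job.setReducerClass(TopKReducer.class) instead of ReducerJob.class.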

 

 

 

7. Running the job

7.1 Upload the jar

Package the program into a jar and upload it to a cluster node:

root@node1:/usr/local/hadoop/hadoop-2.5.2/myJar# ll

total 20

drwxr-xr-x 2 root  root   4096 Dec 16 22:20 ./

drwxr-xr-x 11 10021 10021  4096 Dec 16 22:19 ../

-rw-r--r--  1 root root  10201 Dec 16 22:22 year_temp.jar

 

7.2 Upload the input data file to HDFS

Create the input directory:

root@node1:/usr/local/hadoop/hadoop-2.5.2/myJar# hdfs dfs -mkdir -p /usr/local/hadooptempdata/input/year-temp

Upload the file:

root@node1:/usr/local/hadoop/hadoop-2.5.2/myJar# hdfs dfs -put /usr/local/data /usr/local/hadooptempdata/input/year-temp

 

 

7.3 Run the example

root@node1:/usr/local/hadoop/hadoop-2.5.2/myJar# hadoop jar year_temp.jar com.hadoop.RunJob

16/12/17 21:15:18 INFO client.RMProxy: Connecting to ResourceManager at node1/192.168.233.129:8032
16/12/17 21:15:20 WARN mapreduce.JobSubmitter: Hadoop command-line option parsing not performed. Implement the Tool interface and execute your application with ToolRunner to remedy this.
16/12/17 21:15:21 INFO input.FileInputFormat: Total input paths to process : 1
16/12/17 21:15:22 INFO mapreduce.JobSubmitter: number of splits:1
16/12/17 21:15:22 INFO mapreduce.JobSubmitter: Submitting tokens for job: job_1481977257660_0004
16/12/17 21:15:23 INFO impl.YarnClientImpl: Submitted application application_1481977257660_0004
16/12/17 21:15:24 INFO mapreduce.Job: The url to track the job: http://node1:8088/proxy/application_1481977257660_0004/
16/12/17 21:15:24 INFO mapreduce.Job: Running job: job_1481977257660_0004
16/12/17 21:16:06 INFO mapreduce.Job: Job job_1481977257660_0004 running in uber mode : false
16/12/17 21:16:06 INFO mapreduce.Job:  map 0% reduce 0%
16/12/17 21:16:46 INFO mapreduce.Job:  map 100% reduce 0%
16/12/17 21:17:14 INFO mapreduce.Job:  map 100% reduce 33%
16/12/17 21:17:26 INFO mapreduce.Job:  map 100% reduce 67%
16/12/17 21:17:29 INFO mapreduce.Job:  map 100% reduce 100%
16/12/17 21:17:30 INFO mapreduce.Job: Job job_1481977257660_0004 completed successfully
16/12/17 21:17:30 INFO mapreduce.Job: Counters: 50

    File System Counters
        FILE: Number of bytes read=135
        FILE: Number of bytes written=396509
        FILE: Number of read operations=0
        FILE: Number of large read operations=0
        FILE: Number of write operations=0
        HDFS: Number of bytes read=385
        HDFS: Number of bytes written=99
        HDFS: Number of read operations=12
        HDFS: Number of large read operations=0
        HDFS: Number of write operations=6
    Job Counters
        Killed reduce tasks=1
        Launched map tasks=1
        Launched reduce tasks=4
        Data-local map tasks=1
        Total time spent by all maps in occupied slots (ms)=36763
        Total time spent by all reduces in occupied slots (ms)=108155
        Total time spent by all map tasks (ms)=36763
        Total time spent by all reduce tasks (ms)=108155
        Total vcore-seconds taken by all map tasks=36763
        Total vcore-seconds taken by all reduce tasks=108155
        Total megabyte-seconds taken by all map tasks=37645312
        Total megabyte-seconds taken by all reduce tasks=110750720
    Map-Reduce Framework
        Map input records=10
        Map output records=9
        Map output bytes=99
        Map output materialized bytes=135
        Input split bytes=127
        Combine input records=0
        Combine output records=0
        Reduce input groups=3
        Reduce shuffle bytes=135
        Reduce input records=9
        Reduce output records=9
        Spilled Records=18
        Shuffled Maps =3
        Failed Shuffles=0
        Merged Map outputs=3
        GC time elapsed (ms)=796
        CPU time spent (ms)=8770
        Physical memory (bytes) snapshot=502833152
        Virtual memory (bytes) snapshot=7552282624
        Total committed heap usage (bytes)=174166016
    Shuffle Errors
        BAD_ID=0
        CONNECTION=0
        IO_ERROR=0
        WRONG_LENGTH=0
        WRONG_MAP=0
        WRONG_REDUCE=0
    File Input Format Counters
        Bytes Read=258
    File Output Format Counters
        Bytes Written=99
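The WARN line near the top of the log ("Hadoop command-line option parsing not performed") appears because the driver does not implement the Tool interface. It is harmless here; for reference, a driver built around ToolRunner could look roughly like the following sketch (the class name RunJobTool is illustrative, and the job setup mirrors section 6.5):

package com.hadoop;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

// Hypothetical driver: ToolRunner parses generic options (-D, -files, ...) before run() is called.
public class RunJobTool extends Configured implements Tool {

   @Override
   public int run(String[] args) throws Exception {
      Job job = new Job(getConf());
      job.setJobName("year_temperature");
      job.setJarByClass(RunJobTool.class);
      job.setMapperClass(RunJob.MapperJob.class);
      job.setReducerClass(RunJob.ReducerJob.class);
      job.setMapOutputKeyClass(keyPair.class);
      job.setMapOutputValueClass(Text.class);
      job.setNumReduceTasks(3);
      job.setPartitionerClass(Partition.class);
      job.setSortComparatorClass(Sort.class);
      job.setGroupingComparatorClass(Group.class);
      FileInputFormat.addInputPath(job, new Path("/usr/local/hadooptempdata/input/year-temp/"));
      FileOutputFormat.setOutputPath(job, new Path("/usr/local/hadooptempdata/output/year-temp/"));
      return job.waitForCompletion(true) ? 0 : 1;
   }

   public static void main(String[] args) throws Exception {
      System.exit(ToolRunner.run(new Configuration(), new RunJobTool(), args));
   }
}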

 

 

7.4 Monitoring the MapReduce job from the web UI

Log on to the node running the ResourceManager to follow the job's progress in the web UI.

 

 

7.5 View the output

root@node1:/usr/local/hadoop/hadoop-2.5.2/myJar# hdfs dfs -cat /usr/local/hadooptempdata/output/year-temp/*

1950         47     47

1950         37     37

1950         32     32

1950         27     27

1951         48     48

1951         45     45

1951         23     23

1949         36     36

1949         34     34
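In each output line, the first two columns come from keyPair.toString() (year, then temperature) and the third is the Text value emitted by the mapper, which is why the temperature appears twice. Within each year the lines descend by temperature, and each year lands in its own part file thanks to the partitioner; the key column changes along with the values because Hadoop reuses and refills the key object for every record in a reduce group.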
