Hadoop Case Study: Sorting Temperatures by Year
1. Input
Contents of the input file:
1949-10-01 14:21:02 34C
1949-10-02 14:21:12 36C
1950-02-02 11:21:12 32C
1950-05-02 11:31:12 37C
1951-12-02 11:31:12 23C
1950-12-02 11:31:12 47C
1950-12-02 11:31:12 27C
1951-06-02 11:31:12 48C
1951-07-02 11:31:12 45C
Each record is a timestamp plus a temperature reading.
The timestamp (yyyy-MM-dd HH:mm:ss) is followed by a tab character.
2. Output
Compute:
For each year from 1949 to 1951, the top K hottest days of that year (e.g. K = 5).
3. Approach
1. Sort by year in ascending order; within the same year, sort by temperature in descending order.
2. Group by year, so that each year is handled by one reduce task.
Map output: the key is a composite object (the keyPair class) that wraps the year and the temperature together; an illustration of the resulting order follows below.
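For the sample input, after the shuffle the composite keys arrive at the reducers sorted and grouped like this (years ascending, temperatures descending within a year):
1949  [36, 34]
1950  [47, 37, 32, 27]
1951  [48, 45, 23]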
4. Objectives
4.1 Practice custom sorting
4.2 Practice custom partitioning
4.3 Practice custom grouping
5. Software Environment
See the guide to setting up a fully distributed Hadoop 2.5.2 environment:
http://blog.csdn.net/liushahe2012/article/details/53364449
6. Implementation
6.1 The keyPair class
package com.hadoop;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import org.apache.hadoop.io.WritableComparable;
public class keyPair implements WritableComparable<keyPair> {
    private int year;
    private int temperature;
    public int getYear() {
        return year;
    }
    public void setYear(int year) {
        this.year = year;
    }
    public int getTemperature() {
        return temperature;
    }
    public void setTemperature(int temperature) {
        this.temperature = temperature;
    }
    // Override the following three methods
    @Override
    public void readFields(DataInput in) throws IOException {
        // Deserialization: read the fields back from the binary stream
        this.year = in.readInt();
        this.temperature = in.readInt();
    }
    @Override
    public void write(DataOutput out) throws IOException {
        // Serialization: write the fields to the binary stream
        out.writeInt(year);
        out.writeInt(temperature);
    }
    @Override
    public int compareTo(keyPair o) {
        int iRet = Integer.compare(year, o.getYear());
        if (iRet != 0) {
            return iRet;
        }
        return Integer.compare(temperature, o.getTemperature());
    }
    // Also override toString; it determines the key's text form in the output
    @Override
    public String toString() {
        return year + "\t" + temperature;
    }
    // Override hashCode as well
    @Override
    public int hashCode() {
        return new Integer(year + temperature).hashCode();
    }
}
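As a quick sanity check outside the cluster, a minimal round-trip sketch (the class name KeyPairRoundTrip is hypothetical, not part of the job) can confirm that write and readFields mirror each other:
package com.hadoop;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.IOException;
public class KeyPairRoundTrip {
    public static void main(String[] args) throws IOException {
        keyPair k = new keyPair();
        k.setYear(1950);
        k.setTemperature(47);
        // Serialize to a byte array
        ByteArrayOutputStream bos = new ByteArrayOutputStream();
        k.write(new DataOutputStream(bos));
        // Deserialize into a fresh instance
        keyPair restored = new keyPair();
        restored.readFields(new DataInputStream(new ByteArrayInputStream(bos.toByteArray())));
        System.out.println(restored); // prints: 1950	47
    }
}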
6.2 The custom sort class Sort
package com.hadoop;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;
public class Sort extends WritableComparator {
    public Sort() {
        super(keyPair.class, true);
    }
    @Override
    public int compare(WritableComparable a, WritableComparable b) {
        keyPair k1 = (keyPair) a;
        keyPair k2 = (keyPair) b;
        // Integer.compare gives ascending order
        int iRet = Integer.compare(k1.getYear(), k2.getYear());
        if (iRet != 0) {
            return iRet;
        }
        // Swap the arguments so temperatures sort in descending order
        return Integer.compare(k2.getTemperature(), k1.getTemperature());
    }
}
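A small standalone check (a hypothetical helper class, not part of the job) makes the resulting order concrete:
package com.hadoop;
public class SortOrderCheck {
    public static void main(String[] args) {
        keyPair a = new keyPair(); a.setYear(1950); a.setTemperature(32);
        keyPair b = new keyPair(); b.setYear(1950); b.setTemperature(47);
        keyPair c = new keyPair(); c.setYear(1949); c.setTemperature(36);
        Sort sort = new Sort();
        System.out.println(sort.compare(c, a) < 0); // true: 1949 sorts before 1950
        System.out.println(sort.compare(b, a) < 0); // true: within 1950, 47 sorts before 32 (descending)
    }
}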
6.3 The custom partition class Partition
package com.hadoop;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Partitioner;
public class Partition extends Partitioner<keyPair, Text> {
    // Custom partitioning method; num is the number of reduce tasks
    @Override
    public int getPartition(keyPair key, Text value, int num) {
        // Partition by year (the year is stored in the key):
        // multiply the year by a constant, then take it modulo num
        return (key.getYear() * 127) % num;
    }
}
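With the job's three reduce tasks (job.setNumReduceTasks(3) in RunJob below), this arithmetic gives each year its own reducer:
(1950 * 127) % 3 = 0  →  part-r-00000
(1951 * 127) % 3 = 1  →  part-r-00001
(1949 * 127) % 3 = 2  →  part-r-00002
This is why 1950 appears first and 1949 last when the part files are concatenated in section 7.5.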
6.4 The custom grouping class Group
package com.hadoop;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;
public class Group extends WritableComparator {
    public Group() {
        super(keyPair.class, true);
    }
    // During the secondary sort on the reduce side, group records by year only
    @Override
    public int compare(WritableComparable a, WritableComparable b) {
        keyPair k1 = (keyPair) a;
        keyPair k2 = (keyPair) b;
        return Integer.compare(k1.getYear(), k2.getYear());
    }
}
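With this comparator, keys such as (1950, 47) and (1950, 27) compare as equal during grouping, so all four 1950 records are delivered to a single reduce() call whose Iterable<Text> yields 47, 37, 32, 27 in sorted order. Without it, each distinct (year, temperature) pair would get its own reduce() call.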
6.5 The main program RunJob
package com.hadoop;
import java.io.IOException;
import java.util.Calendar;
import java.util.Date;
import java.text.SimpleDateFormat;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
public class RunJob {
    static SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
    static class MapperJob extends Mapper<LongWritable, Text, keyPair, Text> {
        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            // Process one line of input
            String line = value.toString();
            // Timestamp and temperature are separated by a tab
            String[] ss = line.split("\t");
            // Only process well-formed records
            if (ss.length == 2) {
                try {
                    // Parse the year out of the timestamp
                    Date date = sdf.parse(ss[0]);
                    Calendar c = Calendar.getInstance();
                    c.setTime(date);
                    int year = c.get(Calendar.YEAR);
                    // Parse the temperature (strip the trailing "C")
                    String t = ss[1].substring(0, ss[1].indexOf("C"));
                    // Build the composite key
                    keyPair k = new keyPair();
                    k.setYear(year);
                    k.setTemperature(Integer.parseInt(t));
                    // Emit (keyPair, temperature)
                    context.write(k, new Text(t));
                } catch (Exception e) {
                    e.printStackTrace();
                }
            }
        }
    }
    static class ReducerJob extends Reducer<keyPair, Text, keyPair, Text> {
        @Override
        protected void reduce(keyPair key, Iterable<Text> value, Context context)
                throws IOException, InterruptedException {
            // Emit every value as-is (no top-K cap yet; see the sketch after this listing)
            for (Text v : value) {
                context.write(key, v);
            }
        }
    }
    public static void main(String[] args) {
        Configuration conf = new Configuration();
        try {
            Job job = Job.getInstance(conf);
            job.setJobName("year_temperature");
            job.setJarByClass(RunJob.class);
            job.setMapperClass(MapperJob.class);
            job.setReducerClass(ReducerJob.class);
            job.setMapOutputKeyClass(keyPair.class);
            job.setMapOutputValueClass(Text.class);
            job.setNumReduceTasks(3);
            job.setPartitionerClass(Partition.class);
            job.setSortComparatorClass(Sort.class);
            job.setGroupingComparatorClass(Group.class);
            // Input and output paths in HDFS
            FileInputFormat.addInputPath(job, new Path("/usr/local/hadooptempdata/input/year-temp/"));
            FileOutputFormat.setOutputPath(job, new Path("/usr/local/hadooptempdata/output/year-temp/"));
            System.exit(job.waitForCompletion(true) ? 0 : 1);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}
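Note that ReducerJob emits every record, so the output in section 7.5 contains all days of each year, not just the top K promised in section 2. A minimal variation (a sketch assuming K = 5; TopKReducerJob is a hypothetical name, added next to ReducerJob inside RunJob) would cut each group off after K values. Because the Sort comparator delivers each year's temperatures hottest-first, the first K values of a group are exactly the top K days:
    static class TopKReducerJob extends Reducer<keyPair, Text, keyPair, Text> {
        private static final int K = 5; // K from the problem statement
        @Override
        protected void reduce(keyPair key, Iterable<Text> value, Context context)
                throws IOException, InterruptedException {
            int count = 0;
            // Values arrive in descending temperature order,
            // so the first K per year are the hottest K days
            for (Text v : value) {
                if (count++ >= K) {
                    break;
                }
                context.write(key, v);
            }
        }
    }
Swapping it in would just be job.setReducerClass(TopKReducerJob.class) in main().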
7. Running the Job
7.1 Upload the jar
Package the program into a jar and upload it to a cluster node:
root@node1:/usr/local/hadoop/hadoop-2.5.2/myJar# ll
total 20
drwxr-xr-x 2 root root 4096 Dec 16 22:20 ./
drwxr-xr-x 11 10021 10021 4096 Dec 16 22:19 ../
-rw-r--r-- 1 root root 10201 Dec 16 22:22 year_temp.jar
7.2 Upload the input file data to HDFS
Create the input directory:
root@node1:/usr/local/hadoop/hadoop-2.5.2/myJar# hdfs dfs -mkdir -p /usr/local/hadooptempdata/input/year-temp
Upload the file:
root@node1:/usr/local/hadoop/hadoop-2.5.2/myJar# hdfs dfs -put /usr/local/data /usr/local/hadooptempdata/input/year-temp
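Optionally, verify the upload (listing output not shown):
root@node1:/usr/local/hadoop/hadoop-2.5.2/myJar# hdfs dfs -ls /usr/local/hadooptempdata/input/year-temp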
7.3 Run the job
root@node1:/usr/local/hadoop/hadoop-2.5.2/myJar# hadoop jar year_temp.jar com.hadoop.RunJob
16/12/17 21:15:18 INFO client.RMProxy: Connecting to ResourceManager at node1/192.168.233.129:8032
16/12/17 21:15:20 WARN mapreduce.JobSubmitter: Hadoop command-line option parsing not performed. Implement the Tool interface and execute your application with ToolRunner to remedy this.
16/12/17 21:15:21 INFO input.FileInputFormat: Total input paths to process : 1
16/12/17 21:15:22 INFO mapreduce.JobSubmitter: number of splits:1
16/12/17 21:15:22 INFO mapreduce.JobSubmitter: Submitting tokens for job: job_1481977257660_0004
16/12/17 21:15:23 INFO impl.YarnClientImpl: Submitted application application_1481977257660_0004
16/12/17 21:15:24 INFO mapreduce.Job: The url to track the job: http://node1:8088/proxy/application_1481977257660_0004/
16/12/17 21:15:24 INFO mapreduce.Job: Running job: job_1481977257660_0004
16/12/17 21:16:06 INFO mapreduce.Job: Job job_1481977257660_0004 running in uber mode : false
16/12/17 21:16:06 INFO mapreduce.Job:  map 0% reduce 0%
16/12/17 21:16:46 INFO mapreduce.Job:  map 100% reduce 0%
16/12/17 21:17:14 INFO mapreduce.Job:  map 100% reduce 33%
16/12/17 21:17:26 INFO mapreduce.Job:  map 100% reduce 67%
16/12/17 21:17:29 INFO mapreduce.Job:  map 100% reduce 100%
16/12/17 21:17:30 INFO mapreduce.Job: Job job_1481977257660_0004 completed successfully
16/12/17 21:17:30 INFO mapreduce.Job: Counters: 50
File System Counters
	FILE: Number of bytes read=135
	FILE: Number of bytes written=396509
	FILE: Number of read operations=0
	FILE: Number of large read operations=0
	FILE: Number of write operations=0
	HDFS: Number of bytes read=385
	HDFS: Number of bytes written=99
	HDFS: Number of read operations=12
	HDFS: Number of large read operations=0
	HDFS: Number of write operations=6
Job Counters
	Killed reduce tasks=1
	Launched map tasks=1
	Launched reduce tasks=4
	Data-local map tasks=1
	Total time spent by all maps in occupied slots (ms)=36763
	Total time spent by all reduces in occupied slots (ms)=108155
	Total time spent by all map tasks (ms)=36763
	Total time spent by all reduce tasks (ms)=108155
	Total vcore-seconds taken by all map tasks=36763
	Total vcore-seconds taken by all reduce tasks=108155
	Total megabyte-seconds taken by all map tasks=37645312
	Total megabyte-seconds taken by all reduce tasks=110750720
Map-Reduce Framework
	Map input records=10
	Map output records=9
	Map output bytes=99
	Map output materialized bytes=135
	Input split bytes=127
	Combine input records=0
	Combine output records=0
	Reduce input groups=3
	Reduce shuffle bytes=135
	Reduce input records=9
	Reduce output records=9
	Spilled Records=18
	Shuffled Maps =3
	Failed Shuffles=0
	Merged Map outputs=3
	GC time elapsed (ms)=796
	CPU time spent (ms)=8770
	Physical memory (bytes) snapshot=502833152
	Virtual memory (bytes) snapshot=7552282624
	Total committed heap usage (bytes)=174166016
Shuffle Errors
	BAD_ID=0
	CONNECTION=0
	IO_ERROR=0
	WRONG_LENGTH=0
	WRONG_MAP=0
	WRONG_REDUCE=0
File Input Format Counters
	Bytes Read=258
File Output Format Counters
	Bytes Written=99
7.4 Monitor the MapReduce job from the web UI
Log on to the host running the ResourceManager and open the tracking URL shown above to follow the job's progress.
7.5 View the output
root@node1:/usr/local/hadoop/hadoop-2.5.2/myJar# hdfs dfs -cat /usr/local/hadooptempdata/output/year-temp/*
1950 47 47
1950 37 37
1950 32 32
1950 27 27
1951 48 48
1951 45 45
1951 23 23
1949 36 36
1949 34 34
Each line is the keyPair's toString() (year, tab, temperature) followed by the value (the temperature again). The three blocks are the three part files concatenated in reducer order (1950, 1951, 1949, as determined by the Partition class), and within each year the temperatures are descending, exactly as the Sort comparator guarantees.