description
航空数据由头部标题和数据部分组成,数据部分每行为一条记录,每列标示的信息与表头相对应,记录包含了某一年每个月的飞机飞行数据.现需将该文件中的记录按照月份划分为12个文件.
file content
Year,Month,DayofMonth,DayOfWeek,DepTime,CRSDepTime,ArrTime,CRSArrTime,UniqueCarrier,FlightNum,TailNum,ActualElapsedTime,CRSElapsedTime,AirTime,ArrDelay,DepDelay,Origin,Dest,Distance,TaxiIn,TaxiOut,Cancelled,CancellationCode,Diverted,CarrierDelay,WeatherDelay,NASDelay,SecurityDelay,LateAircraftDelay
1995,1,6,5,657,645,952,937,UA,482,N7298U,115,112,83,15,12,ORD,PHL,678,7,25,0,NA,0,NA,NA,NA,NA,NA
1995,1,7,6,648,645,938,937,UA,482,N7449U,110,112,88,1,3,ORD,PHL,678,5,17,0,NA,0,NA,NA,NA,NA,NA
1995,1,8,7,649,645,932,937,UA,482,N7453U,103,112,83,-5,4,ORD,PHL,678,3,17,0,NA,0,NA,NA,NA,NA,NA
1995,1,9,1,645,645,928,937,UA,482,N7288U,103,112,84,-9,0,ORD,PHL,678,3,16,0,NA,0,NA,NA,NA,NA,NA
1995,1,10,2,645,645,931,937,UA,482,N7275U,106,112,82,-6,0,ORD,PHL,678,6,18,0,NA,0,NA,NA,NA,NA,NA
1995,1,11,3,646,645,929,937,UA,482,N7281U,103,112,85,-8,1,ORD,PHL,678,5,13,0,NA,0,NA,NA,NA,NA,NA
1995,1,12,4,NA,645,NA,937,UA,482,UNKNOW,NA,112,45,NA,NA,ORD,PHL,678,6,10,1,NA,0,NA,NA,NA,NA,NA
1995,1,13,5,644,645,953,937,UA,482,N7257U,129,112,110,16,-1,ORD,PHL,678,5,14,0,NA,0,NA,NA,NA,NA,NA
1995,1,14,6,644,645,938,937,UA,482,N7282U,114,112,94,1,-1,ORD,PHL,678,5,15,0,NA,0,NA,NA,NA,NA,NA
source code
package org.apress.prohadoop.c3;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Partitioner;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import java.io.IOException;
/**
 * MapReduce job that splits an airline data file into separate outputs by month.
 *
 * <p>The mapper keys every non-header record by its month, the partitioner routes
 * each month to its own reduce task, and the reducer writes the records back out
 * unchanged. With the configured {@value #MONTHS_PER_YEAR} reduce tasks this yields
 * one output file per month.
 *
 * <p>Usage: {@code SplitByMonthMRJob [generic options] <input path> <output path>}
 */
public class SplitByMonthMRJob extends Configured implements Tool {

    /** Number of reduce tasks requested by {@link #run(String[])}: one per month. */
    private static final int MONTHS_PER_YEAR = 12;

    /**
     * Emits each data record keyed by its month.
     *
     * <p>Lines that {@code AirlineDataUtils.isHeader} reports as the header are skipped.
     */
    public static class SplitByMonthMapper extends Mapper<LongWritable, Text, IntWritable, Text> {

        /** Reused output key; avoids allocating a new writable for every record. */
        private final IntWritable monthKey = new IntWritable();

        /**
         * Parses the month of one input line and writes {@code (month, line)}.
         *
         * @param key     input key supplied by the input format (unused)
         * @param value   one line of the input file
         * @param context task context used to emit the output pair
         * @throws IOException              if writing the output pair fails
         * @throws InterruptedException     if the task is interrupted while writing
         * @throws NumberFormatException    if the month field is not an integer
         */
        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            // Skip the header line of the file.
            if (AirlineDataUtils.isHeader(value)) {
                return;
            }
            String[] fields = value.toString().split(",");
            monthKey.set(Integer.parseInt(AirlineDataUtils.getMonth(fields)));
            context.write(monthKey, value);
        }
    }

    /**
     * Routes every record of a month to the same reduce task.
     */
    public static class SplitByMonthPartitioner extends Partitioner<IntWritable, Text> {

        /**
         * Maps a month to a partition index.
         *
         * <p>For months 1..12 and 12 partitions this is {@code month - 1}. The result is
         * wrapped into {@code [0, numPartitions)} so that an unexpected month value or a
         * job running with fewer reduce tasks cannot produce an illegal partition number.
         *
         * @param month         month key emitted by the mapper
         * @param value         the record (unused)
         * @param numPartitions number of partitions, i.e. reduce tasks
         * @return partition index in {@code [0, numPartitions)}
         */
        @Override
        public int getPartition(IntWritable month, Text value, int numPartitions) {
            return Math.floorMod(month.get() - 1, numPartitions);
        }
    }

    /**
     * Writes every record it receives back out with a {@link NullWritable} key,
     * leaving the record text unchanged.
     */
    public static class SplitByMonthReducer extends Reducer<IntWritable, Text, NullWritable, Text> {

        /**
         * Emits all records grouped under one month key.
         *
         * @param key     month key (not written to the output)
         * @param values  records for that month
         * @param context task context used to emit the records
         * @throws IOException          if writing a record fails
         * @throws InterruptedException if the task is interrupted while writing
         */
        @Override
        protected void reduce(IntWritable key, Iterable<Text> values, Context context)
                throws IOException, InterruptedException {
            for (Text record : values) {
                context.write(NullWritable.get(), record);
            }
        }
    }

    /**
     * Configures and runs the job.
     *
     * @param allArgs command-line arguments; after generic Hadoop options are removed,
     *                the first remaining argument is the input path and the second is
     *                the output path
     * @return 0 if the job succeeded, 1 if it failed, 2 if the paths are missing
     * @throws Exception if job setup or submission fails
     */
    @Override
    public int run(String[] allArgs) throws Exception {
        // Parse generic options before the Job is created so that they are already
        // present in the configuration the Job is built from.
        String[] args = new GenericOptionsParser(getConf(), allArgs).getRemainingArgs();
        if (args.length < 2) {
            System.err.println(
                    "Usage: SplitByMonthMRJob [generic options] <input path> <output path>");
            ToolRunner.printGenericCommandUsage(System.err);
            return 2;
        }

        Job job = Job.getInstance(getConf());
        job.setJarByClass(SplitByMonthMRJob.class);

        job.setInputFormatClass(TextInputFormat.class);
        job.setOutputFormatClass(TextOutputFormat.class);

        job.setMapperClass(SplitByMonthMapper.class);
        job.setMapOutputKeyClass(IntWritable.class);
        job.setMapOutputValueClass(Text.class);

        job.setPartitionerClass(SplitByMonthPartitioner.class);

        job.setReducerClass(SplitByMonthReducer.class);
        job.setOutputKeyClass(NullWritable.class);
        job.setOutputValueClass(Text.class);
        job.setNumReduceTasks(MONTHS_PER_YEAR);

        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        return job.waitForCompletion(true) ? 0 : 1;
    }

    /**
     * Command-line entry point; exits the JVM with the job's status code.
     *
     * @param args command-line arguments, see {@link #run(String[])}
     */
    public static void main(String[] args) {
        int exitCode;
        try {
            exitCode = ToolRunner.run(new SplitByMonthMRJob(), args);
        } catch (Exception e) {
            e.printStackTrace();
            exitCode = 1;
        }
        // Propagate the result so that shells and schedulers can detect a failed job.
        System.exit(exitCode);
    }
}