MapReduce Partitioner

description

The airline data consists of a header line followed by a data section. Each line in the data section is one record, and each column matches a field in the header; the records hold flight data for every month of a given year. The task is to split the records in this file into 12 files, one per month.

file content

Year,Month,DayofMonth,DayOfWeek,DepTime,CRSDepTime,ArrTime,CRSArrTime,UniqueCarrier,FlightNum,TailNum,ActualElapsedTime,CRSElapsedTime,AirTime,ArrDelay,DepDelay,Origin,Dest,Distance,TaxiIn,TaxiOut,Cancelled,CancellationCode,Diverted,CarrierDelay,WeatherDelay,NASDelay,SecurityDelay,LateAircraftDelay
1995,1,6,5,657,645,952,937,UA,482,N7298U,115,112,83,15,12,ORD,PHL,678,7,25,0,NA,0,NA,NA,NA,NA,NA
1995,1,7,6,648,645,938,937,UA,482,N7449U,110,112,88,1,3,ORD,PHL,678,5,17,0,NA,0,NA,NA,NA,NA,NA
1995,1,8,7,649,645,932,937,UA,482,N7453U,103,112,83,-5,4,ORD,PHL,678,3,17,0,NA,0,NA,NA,NA,NA,NA
1995,1,9,1,645,645,928,937,UA,482,N7288U,103,112,84,-9,0,ORD,PHL,678,3,16,0,NA,0,NA,NA,NA,NA,NA
1995,1,10,2,645,645,931,937,UA,482,N7275U,106,112,82,-6,0,ORD,PHL,678,6,18,0,NA,0,NA,NA,NA,NA,NA
1995,1,11,3,646,645,929,937,UA,482,N7281U,103,112,85,-8,1,ORD,PHL,678,5,13,0,NA,0,NA,NA,NA,NA,NA
1995,1,12,4,NA,645,NA,937,UA,482,UNKNOW,NA,112,45,NA,NA,ORD,PHL,678,6,10,1,NA,0,NA,NA,NA,NA,NA
1995,1,13,5,644,645,953,937,UA,482,N7257U,129,112,110,16,-1,ORD,PHL,678,5,14,0,NA,0,NA,NA,NA,NA,NA
1995,1,14,6,644,645,938,937,UA,482,N7282U,114,112,94,1,-1,ORD,PHL,678,5,15,0,NA,0,NA,NA,NA,NA,NA

source code

package org.apress.prohadoop.c3;

import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Partitioner;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

import java.io.IOException;

/*
 * Splits the airline data file into multiple files by month,
 * one output file per month.
 */
public class SplitByMonthMRJob extends Configured implements Tool {

    public static class SplitByMonthMapper extends Mapper<LongWritable, Text, IntWritable, Text> {
        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            // Skip the header line at the top of the file.
            if (AirlineDataUtils.isHeader(value)) {
                return;
            }
            String[] fields = value.toString().split(",");
            int month = Integer.parseInt(AirlineDataUtils.getMonth(fields));
            context.write(new IntWritable(month), value);
        }
    }

    public static class SplitByMonthPartitioner extends Partitioner<IntWritable, Text> {
        @Override
        public int getPartition(IntWritable month, Text value, int numPartitions) {
            // Months 1-12 map to partitions 0-11, so each month goes to its own reducer.
            return (month.get() - 1);
        }
    }

    public static class SplitByMonthReducer extends Reducer<IntWritable, Text, NullWritable, Text> {
        @Override
        protected void reduce(IntWritable key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
            // Emit each record unchanged; the month key is dropped from the output.
            for (Text txt : values) {
                context.write(NullWritable.get(), txt);
            }
        }
    }

    public int run(String[] allArgs) throws Exception {
        Job job = Job.getInstance(getConf());
        job.setJarByClass(SplitByMonthMRJob.class);

        job.setOutputKeyClass(NullWritable.class);
        job.setOutputValueClass(Text.class);

        job.setMapOutputKeyClass(IntWritable.class);
        job.setMapOutputValueClass(Text.class);

        job.setInputFormatClass(TextInputFormat.class);
        job.setOutputFormatClass(TextOutputFormat.class);

        job.setMapperClass(SplitByMonthMapper.class);
        job.setPartitionerClass(SplitByMonthPartitioner.class);
        job.setReducerClass(SplitByMonthReducer.class);

        job.setNumReduceTasks(12); // one reducer, and therefore one output file, per month

        String[] args = new GenericOptionsParser(getConf(), allArgs).getRemainingArgs();
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        return job.waitForCompletion(true) ? 0 : 1;
    }

    public static void main(String[] args) {
        try {
            ToolRunner.run(new SplitByMonthMRJob(), args);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}
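
The listing references an AirlineDataUtils helper from the same package that is not shown above. Below is a minimal sketch of what it might look like, assuming the helper only needs to detect the header line and return the Month column (field index 1); the field index and the header check are assumptions for illustration, not necessarily the original implementation.

package org.apress.prohadoop.c3;

import org.apache.hadoop.io.Text;

/*
 * Hypothetical sketch of the helper used by SplitByMonthMRJob;
 * the real class may differ.
 */
public class AirlineDataUtils {

    // Index of the Month column in the CSV (second field).
    private static final int MONTH_INDEX = 1;

    // A line is the header if it starts with the "Year" column name.
    public static boolean isHeader(Text line) {
        return line.toString().startsWith("Year,");
    }

    // Returns the Month field from an already split record.
    public static String getMonth(String[] fields) {
        return fields[MONTH_INDEX];
    }
}

With 12 reduce tasks and the custom partitioner, month m is sent to partition m-1, so January records land in part-r-00000 and December records in part-r-00011. Assuming the job is packaged as prohadoop.jar (a hypothetical jar name), it could be run with:

hadoop jar prohadoop.jar org.apress.prohadoop.c3.SplitByMonthMRJob <input-path> <output-path>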