Hadoop 计算飞行次数与平均飞行时间

先给一个飞行记录数据集,每行格式:

  • - <src>_<dest>_<time>

  • - 3 个部分由空格隔开(上面格式中的 `_` 表示空格)

  • - src 和 dest 为两个字符串,中间没有空格

  • - 三部分表示 飞行起始地,飞行目的地,飞行时间

输入可能有噪音,如果一行不符合上述格式应当丢弃。

问题是对数据集进行 Map-Reduce 编程,输出:

  • -<src>_<dest>_<cnt>_<avg_time>
  • cnt 表示从 src 到 dest 的飞行记录总数,avg_time 表示这些飞行记录的平均飞行时间。注意:src 和 dest 颠倒视为不同的记录。

本题目根据国科大 (UCAS) 大数据系统课程的第二次作业完成,并得到了满分。运行命令如下,其中 input file 是 HDFS 上对应的输入文件;重点在于 mapper、combiner、reducer 的设计思想。

$ hadoop jar ./Hw2Part1.jar <input file> <output directory>

运行代码部分,包含了标准的 javadoc 注释:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.DoubleWritable;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;

import java.io.IOException;
import java.text.DecimalFormat;
import java.text.DecimalFormatSymbols;
import java.util.Locale;
import java.util.StringTokenizer;

/**
 * MapReduce job that counts flight records per route and computes the
 * average flight time of each route.
 *
 * <p>Every valid input line has exactly three whitespace-separated tokens:
 * {@code <src> <dest> <time>}. Lines that do not match are discarded.
 * For every distinct {@code (src, dest)} pair the job emits
 * {@code <src> <dest> <cnt> <avg_time>}, where {@code avg_time} is printed
 * with three decimal places. {@code (a, b)} and {@code (b, a)} are different
 * routes.
 *
 * <p>Intermediate values exchanged between mapper, combiner and reducer are
 * {@link Text} values of the form {@code "<count> <timeSum>"}.
 *
 * @author T2777
 * @version 1.0
 */
public class Hw2Part1 {

    /**
     * Running totals (record count and summed flight time) for one route.
     *
     * <p>Shared by the combiner and the reducer, which both have to parse and
     * add up the same {@code "<count> <timeSum>"} partial values.
     */
    private static final class RouteTotals {
        /** Number of flight records seen so far; {@code long} so large inputs cannot overflow. */
        private long count;
        /** Sum of the flight times of those records. */
        private double totalTime;

        /**
         * Parses and accumulates every partial value of one key.
         *
         * @param partials values of the form {@code "<count> <timeSum>"}.
         * @return the totals over all {@code partials}.
         * @throws NumberFormatException if a value does not hold two numeric tokens
         *         in the expected order.
         */
        static RouteTotals sumOf(Iterable<Text> partials) {
            RouteTotals totals = new RouteTotals();
            for (Text partial : partials) {
                StringTokenizer fields = new StringTokenizer(partial.toString());
                totals.count += Long.parseLong(fields.nextToken());
                totals.totalTime += Double.parseDouble(fields.nextToken());
            }
            return totals;
        }
    }

    /**
     * Mapper that validates each input line and emits one
     * {@code ("<src> <dest>", "1 <time>")} pair per valid record.
     */
    public static class ATimeCountMapper extends Mapper<Object, Text, Text, Text> {

        /** Reusable output key: {@code "<src> <dest>"}. */
        private final Text record = new Text();
        /** Reusable output value: {@code "1 <time>"}. */
        private final Text cntAndTime = new Text();

        /**
         * Parses one input line and emits its route and flight time, or
         * nothing if the line is malformed.
         *
         * @param key the input key supplied by the input format (unused). With
         *        Hadoop's default {@code TextInputFormat} this is the byte
         *        offset of the line, not a line number.
         * @param value the content of one input line.
         * @param context the task context used to emit output.
         * @throws IOException if writing the output fails.
         * @throws InterruptedException if the task is interrupted.
         */
        @Override
        protected void map(Object key, Text value, Context context) throws IOException, InterruptedException {
            StringTokenizer tokens = new StringTokenizer(value.toString());
            if (tokens.countTokens() != 3) {
                // Noise: not exactly <src> <dest> <time>.
                return;
            }
            String src = tokens.nextToken();
            String dst = tokens.nextToken();

            double time;
            try {
                time = Double.parseDouble(tokens.nextToken());
            } catch (NumberFormatException e) {
                // Noise: the third token is not a number.
                return;
            }
            if (Double.isNaN(time) || Double.isInfinite(time)) {
                // parseDouble accepts the literals "NaN" and "Infinity"; a single
                // such record would turn the whole route's average into NaN or
                // Infinity, so treat it as noise as well.
                return;
            }

            record.set(src + " " + dst);
            cntAndTime.set("1 " + time);
            context.write(record, cntAndTime);
        }
    }

    /**
     * Combiner that pre-aggregates the mapper output of one route into a
     * single {@code "<count> <timeSum>"} value.
     *
     * <p>Its output has the same format as its input, so the reducer can
     * consume mapper output and combiner output interchangeably.
     */
    public static class ATimeCountCombiner extends Reducer<Text, Text, Text, Text> {

        /** Reusable output value: {@code "<count> <timeSum>"}. */
        private final Text result = new Text();

        /**
         * Sums the counts and flight times of all partial values of a route.
         *
         * @param key the route, {@code "<src> <dest>"}.
         * @param values partial values of the form {@code "<count> <timeSum>"}.
         * @param context the task context used to emit output.
         * @throws IOException if writing the output fails.
         * @throws InterruptedException if the task is interrupted.
         */
        @Override
        protected void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
            RouteTotals totals = RouteTotals.sumOf(values);
            result.set(totals.count + " " + totals.totalTime);
            context.write(key, result);
        }
    }

    /**
     * Reducer that produces the final {@code "<count> <avg_time>"} value for
     * every route.
     */
    public static class ATimeCountReducer extends Reducer<Text, Text, Text, Text> {

        /** Reusable output value: {@code "<count> <avg_time>"}. */
        private final Text result = new Text();

        /**
         * Formats the average with exactly three decimals. The pattern
         * {@code 0.000} keeps the leading zero for values below one (the
         * pattern {@code #.000} would print {@code .500}), and the ROOT locale
         * symbols pin the decimal separator to {@code '.'} regardless of the
         * JVM default locale. Created once per reducer instance; a reducer
         * instance is only used by its own task.
         */
        private final DecimalFormat averageFormat =
                new DecimalFormat("0.000", DecimalFormatSymbols.getInstance(Locale.ROOT));

        /**
         * Adds up all partial values of a route and emits the record count
         * together with the average flight time.
         *
         * @param key the route, {@code "<src> <dest>"}.
         * @param values partial values of the form {@code "<count> <timeSum>"}.
         * @param context the task context used to emit output.
         * @throws IOException if writing the output fails.
         * @throws InterruptedException if the task is interrupted.
         */
        @Override
        protected void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
            RouteTotals totals = RouteTotals.sumOf(values);
            double average = totals.totalTime / totals.count;
            result.set(totals.count + " " + averageFormat.format(average));
            context.write(key, result);
        }
    }

    /**
     * Configures and runs the job.
     *
     * <p>After Hadoop's generic options are stripped, all remaining arguments
     * but the last are input paths and the last one is the output directory.
     * The process exits with status 2 on a usage error, 0 if the job
     * succeeds and 1 if it fails.
     *
     * @param args command-line arguments: {@code <in> [<in>...] <out>}.
     * @throws Exception if the job cannot be set up or run.
     */
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
        if (otherArgs.length < 2) {
            System.err.println("Usage: hw2part1 <in> [<in>...] <out>");
            System.exit(2);
        }

        Job job = Job.getInstance(conf, "average time count");
        job.setJarByClass(Hw2Part1.class);

        job.setMapperClass(ATimeCountMapper.class);
        job.setCombinerClass(ATimeCountCombiner.class);
        job.setReducerClass(ATimeCountReducer.class);

        // Map output and final output both use Text keys and Text values.
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);

        int lastIndex = otherArgs.length - 1;
        for (int i = 0; i < lastIndex; ++i) {
            FileInputFormat.addInputPath(job, new Path(otherArgs[i]));
        }
        FileOutputFormat.setOutputPath(job, new Path(otherArgs[lastIndex]));

        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值