Hadoop 计算飞行次数与平均飞行时间

最新推荐文章于 2022-02-17 19:26:32 发布

T2777

最新推荐文章于 2022-02-17 19:26:32 发布

阅读量324

点赞数

分类专栏：后端文章标签： hadoop 大数据 map-reduce UCAS

本文链接：https://blog.csdn.net/T2777/article/details/118540267

版权

后端专栏收录该内容

10 篇文章 0 订阅

订阅专栏

先给一个飞行记录数据集，每行格式：

- <src>_<dest>_<time>
- 3 个部分由空格隔开（上面格式中的下划线 _ 表示空格）
- src 和 dest 为两个字符串，中间没有空格
- 三部分表示飞行起始地，飞行目的地，飞行时间

输入可能有噪音，如果一行不符合上述格式应当丢弃。

问题是对数据集进行 Map-Reduce 编程，输出：

- <src>_<dest>_<cnt>_<avg_time>
cnt 表示从 src 到 dest 的飞行记录总数，avg_time 表示这些飞行记录的平均飞行时间。注意：src 和 dest 颠倒视为不同的记录。

本题目来自国科大 (UCAS) 大数据系统课程的第二次作业，本题得到了满分。运行命令如下，其中 <input file> 是 HDFS 上对应的输入文件；重点在于 mapper、combiner、reducer 的设计思想。

$ hadoop jar ./Hw2Part1.jar <input file> <output directory>

运行代码部分，包含了标准的 javadoc 注释：

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.DoubleWritable;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;

import java.io.IOException;
import java.text.DecimalFormat;
import java.text.DecimalFormatSymbols;
import java.util.Locale;
import java.util.StringTokenizer;

/**
 * The {@code Hw2Part1} class implements Part 1 of the second homework.
 *
 * <p>The {@code Hw2Part1} class can be used to process the input file
 * and output the corresponding calculation results to the specified
 * location.
 *
 * <p>This program mainly uses map-reduce technology to obtain the expected data.
 *
 * @author T2777
 * @version 1.0
 */

public class Hw2Part1 {

    /**
     * Number of whitespace-separated fields in a well-formed input record
     * ({@code <src> <dest> <time>}).
     */
    private static final int RECORD_FIELD_COUNT = 3;

    /**
     * Separator used inside the composite key ({@code "src dst"}) and inside
     * the intermediate/final value ({@code "cnt time"}).
     */
    private static final String FIELD_SEPARATOR = " ";

    /**
     * Running totals for one {@code (src, dst)} key: the number of flight
     * records seen and the sum of their flight times.
     *
     * <p>Shared by the combiner and the reducer so that both parse the
     * intermediate {@code "cnt time"} values in exactly the same way.
     */
    private static final class FlightTotals {
        /** Number of records; {@code long} so very large groups cannot overflow. */
        private long count;
        /** Sum of the flight times of all records counted so far. */
        private double time;

        /**
         * Parses every {@code "cnt time"} value and accumulates the totals.
         *
         * @param values intermediate values, each holding a count and a time
         *               separated by whitespace.
         * @return the accumulated totals over all given values.
         * @throws NumberFormatException if a count or time token is not numeric.
         * @throws java.util.NoSuchElementException if a value has fewer than two tokens.
         */
        static FlightTotals of(Iterable<Text> values) {
            FlightTotals totals = new FlightTotals();
            for (Text value : values) {
                StringTokenizer itr = new StringTokenizer(value.toString());
                totals.count += Long.parseLong(itr.nextToken());
                totals.time += Double.parseDouble(itr.nextToken());
            }
            return totals;
        }
    }

    /**
     * The {@code ATimeCountMapper} class realizes the map function.
     *
     * <p>For every well-formed input line {@code <src> <dest> <time>} it emits
     * the key {@code "src dest"} with the value {@code "1 time"}. Lines that
     * do not consist of exactly three whitespace-separated tokens, or whose
     * third token is not a finite number, are treated as noise and dropped.
     */
    public static class ATimeCountMapper extends Mapper<Object, Text, Text, Text> {

        /**
         * Reusable holder for the output key ({@code "src dst"}).
         */
        private final Text record = new Text();
        /**
         * Reusable holder for the output value ({@code "1 time"}).
         */
        private final Text cntAndTime = new Text();

        /**
         * Parses one input line and emits a {@code ("src dst", "1 time")} pair
         * if the line is well formed.
         *
         * @param key the input key supplied by the input format (with the
         *            default {@code TextInputFormat} this is the byte offset
         *            of the line, not a line number); it is not used.
         * @param value the content of one input line.
         * @param context the context used to emit the output pair.
         * @throws IOException if writing to the context fails.
         * @throws InterruptedException if writing to the context is interrupted.
         */
        @Override
        protected void map(Object key, Text value, Context context) throws IOException, InterruptedException {
            StringTokenizer itr = new StringTokenizer(value.toString());
            if (itr.countTokens() != RECORD_FIELD_COUNT) {
                return;
            }
            String src = itr.nextToken();
            String dst = itr.nextToken();

            double time;
            try {
                time = Double.parseDouble(itr.nextToken());
            } catch (NumberFormatException ignored) {
                // Third field is not a number: the line is noise, drop it.
                return;
            }
            // Double.parseDouble accepts "NaN" and "Infinity"; a single such
            // record would turn the whole average of its route into NaN or
            // Infinity, so it is dropped as noise as well.
            if (Double.isNaN(time) || Double.isInfinite(time)) {
                return;
            }

            record.set(src + FIELD_SEPARATOR + dst);
            cntAndTime.set(1 + FIELD_SEPARATOR + time);
            context.write(record, cntAndTime);
        }
    }

    /**
     * The {@code ATimeCountCombiner} class realizes the combiner function.
     *
     * <p>It pre-aggregates the {@code "cnt time"} values of one key into a
     * single {@code "cnt time"} value holding the summed count and the summed
     * time. It deliberately forwards the sum rather than an average, so its
     * output has the same shape as the mapper output and the reducer can
     * consume either.
     */
    public static class ATimeCountCombiner extends Reducer<Text, Text, Text, Text> {
        /**
         * Reusable holder for the output value ({@code "cnt time"}).
         */
        private final Text result = new Text();

        /**
         * Sums the counts and times of all values of one key and emits them
         * as a single {@code "cnt time"} value under the same key.
         *
         * @param key the {@code "src dst"} key obtained from the mapper.
         * @param values the {@code "cnt time"} values obtained for that key.
         * @param context the context used to emit the output pair.
         * @throws IOException if writing to the context fails.
         * @throws InterruptedException if writing to the context is interrupted.
         */
        @Override
        protected void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
            FlightTotals totals = FlightTotals.of(values);
            result.set(totals.count + FIELD_SEPARATOR + totals.time);
            context.write(key, result);
        }
    }

    /**
     * The {@code ATimeCountReducer} class realizes the reducer function.
     *
     * <p>It sums the {@code "cnt time"} values of one key and emits the total
     * record count together with the average flight time, formatted with
     * exactly three decimal places.
     */
    public static class ATimeCountReducer extends Reducer<Text, Text, Text, Text> {
        /**
         * Reusable holder for the output value ({@code "cnt avg_time"}).
         */
        private final Text result = new Text();

        /**
         * Formatter for the average time. The pattern {@code "0.000"} always
         * prints at least one integer digit (so 0.5 becomes {@code "0.500"},
         * not {@code ".500"}), and the {@link Locale#ROOT} symbols pin the
         * decimal separator to {@code '.'} regardless of the JVM's default
         * locale. Kept as an instance field (not static) because
         * {@link DecimalFormat} is not synchronized.
         */
        private final DecimalFormat averageFormat =
                new DecimalFormat("0.000", DecimalFormatSymbols.getInstance(Locale.ROOT));

        /**
         * Sums the counts and times of all values of one key and emits
         * {@code "cnt avg_time"} under the same key, where {@code avg_time}
         * is the summed time divided by the summed count.
         *
         * @param key the {@code "src dst"} key.
         * @param values the {@code "cnt time"} values for that key, coming
         *               from the mapper or the combiner.
         * @param context the context used to emit the output pair.
         * @throws IOException if writing to the context fails.
         * @throws InterruptedException if writing to the context is interrupted.
         */
        @Override
        protected void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
            FlightTotals totals = FlightTotals.of(values);
            double average = totals.time / totals.count;
            result.set(totals.count + FIELD_SEPARATOR + averageFormat.format(average));
            context.write(key, result);
        }
    }


    /**
     * Configures and runs the job.
     *
     * <p>After generic Hadoop options have been stripped, every remaining
     * argument except the last is added as an input path and the last one is
     * used as the output path. The process exits with status 2 on a usage
     * error, 0 if the job completes successfully and 1 otherwise.
     *
     * @param args command line arguments: {@code <in> [<in>...] <out>},
     *             optionally preceded by generic Hadoop options.
     * @throws Exception if the job cannot be configured or submitted.
     */
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
        if (otherArgs.length < 2) {
            System.err.println("Usage: hw2part1 <in> [<in>...] <out>");
            System.exit(2);
        }

        Job job = Job.getInstance(conf, "average time count");

        job.setJarByClass(Hw2Part1.class);

        job.setMapperClass(Hw2Part1.ATimeCountMapper.class);
        job.setCombinerClass(Hw2Part1.ATimeCountCombiner.class);
        job.setReducerClass(Hw2Part1.ATimeCountReducer.class);

        // Mapper, combiner and reducer all emit (Text, Text), so the job
        // output classes also describe the intermediate map output.
        // NOTE(review): no output format is set here, so the default text
        // output separates key and value with a tab ("src dst<TAB>cnt avg").
        // If a single space is required, pass
        // -D mapreduce.output.textoutputformat.separator=" " -- confirm.
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);

        int lastIndex = otherArgs.length - 1;
        for (int i = 0; i < lastIndex; ++i) {
            FileInputFormat.addInputPath(job, new Path(otherArgs[i]));
        }

        FileOutputFormat.setOutputPath(job, new Path(otherArgs[lastIndex]));

        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }


}