先给一个飞行记录数据集,每行格式:
-
- <src>_<dest>_<time>
-
- 3 个部分由空格隔开
-
- src 和 dest 为两个字符串,中间没有空格
-
- 三部分表示 飞行起始地,飞行目的地,飞行时间
输入可能有噪音,如果一行不符合上述格式应当丢弃。
问题是对数据集进行 Map-Reduce 编程,输出:
- <src>_<dest>_<cnt>_<avg_time>
- cnt 表示从 src 到 dest 的飞行记录总数,avg_time 表示这些飞行记录的平均飞行时间。注意 src 和 dest 颠倒视为不同的记录。
本题目根据国科大 (UCAS) 大数据系统课程的第二次作业完成,并得到了满分。重点在于 mapper、combiner、reducer 的设计思想。运行命令如下,其中 input file 是 HDFS 上对应的输入文件:
$ hadoop jar ./Hw2Part1.jar <input file> <output directory>
运行代码部分,包含了标准的 javadoc 注释:
import java.io.IOException;
import java.text.DecimalFormat;
import java.text.DecimalFormatSymbols;
import java.util.Locale;
import java.util.StringTokenizer;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.DoubleWritable;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
/**
 * The {@code Hw2Part1} class is a Hadoop MapReduce job that aggregates flight records.
 *
 * <p>Each well-formed input line holds exactly three whitespace-separated tokens,
 * {@code <src> <dest> <time>}, where {@code time} is a finite number. Any other
 * line is treated as noise and dropped. For every ordered {@code (src, dest)}
 * pair the job emits the number of records and their average flight time
 * formatted with three decimals ({@code src -> dest} and {@code dest -> src}
 * are different keys).
 *
 * <p>Data flow:
 * <ul>
 *   <li>mapper: {@code "src dest" -> "1 time"}</li>
 *   <li>combiner: {@code "src dest" -> "cnt timeSum"} (partial totals)</li>
 *   <li>reducer: {@code "src dest" -> "cnt avgTime"}</li>
 * </ul>
 *
 * <p>The job does not configure an output format, so the separator written
 * between key and value is the output format's default (a tab for Hadoop's
 * {@code TextOutputFormat}).
 *
 * @author T2777
 * @version 1.0
 */
public class Hw2Part1 {

    /**
     * Running totals for one group of intermediate values.
     *
     * <p>Every intermediate value has the textual form {@code "cnt timeSum"}; the
     * mapper emits {@code "1 time"}, which is the same shape, so the combiner and
     * the reducer can share this accumulator.
     */
    private static final class Totals {
        /** Number of flight records; {@code long} so very large groups cannot overflow. */
        private long count;

        /** Sum of the flight times of those records. */
        private double timeSum;

        /**
         * Adds up all {@code "cnt timeSum"} values of one key.
         *
         * @param values the intermediate values of a single key.
         * @return the accumulated count and time sum.
         */
        static Totals of(Iterable<Text> values) {
            Totals totals = new Totals();
            for (Text value : values) {
                StringTokenizer itr = new StringTokenizer(value.toString());
                totals.count += Long.parseLong(itr.nextToken());
                totals.timeSum += Double.parseDouble(itr.nextToken());
            }
            return totals;
        }
    }

    /**
     * The {@code ATimeCountMapper} class realizes the map function.
     *
     * <p>It validates each input line and, for valid lines, emits the key
     * {@code "src dest"} with the value {@code "1 time"}.
     */
    public static class ATimeCountMapper extends Mapper<Object, Text, Text, Text> {
        /**
         * Reusable output key, {@code "src dest"}.
         */
        private final Text record = new Text();

        /**
         * Reusable output value, {@code "1 time"}.
         */
        private final Text cntAndTime = new Text();

        /**
         * Parses one input line and emits a key value pair if the line is valid.
         *
         * <p>A line is valid when it consists of exactly three whitespace-separated
         * tokens and the third token parses to a finite {@code double}. Invalid
         * lines are silently skipped.
         *
         * @param key the input key supplied by the input format; it is not used.
         *            (With Hadoop's default {@code TextInputFormat} this is the byte
         *            offset of the line in the file, not a line number.)
         * @param value the content of one input line.
         * @param context the context used to emit the output pair.
         * @throws IOException if writing the output fails.
         * @throws InterruptedException if the task is interrupted.
         */
        @Override
        protected void map(Object key, Text value, Context context) throws IOException, InterruptedException {
            StringTokenizer itr = new StringTokenizer(value.toString());
            if (itr.countTokens() != 3) {
                return;
            }
            String src = itr.nextToken();
            String dst = itr.nextToken();
            double time;
            try {
                time = Double.parseDouble(itr.nextToken());
            } catch (NumberFormatException e) {
                // Third token is not a number: the line is noise, drop it.
                return;
            }
            // "NaN" and "Infinity" parse successfully but are not real flight times;
            // letting one through would corrupt the average of its whole key.
            if (Double.isNaN(time) || Double.isInfinite(time)) {
                return;
            }
            record.set(src + " " + dst);
            cntAndTime.set("1 " + time);
            context.write(record, cntAndTime);
        }
    }

    /**
     * The {@code ATimeCountCombiner} class realizes the combiner function.
     *
     * <p>It merges the values of one key into a single partial total. Its output
     * has the same {@code "cnt timeSum"} form as its input, so its output can be
     * fed to another combiner pass or to the reducer unchanged.
     */
    public static class ATimeCountCombiner extends Reducer<Text, Text, Text, Text> {
        /**
         * Reusable output value, {@code "cnt timeSum"}.
         */
        private final Text result = new Text();

        /**
         * Sums the record counts and flight times of one key and emits the partial total.
         *
         * @param key the {@code "src dest"} key.
         * @param values the {@code "cnt timeSum"} values of that key.
         * @param context the context used to emit the output pair.
         * @throws IOException if writing the output fails.
         * @throws InterruptedException if the task is interrupted.
         */
        @Override
        protected void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
            Totals totals = Totals.of(values);
            result.set(totals.count + " " + totals.timeSum);
            context.write(key, result);
        }
    }

    /**
     * The {@code ATimeCountReducer} class realizes the reducer function.
     *
     * <p>It merges all partial totals of one key and emits the record count
     * together with the average flight time.
     */
    public static class ATimeCountReducer extends Reducer<Text, Text, Text, Text> {
        /**
         * Reusable output value, {@code "cnt avgTime"}.
         */
        private final Text result = new Text();

        /**
         * Formats the average with exactly three decimals.
         *
         * <p>The pattern {@code 0.000} keeps the leading zero for values below one
         * ({@code 0.500} rather than {@code .500}), and the {@link Locale#ROOT}
         * symbols keep the decimal point independent of the JVM's default locale.
         * {@link DecimalFormat} is not thread-safe, hence an instance field rather
         * than a shared static.
         */
        private final DecimalFormat averageFormat =
                new DecimalFormat("0.000", DecimalFormatSymbols.getInstance(Locale.ROOT));

        /**
         * Sums the partial totals of one key and emits the count and the average time.
         *
         * @param key the {@code "src dest"} key.
         * @param values the {@code "cnt timeSum"} values of that key.
         * @param context the context used to emit the output pair.
         * @throws IOException if writing the output fails.
         * @throws InterruptedException if the task is interrupted.
         */
        @Override
        protected void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
            Totals totals = Totals.of(values);
            double average = totals.timeSum / totals.count;
            result.set(totals.count + " " + averageFormat.format(average));
            context.write(key, result);
        }
    }

    /**
     * Configures and runs the job.
     *
     * <p>After generic Hadoop options are stripped, the remaining arguments are one
     * or more input paths followed by the output path. The process exits with
     * status 2 on a usage error, 0 if the job succeeds and 1 if it fails.
     *
     * @param args command line arguments: {@code <in> [<in>...] <out>}.
     * @throws Exception if the job cannot be set up or run.
     */
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
        if (otherArgs.length < 2) {
            System.err.println("Usage: hw2part1 <in> [<in>...] <out>");
            System.exit(2);
        }

        Job job = Job.getInstance(conf, "average time count");
        job.setJarByClass(Hw2Part1.class);
        job.setMapperClass(Hw2Part1.ATimeCountMapper.class);
        job.setCombinerClass(Hw2Part1.ATimeCountCombiner.class);
        job.setReducerClass(Hw2Part1.ATimeCountReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);

        // Every argument except the last one is an input path; the last is the output path.
        int outputIndex = otherArgs.length - 1;
        for (int i = 0; i < outputIndex; ++i) {
            FileInputFormat.addInputPath(job, new Path(otherArgs[i]));
        }
        FileOutputFormat.setOutputPath(job, new Path(otherArgs[outputIndex]));

        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}