Java lab booklet
Adding a Combiner
A combiner pre-aggregates map output locally on each node, greatly decreasing the number of key/value pairs transferred across the network between the mappers and the reducers.
Add the following line to WordCount's main() method:
job.setCombinerClass(IntSumReducer.class);
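For context, here is a minimal sketch of the WordCount driver with the combiner registered, assuming the TokenizerMapper and IntSumReducer classes from the standard WordCount example:
public static void main(String[] args) throws Exception {
Configuration conf = new Configuration();
Job job = Job.getInstance(conf, "wordcount");
job.setJarByClass(WordCount.class);
job.setMapperClass(TokenizerMapper.class);
//The reducer doubles as the combiner because summing counts is
//associative and commutative, so partial sums are safe to merge.
job.setCombinerClass(IntSumReducer.class);
job.setReducerClass(IntSumReducer.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(IntWritable.class);
FileInputFormat.addInputPath(job, new Path(args[0]));
FileOutputFormat.setOutputPath(job, new Path(args[1]));
System.exit(job.waitForCompletion(true) ? 0 : 1);
}
Reusing IntSumReducer as the combiner is only valid because its operation is associative and commutative; a reducer that averaged its input values directly could not be reused this way, which motivates the design of the next job.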
Computing the Average of a Collection of Numbers
The MapReduce job will compute and output the average median household income in the year 2000 for each of the 50 states and the District of Columbia. Note that an average of averages is not the overall average, so each mapper output carries both an income value and a count of 1; the combiner can then safely pre-aggregate these into partial "sum,count" pairs, and the reducer divides the final sum by the final count.
Abbeville, SC,45001,6581,7471,6787,195278,302280,29673,40460,3042,3294
Acadia, LA,22001,13658,15450,16308,338561,618949,24788,40061,5686,5975
Accomack, VA,51001,9401,11507,10857,238824,444818,25404,38656,4720,5319
Notice that the first value in each row is a county name, followed by the state abbreviation. The third value is a unique ID for the county. The remaining values are median incomes from various years; the 10th value in each row is the median household income for that county in the year 2000, and it is the column whose average you are going to compute for each state. For the Abbeville row above, for example, the mapper emits the key "SC" with the value "40460,1".
package average;
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.DoubleWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Partitioner;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.StringUtils;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
public class AverageJob extends Configured implements Tool {
public static class AveragePartitioner extends Partitioner<Text, Text> {
@Override
public int getPartition(Text key, Text value, int numPartitions) {
if (numPartitions == 1) {
return 0;
}
return (key.hashCode()&Integer.MAX_VALUE) % numPartitions;
}
}
public enum Counters{MAP, COMBINE, REDUCE}
public static class AverageMapper extends Mapper<LongWritable, Text, Text, Text> {
private Text outputKey = new Text();
private Text outputValue = new Text();
@Override
protected void map(LongWritable key, Text value, Context context)
throws IOException, InterruptedException {
String[] words = StringUtils.split(value.toString(), '\\', ',');
for (int i = 0; i < words.length; i++) {
//State column.
if (i == 1) {
outputKey.set(words[i].trim());
}
//Median household income column (10th field).
if (i == 9) {
outputValue.set(words[i].trim() + ",1");
}
}
context.getCounter(Counters.MAP).increment(1);
context.write(outputKey, outputValue);
}
@Override
protected void cleanup(Context context)
throws IOException, InterruptedException {
}
}
public static class AverageCombiner extends Reducer<Text, Text, Text, Text> {
private Text outputValue = new Text();
private long sum = 0;
private int count = 0;
@Override
protected void reduce(Text key, Iterable<Text> values, Context context)
throws IOException, InterruptedException {
String[] tmp = new String[2];
for (Text value : values) {
tmp = StringUtils.split(value.toString(), ',');
sum += Long.parseLong(tmp[0]);
count += Integer.parseInt(tmp[1]);
}
outputValue.set(sum + "," + count);
context.getCounter(Counters.COMBINE).increment(1);
context.write(key, outputValue);
sum = 0;
count = 0;
}
@Override
protected void cleanup(Context context)
throws IOException, InterruptedException {
}
}
public static class AverageReducer extends Reducer<Text, Text, Text, DoubleWritable> {
private DoubleWritable outputValue = new DoubleWritable();
private double sum = 0;
private int count = 0;
@Override
protected void reduce(Text key, Iterable<Text> values, Context context)
throws IOException, InterruptedException {
String[] tmp = new String[2];
for (Text value : values) {
tmp = StringUtils.split(value.toString(), ',');
sum += Long.parseLong(tmp[0]);
count += Integer.parseInt(tmp[1]);
}
outputValue.set(sum/count);
context.getCounter(Counters.REDUCE).increment(1);
context.write(key, outputValue);
sum = 0;
count = 0;
}
@Override
protected void cleanup(Context context)
throws IOException, InterruptedException {
}
}
@Override
public int run(String[] arg0) throws Exception {
Configuration conf = super.getConf();
Job job = Job.getInstance(conf, "AverageJob");
job.setJarByClass(AverageJob.class);
Path out = new Path("counties/output");
FileInputFormat.setInputPaths(job, "counties");
FileOutputFormat.setOutputPath(job, out);
out.getFileSystem(conf).delete(out, true);
job.setMapperClass(AverageMapper.class);
job.setReducerClass(AverageReducer.class);
job.setCombinerClass(AverageCombiner.class);
job.setPartitionerClass(AveragePartitioner.class);
job.setInputFormatClass(TextInputFormat.class);
job.setOutputFormatClass(TextOutputFormat.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(Text.class);
job.setNumReduceTasks(5);
return job.waitForCompletion(true)?0:1;
}
public static void main(String[] args) {
int result = 0;
try {
result = ToolRunner.run(new Configuration(), new AverageJob(), args);
} catch (Exception e) {
e.printStackTrace();
}
System.exit(result);
}
}
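The job increments a custom counter on every map(), combine(), and reduce() call. As a quick sanity check, the counter values can be read back from the Job object once the job finishes; a sketch of what run() could return instead of the one-liner above:
boolean success = job.waitForCompletion(true);
long mapped = job.getCounters().findCounter(Counters.MAP).getValue();
long combined = job.getCounters().findCounter(Counters.COMBINE).getValue();
long reduced = job.getCounters().findCounter(Counters.REDUCE).getValue();
System.out.println("map=" + mapped + ", combine=" + combined + ", reduce=" + reduced);
return success ? 0 : 1;
If the combiner is doing its job, COMBINE should be much smaller than MAP, and REDUCE should equal the number of distinct keys (51 states and districts).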
Writing a Custom Partitioner
The Average MapReduce job will execute with five reducers, and the custom partitioner below spreads the keys across them so that each reducer receives a roughly equal share of the key/value pairs.
public static class AveragePartitioner extends Partitioner<Text, Text> {
@Override
public int getPartition(Text key, Text value, int numPartitions) {
if (numPartitions == 1) {
return 0;
}
return (key.hashCode()&Integer.MAX_VALUE) % numPartitions;
}
}
job.setNumReduceTasks(5);
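Because getPartition() is a pure function, it can be spot-checked without a cluster. A minimal sketch (the state keys are just examples, and the snippet assumes the Text import shown earlier):
AveragePartitioner partitioner = new AveragePartitioner();
for (String state : new String[] {"SC", "LA", "VA"}) {
//With 5 reducers, each state abbreviation maps to a stable partition.
int partition = partitioner.getPartition(new Text(state), new Text("0,1"), 5);
System.out.println(state + " -> partition " + partition);
}
Masking the hash code with Integer.MAX_VALUE clears the sign bit, guaranteeing a non-negative partition number even when hashCode() returns a negative value.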
Writing a Custom Output Format
This MapReduce job outputs the growth (or loss) of stock dividends over time. The input records look like the following:
exchange,stock_symbol,date,dividends
NYSE,AIT,2009-11-12,0.15
NYSE,AIT,2009-08-12,0.15
NYSE,AIT,2009-05-13,0.15
NYSE,AIT,2009-02-11,0.15
As you will see below, the secondary sort delivers each symbol's dividends to the reducer in ascending date order, so for AIT the job emits a single positive change of 0.15 on 2009-02-11 (relative to an initial value of 0.0) and filters out the three subsequent zero changes.
package customsort;
import java.io.IOException;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
public class DividendOutputFormat extends FileOutputFormat<NullWritable, DividendChange> {
@Override
public RecordWriter<NullWritable, DividendChange> getRecordWriter(
TaskAttemptContext job) throws IOException, InterruptedException {
//Name one output file per task: <jobName>_<partition> inside the output directory.
int partition = job.getTaskAttemptID().getTaskID().getId();
Path outputDir = FileOutputFormat.getOutputPath(job);
Path filename = new Path(outputDir, job.getJobName() + "_" + partition);
FileSystem fs = filename.getFileSystem(job.getConfiguration());
FSDataOutputStream dos = fs.create(filename);
return new DividendRecordWriter(dos);
}
}
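To use this custom format on its own, register it in the driver (the DividendJob run() method below instead comments this line out in favor of MultipleOutputs):
job.setOutputFormatClass(DividendOutputFormat.class);
job.setOutputKeyClass(NullWritable.class);
job.setOutputValueClass(DividendChange.class);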
package customsort;
import java.io.DataOutputStream;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
public class DividendRecordWriter extends RecordWriter<NullWritable, DividendChange> {
public final String SEPARATOR = ",";
private DataOutputStream out;
public DividendRecordWriter(DataOutputStream out) {
this.out = out;
}
@Override
public void write(NullWritable key, DividendChange value)
throws IOException, InterruptedException {
StringBuilder result = new StringBuilder();
result.append(value.getSymbol());
result.append(SEPARATOR);
result.append(value.getDate());
result.append(SEPARATOR);
result.append(value.getChange());
result.append("\n");
//Encode explicitly rather than relying on the platform default charset.
out.write(result.toString().getBytes(StandardCharsets.UTF_8));
}
@Override
public void close(TaskAttemptContext context) throws IOException,
InterruptedException {
out.close();
}
}
package customsort;
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.DoubleWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Partitioner;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.LazyOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.MultipleOutputs;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.StringUtils;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
public class DividendJob extends Configured implements Tool {
public static class DividendGrowthMapper extends Mapper<LongWritable, Text, Stock, DoubleWritable> {
private Stock outputKey = new Stock();
private DoubleWritable outputValue = new DoubleWritable();
private final String EXCHANGE = "exchange";
@Override
protected void map(LongWritable key, Text value, Context context)
throws IOException, InterruptedException {
String [] words = StringUtils.split(value.toString(),'\\',',');
if(EXCHANGE.equals(words[0])) {
return;
}
outputKey.setSymbol(words[1]);
outputKey.setDate(words[2]);
outputValue.set(Double.parseDouble(words[3]));
context.write(outputKey, outputValue);
}
}
public static class StockPartitioner extends Partitioner<Stock, DoubleWritable> {
@Override
public int getPartition(Stock key, DoubleWritable value, int numReduceTasks) {
//Stock symbols begin with an uppercase letter, so the result is non-negative.
char firstLetter = key.getSymbol().trim().charAt(0);
return (firstLetter - 'A') % numReduceTasks;
}
}
public static class DividendGrowthReducer extends Reducer<Stock, DoubleWritable, NullWritable, DividendChange> {
private NullWritable outputKey = NullWritable.get();
private DividendChange outputValue = new DividendChange();
private MultipleOutputs < NullWritable , DividendChange > mos;
@Override
protected void reduce(Stock key, Iterable<DoubleWritable> values, Context context)
throws IOException, InterruptedException {
double previousDividend = 0.0;
for(DoubleWritable dividend : values) {
double currentDividend = dividend.get();
double growth = currentDividend - previousDividend;
if(Math.abs(growth) > 0.000001) {
outputValue.setSymbol(key.getSymbol());
outputValue.setDate(key.getDate());
outputValue.setChange(growth);
if (growth > 0) {
mos.write("positive", outputKey, outputValue, "pos");
}else {
mos.write("negative", outputKey, outputValue, "nes");
}
//context.write(outputKey, outputValue);
previousDividend = currentDividend;
}
}
}
@Override
protected void setup(
Reducer<Stock, DoubleWritable, NullWritable, DividendChange>.Context context)
throws IOException, InterruptedException {
mos = new MultipleOutputs<NullWritable, DividendChange>(context);
super.setup(context);
}
@Override
protected void cleanup(
Reducer<Stock, DoubleWritable, NullWritable, DividendChange>.Context context)
throws IOException, InterruptedException {
mos.close();
super.cleanup(context);
}
}
@Override
public int run(String[] args) throws Exception {
Configuration conf = super.getConf();
Job job = Job.getInstance(conf, "DividendJob");
job.setJarByClass(DividendJob.class);
Path out = new Path("growth");
FileInputFormat.setInputPaths(job, new Path("dividends"));
FileOutputFormat.setOutputPath(job, out);
out.getFileSystem(conf).delete(out, true);
job.setMapperClass(DividendGrowthMapper.class);
job.setReducerClass(DividendGrowthReducer.class);
job.setPartitionerClass(StockPartitioner.class);
job.setGroupingComparatorClass(StockGroupComparator.class);
job.setInputFormatClass(TextInputFormat.class);
//job.setOutputFormatClass(DividendOutputFormat.class);
MultipleOutputs.addNamedOutput(job, "positive", TextOutputFormat.class, NullWritable.class, DividendChange.class);
MultipleOutputs.addNamedOutput(job, "negative", TextOutputFormat.class, NullWritable.class, DividendChange.class);
//LazyOutputFormat suppresses the default empty part files; the named
//outputs above produce files such as pos-r-00000 and neg-r-00000 instead.
LazyOutputFormat.setOutputFormatClass(job, TextOutputFormat.class);
job.setOutputKeyClass(NullWritable.class);
job.setOutputValueClass(DividendChange.class);
job.setMapOutputKeyClass(Stock.class);
job.setMapOutputValueClass(DoubleWritable.class);
job.setNumReduceTasks(3);
return job.waitForCompletion(true)?0:1;
}
public static void main(String[] args) {
int result = 0;
try {
result = ToolRunner.run(new Configuration(), new DividendJob(), args);
} catch (Exception e) {
e.printStackTrace();
}
System.exit(result);
}
}
package customsort;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import org.apache.hadoop.io.Writable;
public class DividendChange implements Writable {
private String symbol;
private String date;
private double change;
public String getSymbol() {
return symbol;
}
public void setSymbol(String symbol) {
this.symbol = symbol;
}
public String getDate() {
return date;
}
public void setDate(String date) {
this.date = date;
}
public double getChange() {
return change;
}
public void setChange(double change) {
this.change = change;
}
@Override
public String toString() {
return symbol + "\t" + date + "\t" + change;
}
@Override
public void write(DataOutput out) throws IOException {
out.writeUTF(symbol);
out.writeUTF(date);
out.writeDouble(change);
}
@Override
public void readFields(DataInput in) throws IOException {
symbol = in.readUTF();
date = in.readUTF();
change = in.readDouble();
}
}
package customsort;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import org.apache.hadoop.io.WritableComparable;
public class Stock implements WritableComparable<Stock> {
private String symbol;
private String date;
public String getSymbol() {
return symbol;
}
public void setSymbol(String symbol) {
this.symbol = symbol;
}
public String getDate() {
return date;
}
public void setDate(String date) {
this.date = date;
}
@Override
public void write(DataOutput out) throws IOException {
out.writeUTF(symbol);
out.writeUTF(date);
}
@Override
public void readFields(DataInput in) throws IOException {
symbol = in.readUTF();
date = in.readUTF();
}
@Override
public int compareTo(Stock stock) {
int response = this.symbol.compareTo(stock.symbol);
if (response != 0) {
return response;
}else {
response = this.date.compareTo(stock.date);
return response;
}
}
}
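Stock is safe here because the job installs its own StockPartitioner, but if the class were ever used with the default HashPartitioner it would also need hashCode() and equals() implementations consistent with compareTo(). A minimal sketch of what could be added to Stock:
@Override
public int hashCode() {
//Combine both fields so that equal Stocks always hash equally.
return symbol.hashCode() * 163 + date.hashCode();
}
@Override
public boolean equals(Object o) {
if (!(o instanceof Stock)) {
return false;
}
Stock other = (Stock) o;
return symbol.equals(other.symbol) && date.equals(other.date);
}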
package customsort;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;
public class StockGroupComparator extends WritableComparator {
//Group reduce input by symbol only: one reduce() call then receives all
//dividends for a stock, while the full sort order (symbol, then date)
//still delivers the values in ascending date order.
protected StockGroupComparator(){
super(Stock.class, true);
}
@SuppressWarnings("rawtypes")
@Override
public int compare(WritableComparable a, WritableComparable b) {
Stock lhs = (Stock)a;
Stock rhs = (Stock)b;
return lhs.getSymbol().compareTo(rhs.getSymbol());
}
}