MapReduce设计模式:求最小值、最大值和计数的MapReduce设计模型如下:
MinMaxCount 类如下:
package com;
import java.io.IOException;import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.Date;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.Tool;
/**
 * MapReduce job implementing the min/max/count numerical summarization pattern.
 *
 * <p>Input: tab-separated lines of {@code userId<TAB>yyyy-MM-dd HH:mm:ss}.
 * Output: one line per user with the earliest timestamp, latest timestamp,
 * and total record count. The reducer doubles as a combiner, which is safe
 * because min, max and sum are all associative and commutative.
 */
public class MinMaxCount extends Configured implements Tool {

    public static class MinMaxCountMapper extends Mapper<Object, Text, Text, MinMaxCountTuple> {
        // Reused across records; Hadoop serializes on write(), so mutation is safe.
        private final MinMaxCountTuple outTuple = new MinMaxCountTuple();
        // Hoisted out of map(): one formatter per mapper instance instead of one
        // per record. SimpleDateFormat is not thread-safe, but each mapper task
        // runs single-threaded, so an instance field is fine.
        private final SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");

        /**
         * Emits (userId, tuple) where the tuple's min and max are both the
         * record's timestamp and count is 1. Malformed lines are skipped.
         */
        @Override
        public void map(Object key, Text value, Context context)
                throws IOException, InterruptedException {
            String[] fields = value.toString().split("\t");
            if (fields.length < 2) {
                return; // malformed line: no timestamp column
            }
            try {
                Date date = sdf.parse(fields[1]);
                outTuple.setMax(date);
                outTuple.setMin(date);
                outTuple.setCount(1);
                context.write(new Text(fields[0]), outTuple);
            } catch (ParseException e) {
                // Skip records with unparseable timestamps rather than failing the task.
                e.printStackTrace();
            }
        }
    }

    public static class MinMaxCountReducer extends Reducer<Text, MinMaxCountTuple, Text, MinMaxCountTuple> {
        private final MinMaxCountTuple result = new MinMaxCountTuple();

        /**
         * Folds all tuples for one key into a single tuple holding the overall
         * min, overall max, and the summed count.
         */
        @Override
        public void reduce(Text key, Iterable<MinMaxCountTuple> values, Context context)
                throws IOException, InterruptedException {
            // Reset reused fields; null sentinels mark "no value seen yet".
            result.setMax(null);
            result.setMin(null);
            result.setCount(0);
            long sum = 0;
            for (MinMaxCountTuple val : values) {
                if (result.getMin() == null || val.getMin().compareTo(result.getMin()) < 0) {
                    result.setMin(val.getMin());
                }
                if (result.getMax() == null || val.getMax().compareTo(result.getMax()) > 0) {
                    result.setMax(val.getMax());
                }
                sum += val.getCount();
            }
            result.setCount(sum);
            context.write(key, result);
        }
    }

    /**
     * Configures and submits the job.
     *
     * @return 0 on success, 1 on failure (propagated to ToolRunner callers)
     */
    @Override
    public int run(String[] args) throws Exception {
        // Use the Configuration injected by ToolRunner (getConf()) instead of a
        // fresh one, so command-line -D options and -conf files take effect.
        Configuration conf = getConf();
        Job job = Job.getInstance(conf, "MinMaxCount");
        job.setJarByClass(MinMaxCount.class);
        job.setMapperClass(MinMaxCountMapper.class);
        job.setReducerClass(MinMaxCountReducer.class);
        // Safe as a combiner: min/max/sum are associative and commutative.
        job.setCombinerClass(MinMaxCountReducer.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(MinMaxCountTuple.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(MinMaxCountTuple.class);
        job.setInputFormatClass(TextInputFormat.class);
        job.setOutputFormatClass(TextOutputFormat.class);
        FileInputFormat.addInputPath(job, new Path("hdfs://192.168.26.131:9000/design/minmaxcount/in/minmaxcount"));
        Path outPath = new Path("hdfs://192.168.26.131:9000/design/minmaxcount/out");
        // Remove a stale output directory so resubmission does not fail.
        FileSystem.get(conf).delete(outPath, true);
        FileOutputFormat.setOutputPath(job, outPath);
        // Report job success/failure instead of unconditionally returning 0.
        return job.waitForCompletion(true) ? 0 : 1;
    }
}
MinMaxCountTuple 类如下:
package com;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.text.SimpleDateFormat;
import java.util.Date;
import org.apache.hadoop.io.Writable;
/**
 * Custom Hadoop {@link Writable} value holding a (min date, max date, count)
 * triple for the min/max/count summarization pattern.
 *
 * <p>Serialized as three longs: min epoch-millis, max epoch-millis, count.
 * Not thread-safe; Hadoop uses one instance per task thread.
 */
public class MinMaxCountTuple implements Writable {
    private Date min = new Date();
    private Date max = new Date();
    private long count = 0;

    public Date getMin() {
        return min;
    }

    public void setMin(Date min) {
        this.min = min;
    }

    public Date getMax() {
        return max;
    }

    public void setMax(Date max) {
        this.max = max;
    }

    public long getCount() {
        return count;
    }

    public void setCount(long count) {
        this.count = count;
    }

    /** Deserializes the tuple; field order must mirror {@link #write}. */
    @Override
    public void readFields(DataInput in) throws IOException {
        min = new Date(in.readLong());
        max = new Date(in.readLong());
        count = in.readLong();
    }

    /** Serializes the tuple; field order must mirror {@link #readFields}. */
    @Override
    public void write(DataOutput out) throws IOException {
        out.writeLong(min.getTime());
        out.writeLong(max.getTime());
        out.writeLong(count);
    }

    /**
     * Formats the tuple as {@code min<TAB>max<TAB>count}, matching the job's
     * text output. A fresh SimpleDateFormat is created per call because the
     * class is not thread-safe and cannot be shared as a static.
     */
    @Override
    public String toString() {
        SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
        return sdf.format(min) + "\t" + sdf.format(max) + "\t" + count;
    }
}
测试函数类如下:
package com;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.util.ToolRunner;
/**
 * Driver entry point: runs the {@link MinMaxCount} job through
 * {@link ToolRunner} so generic Hadoop options (-D, -conf, -fs, ...) are
 * parsed and injected into the tool's Configuration.
 */
public class Test_MinMaxCount {
    public static void main(String[] args) throws Exception {
        // Propagate the tool's return code as the process exit status so
        // shell scripts and schedulers can detect job failure.
        int exitCode = ToolRunner.run(new Configuration(), new MinMaxCount(), args);
        System.exit(exitCode);
    }
}
输入的测试数据如下:
1001 2012-10-12 10:11:12
1002 2013-2-9 3:9:8
1002 2013-9-10 10:3:21
1001 2013-2-9 13:19:18
1003 2011-1-19 12:21:12
1003 2015-9-10 11:3:2
1003 2012-2-19 12:21:12
1003 2015-12-11 17:3:12
1002 2011-12-9 13:9:13
1002 2001-9-10 10:3:2
1001 2013-12-19 3:1:1
程序输出结果为:
1001 2012-10-12 10:11:12 2013-12-19 03:01:01 3
1002 2001-09-10 10:03:02 2013-09-10 10:03:21 4
1003 2011-01-19 12:21:12 2015-12-11 17:03:12 4