在网站的数据统计中,有这样一种情况,即统计某个用户发表的评论数、第一次发表评论的时间和最后一次发表评论的时间。下面代码就是解决comments.xml的这个问题。代码如下:
package mrdp.ch2;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.Map;
import mrdp.utils.MRDPUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
public class MinMaxCountDriver {
public static class SOMinMaxCountMapper extends
        Mapper<Object, Text, Text, MinMaxCountTuple> {

    // Reused output key/value objects (Hadoop idiom: avoid one allocation per record).
    private Text outUserId = new Text();
    private MinMaxCountTuple outTuple = new MinMaxCountTuple();

    // Parses CreationDate attributes such as "2012-02-08T21:51:05.223".
    // NOTE: SimpleDateFormat is not thread-safe; safe here only because each
    // map task runs single-threaded.
    private final static SimpleDateFormat frmt = new SimpleDateFormat(
            "yyyy-MM-dd'T'HH:mm:ss.SSS");

    /**
     * Emits (userId, (creationDate, creationDate, 1)) for each comment row.
     * Rows lacking a UserId or CreationDate attribute, or whose date cannot
     * be parsed, are skipped — and surfaced via job counters instead of
     * being dropped silently.
     */
    @Override
    public void map(Object key, Text value, Context context)
            throws IOException, InterruptedException {
        // Parse the XML row's attributes into a name -> value map.
        Map<String, String> parsed = MRDPUtils.transformXmlToMap(value.toString());

        // "CreationDate" is the field we take the min and max of;
        // "UserId" is the field we group by.
        String strDate = parsed.get("CreationDate");
        String userId = parsed.get("UserId");

        // .get returns null when the attribute is absent — skip such rows,
        // but keep the skip visible in the job counters.
        if (strDate == null || userId == null) {
            context.getCounter("SOMinMaxCount", "MissingField").increment(1);
            return;
        }

        try {
            Date creationDate = frmt.parse(strDate);
            // A single record's min and max are both its own creation date;
            // the reducer/combiner folds these into per-user extremes.
            outTuple.setMin(creationDate);
            outTuple.setMax(creationDate);
            outTuple.setCount(1);
            outUserId.set(userId);
            context.write(outUserId, outTuple);
        } catch (ParseException e) {
            // Count malformed dates rather than swallowing the error with an
            // empty catch block — otherwise bad input vanishes untraceably.
            context.getCounter("SOMinMaxCount", "MalformedDate").increment(1);
        }
    }
}
public static class SOMinMaxCountReducer extends
        Reducer<Text, MinMaxCountTuple, Text, MinMaxCountTuple> {

    // Reused output tuple (Hadoop idiom: one instance per task).
    private MinMaxCountTuple result = new MinMaxCountTuple();

    /**
     * Folds every per-record tuple for one user into a single tuple holding
     * the earliest date, the latest date, and the total comment count.
     * Also used as the combiner, which is valid because min/max/sum are all
     * associative and commutative.
     */
    @Override
    public void reduce(Text key, Iterable<MinMaxCountTuple> values,
            Context context) throws IOException, InterruptedException {
        // Reset reused state; null dates mean "nothing seen yet".
        result.setMin(null);
        result.setMax(null);
        int totalCount = 0;

        for (MinMaxCountTuple tuple : values) {
            // Keep the earliest creation date seen so far.
            Date candidateMin = tuple.getMin();
            if (result.getMin() == null
                    || candidateMin.compareTo(result.getMin()) < 0) {
                result.setMin(candidateMin);
            }

            // Keep the latest creation date seen so far.
            Date candidateMax = tuple.getMax();
            if (result.getMax() == null
                    || candidateMax.compareTo(result.getMax()) > 0) {
                result.setMax(candidateMax);
            }

            // Accumulate this user's comment count.
            totalCount += tuple.getCount();
        }

        result.setCount(totalCount);
        context.write(key, result);
    }
}
// Driver: wires up mapper/combiner/reducer, configures I/O paths, and
// submits the job. Usage: MinMaxCountDriver <input path> <output path>
public static void main(String[] args) throws Exception {
Configuration conf = new Configuration();
// Let Hadoop consume its generic options (-D, -fs, -files, ...);
// whatever remains should be exactly the input and output paths.
String[] otherArgs = new GenericOptionsParser(conf, args)
.getRemainingArgs();
if (otherArgs.length != 2) {
System.err.println("Usage: MinMaxCountDriver <in> <out>");
System.exit(2);
}
Job job = new Job(conf, "StackOverflow Comment Date Min Max Count");
job.setJarByClass(MinMaxCountDriver.class);
job.setMapperClass(SOMinMaxCountMapper.class);
// The reduce logic (min/max/sum) is associative and commutative, so the
// reducer safely doubles as a combiner to shrink map-side output.
job.setCombinerClass(SOMinMaxCountReducer.class);
job.setReducerClass(SOMinMaxCountReducer.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(MinMaxCountTuple.class);
FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
// Block until the job finishes; exit 0 on success, 1 on failure.
System.exit(job.waitForCompletion(true) ? 0 : 1);
}
/**
 * Custom Writable value type carrying the earliest comment date, the latest
 * comment date, and the comment count for a single user.
 */
public static class MinMaxCountTuple implements Writable {
    private Date min = new Date();
    private Date max = new Date();
    private long count = 0;

    // Output/rendering format, e.g. "2012-02-08T21:51:05.223".
    // NOTE: SimpleDateFormat is not thread-safe; acceptable here because
    // each task JVM formats from a single thread.
    private final static SimpleDateFormat frmt = new SimpleDateFormat(
            "yyyy-MM-dd'T'HH:mm:ss.SSS");

    public Date getMin() { return min; }

    public void setMin(Date min) { this.min = min; }

    public Date getMax() { return max; }

    public void setMax(Date max) { this.max = max; }

    public long getCount() { return count; }

    public void setCount(long count) { this.count = count; }

    /** Deserializes the three fields in the same order write() emits them. */
    @Override
    public void readFields(DataInput in) throws IOException {
        // Fresh Date instances each call, so a reducer may safely hold a
        // reference to them across iterations of a reused value object.
        min = new Date(in.readLong());
        max = new Date(in.readLong());
        count = in.readLong();
    }

    /** Serializes min, max (as epoch millis), then the count. */
    @Override
    public void write(DataOutput out) throws IOException {
        out.writeLong(min.getTime());
        out.writeLong(max.getTime());
        out.writeLong(count);
    }

    /** Tab-separated "min<TAB>max<TAB>count", matching the job's text output. */
    @Override
    public String toString() {
        return new StringBuilder(frmt.format(min)).append('\t')
                .append(frmt.format(max)).append('\t')
                .append(count).toString();
    }
}
}
这里的mrdp.utils.MRDPUtils包的代码在第一篇中已经给出。
这里最重要的是自己实现了Writable接口,自定义了value类型。有时间我另开一篇博客介绍下Writable接口。
map阶段不做任何比较和计算,只是简单地对comments.xml进行解析,把每条评论的时间解析出来,并把count赋值为1。例如解析下面这一行数据:
<row Id="1784" PostId="883" Text="Perfect distinction. I've made a note and agree entirely." CreationDate="2012-02-08T21:51:05.223" UserId="46" />
mapper会把UserId作为key,另外一个outTuple作为value,格式为(min,max,count),即(2012-02-08T21:51:05.223,2012-02-08T21:51:05.223,1)
combiner阶段直接调用reduce函数,做中间处理。
reducer阶段计算我们需要的数据,即求最大值、最小值和总数。reducer的实现较简单,就是把每个UserId对应的value循环取出,然后一一做比较,并累加count。
整个流程如下图:
得到的部分结果如下:
jpan@jpan-Beijing:~/Mywork/mapreducepatterns/testdata$ hadoop fs -cat output2/part-r-00000
10 2011-02-14T18:04:38.763 2012-07-10T22:57:00.757 8
101 2011-04-01T03:02:45.083 2011-04-01T06:02:33.307 2
10119 2012-02-08T13:54:38.623 2012-04-12T23:43:14.810 8
1057 2011-06-17T19:59:33.013 2011-06-17T19:59:33.013 1
10691 2012-04-19T01:15:44.573 2012-05-11T05:47:36.517 2
10872 2012-06-14T15:36:26.527 2012-06-14T15:45:43.347 4
10921 2011-12-07T18:08:04.583 2011-12-07T18:08:04.583 1
11 2011-05-06T02:51:50.370 2011-05-06T14:46:31.483 3
110 2010-08-12T14:52:09.830 2010-08-12T14:52:09.830 1
1118 2011-02-17T10:27:48.623 2011-02-25T09:25:09.597 2
11498 2011-12-30T11:09:58.057 2011-12-30T11:09:58.057 1
11682 2012-01-04T21:48:39.267 2012-01-04T21:48:39.267 1