在网站的数据统计中,有这样一种情况,即统计某个用户发表的评论数、第一次发表评论的时间和最后一次发表评论的时间。下面代码就是解决comments.xml的这个问题。代码如下:
package mrdp.ch2;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.Map;
import mrdp.utils.MRDPUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
public class MinMaxCountDriver {
public static class SOMinMaxCountMapper extends
        Mapper<Object, Text, Text, MinMaxCountTuple> {

    // Reused output key/value objects (Hadoop idiom: avoid one allocation per record).
    private Text outUserId = new Text();
    private MinMaxCountTuple outTuple = new MinMaxCountTuple();

    // Parses CreationDate attributes such as "2012-02-08T21:51:05.223".
    // NOTE: SimpleDateFormat is not thread-safe; safe here only because each
    // map task runs single-threaded.
    private final static SimpleDateFormat frmt = new SimpleDateFormat(
            "yyyy-MM-dd'T'HH:mm:ss.SSS");

    /**
     * Emits (userId, (creationDate, creationDate, 1)) for each comment row.
     * Rows lacking a UserId or CreationDate attribute, or whose date cannot
     * be parsed, are skipped — and surfaced via job counters instead of
     * being dropped silently.
     */
    @Override
    public void map(Object key, Text value, Context context)
            throws IOException, InterruptedException {
        // Parse the XML row's attributes into a name -> value map.
        Map<String, String> parsed = MRDPUtils.transformXmlToMap(value.toString());

        // "CreationDate" is the field we take the min and max of;
        // "UserId" is the field we group by.
        String strDate = parsed.get("CreationDate");
        String userId = parsed.get("UserId");

        // .get returns null when the attribute is absent — skip such rows,
        // but keep the skip visible in the job counters.
        if (strDate == null || userId == null) {
            context.getCounter("SOMinMaxCount", "MissingField").increment(1);
            return;
        }

        try {
            Date creationDate = frmt.parse(strDate);
            // A single record's min and max are both its own creation date;
            // the reducer/combiner folds these into per-user extremes.
            outTuple.setMin(creationDate);
            outTuple.setMax(creationDate);
            outTuple.setCount(1);
            outUserId.set(userId);
            context.write(outUserId, outTuple);
        } catch (ParseException e) {
            // Count malformed dates rather than swallowing the error with an
            // empty catch block — otherwise bad input vanishes untraceably.
            context.getCounter("SOMinMaxCount", "MalformedDate").increment(1);
        }
    }
}
public static class SOMinMaxCountReducer extends
        Reducer<Text, MinMaxCountTuple, Text, MinMaxCountTuple> {

    // Reused output tuple (Hadoop idiom: one instance per task).
    private MinMaxCountTuple result = new MinMaxCountTuple();

    /**
     * Folds every per-record tuple for one user into a single tuple holding
     * the earliest date, the latest date, and the total comment count.
     * Also used as the combiner, which is valid because min/max/sum are all
     * associative and commutative.
     */
    @Override
    public void reduce(Text key, Iterable<MinMaxCountTuple> values,
            Context context) throws IOException, InterruptedException {
        // Reset reused state; null dates mean "nothing seen yet".
        result.setMin(null);
        result.setMax(null);
        int totalCount = 0;

        for (MinMaxCountTuple tuple : values) {
            // Keep the earliest creation date seen so far.
            Date candidateMin = tuple.getMin();
            if (result.getMin() == null
                    || candidateMin.compareTo(result.getMin()) < 0) {
                result.setMin(candidateMin);
            }

            // Keep the latest creation date seen so far.
            Date candidateMax = tuple.getMax();
            if (result.getMax() == null
                    || candidateMax.compareTo(result.getMax()) > 0) {
                result.setMax(candidateMax);
            }

            // Accumulate this user's comment count.
            totalCount += tuple.getCount();
        }

        result.setCount(totalCount);
        context.write(key, result);
    }
}
// Driver: wires up mapper/combiner/reducer, configures I/O paths, and
// submits the job. Usage: MinMaxCountDriver <input path> <output path>
public static void main(String[] args) throws Exception {
Configuration conf = new Configuration();
// Let Hadoop consume its generic options (-D, -fs, -files, ...);
// whatever remains should be exactly the input and output paths.
String[] otherArgs = new GenericOptionsParser(conf, args)
.getRemainingArgs();
if (otherArgs.length != 2) {
System.err.println("Usage: MinMaxCountDriver <in> <out>");
System.exit(2);
}
Job job = new Job(conf, "StackOverflow Comment Date Min Max Count");
job.setJarByClass(MinMaxCountDriver.class);
job.setMapperClass(SOMinMaxCountMapper.class);
// The reduce logic (min/max/sum) is associative and commutative, so the
// reducer safely doubles as a combiner to shrink map-side output.
job.setCombinerClass(SOMinMaxCountReducer.class);
job.setReducerClass(SOMinMaxCountReducer.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(MinMaxCountTuple.class);
FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
// Block until the job finishes; exit 0 on success, 1 on failure.
System.exit(job.waitForCompletion(true) ? 0 : 1);
}
/**
 * Custom Writable value type carrying the earliest comment date, the latest
 * comment date, and the comment count for a single user.
 */
public static class MinMaxCountTuple implements Writable {
    private Date min = new Date();
    private Date max = new Date();
    private long count = 0;

    // Output/rendering format, e.g. "2012-02-08T21:51:05.223".
    // NOTE: SimpleDateFormat is not thread-safe; acceptable here because
    // each task JVM formats from a single thread.
    private final static SimpleDateFormat frmt = new SimpleDateFormat(
            "yyyy-MM-dd'T'HH:mm:ss.SSS");

    public Date getMin() { return min; }

    public void setMin(Date min) { this.min = min; }

    public Date getMax() { return max; }

    public void setMax(Date max) { this.max = max; }

    public long getCount() { return count; }

    public void setCount(long count) { this.count = count; }

    /** Deserializes the three fields in the same order write() emits them. */
    @Override
    public void readFields(DataInput in) throws IOException {
        // Fresh Date instances each call, so a reducer may safely hold a
        // reference to them across iterations of a reused value object.
        min = new Date(in.readLong());
        max = new Date(in.readLong());
        count = in.readLong();
    }

    /** Serializes min, max (as epoch millis), then the count. */
    @Override
    public void write(DataOutput out) throws IOException {
        out.writeLong(min.getTime());
        out.writeLong(max.getTime());
        out.writeLong(count);
    }

    /** Tab-separated "min<TAB>max<TAB>count", matching the job's text output. */
    @Override
    public String toString() {
        return new StringBuilder(frmt.format(min)).append('\t')
                .append(frmt.format(max)).append('\t')
                .append(count).toString();
    }
}
}
这里的mrdp.utils.MRDPUtils包的代码在第一篇中已经给出。
这里最重要的是自己实现了Writable接口,自定义了value类型。有时间我另开一篇博客介绍下Writable接口。
map阶段不做任何比较和计算,只是简单地对comments.xml进行解析,把每条评论的时间解析出来,并把count赋值为1。例如解析下面这一行数据:
<row Id="1784" PostId="883" Text="Perfect distinction. I've made a note and agree entirely." CreationDate="2012-02-08T21:51:05.223" UserId="46" />
mapper会把UserId作为key,另外一个outTuple作为value,格式为(min,max,count),即(2012-02-08T21:51:05.223,2012-02-08T21:51:05.223,1)
combiner阶段直接调用reduce函数,做中间处理。
reducer阶段计算我们需要的数据,即求最大值、最小值和总数。reducer的实现较简单,就是把每个UserId对应的value循环取出,然后一一做比较,并累加count。
整个流程如下图:
得到的部分结果如下:
jpan@jpan-Beijing:~/Mywork/mapreducepatterns/testdata$ hadoop fs -cat output2/part-r-00000
10 2011-02-14T18:04:38.763 2012-07-10T22:57:00.757 8
101 2011-04-01T03:02:45.083 2011-04-01T06:02:33.307 2
10119 2012-02-08T13:54:38.623 2012-04-12T23:43:14.810 8
1057 2011-06-17T19:59:33.013 2011-06-17T19:59:33.013 1
10691 2012-04-19T01:15:44.573 2012-05-11T05:47:36.517 2
10872 2012-06-14T15:36:26.527 2012-06-14T15:45:43.347 4
10921 2011-12-07T18:08:04.583 2011-12-07T18:08:04.583 1
11 2011-05-06T02:51:50.370 2011-05-06T14:46:31.483 3
110 2010-08-12T14:52:09.830 2010-08-12T14:52:09.830 1
1118 2011-02-17T10:27:48.623 2011-02-25T09:25:09.597 2
11498 2011-12-30T11:09:58.057 2011-12-30T11:09:58.057 1
11682 2012-01-04T21:48:39.267 2012-01-04T21:48:39.267 1