This example solves the following problem: for each hour of the day, compute the median length of the site's newly posted comments, together with the standard deviation of those lengths.
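The input is a StackOverflow comments data dump, one XML row element per line; MRDPUtils.transformXmlToMap flattens such a line into a Map from attribute names to values. A hypothetical input record (the attribute values here are invented for illustration; only CreationDate and Text are used below) might look like:
<row Id="123" PostId="35314" Score="1" Text="How often does this actually happen?" CreationDate="2010-07-19T19:12:12.300" UserId="3" />
The code is as follows: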
package mrdp.ch2;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Date;
import java.util.Map;
import mrdp.utils.MRDPUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
public class MedianStdDevDriver {
public static class SOMedianStdDevMapper extends
Mapper<Object, Text, IntWritable, IntWritable> {
private IntWritable outHour = new IntWritable();
private IntWritable outCommentLength = new IntWritable();
private final static SimpleDateFormat frmt = new SimpleDateFormat(
"yyyy-MM-dd'T'HH:mm:ss.SSS");
@SuppressWarnings("deprecation")
@Override
public void map(Object key, Text value, Context context)
throws IOException, InterruptedException {
// Parse the input string into a nice map
Map<String, String> parsed = MRDPUtils.transformXmlToMap(value.toString());
// Grab the "CreationDate" field,
// since it is what we are grouping by
String strDate = parsed.get("CreationDate");
// Grab the comment to find the length
String text = parsed.get("Text");
// .get will return null if the key is not there
if (strDate == null || text == null) {
// skip this record
return;
}
try {
// get the hour this comment was posted in
Date creationDate = frmt.parse(strDate);
outHour.set(creationDate.getHours());
// get the comment length
outCommentLength.set(text.length());
// write out the hour with the comment length
context.write(outHour, outCommentLength);
} catch (ParseException e) {
System.err.println(e.getMessage());
return;
}
}
}
public static class SOMedianStdDevReducer extends
Reducer<IntWritable, IntWritable, IntWritable, MedianStdDevTuple> {
private MedianStdDevTuple result = new MedianStdDevTuple();
private ArrayList<Float> commentLengths = new ArrayList<Float>();
@Override
public void reduce(IntWritable key, Iterable<IntWritable> values,
Context context) throws IOException, InterruptedException {
float sum = 0;
float count = 0;
commentLengths.clear();
result.setStdDev(0);
// Iterate through all input values for this key
for (IntWritable val : values) {
commentLengths.add((float) val.get());
sum += val.get();
++count;
}
// sort commentLengths to calculate median
Collections.sort(commentLengths);
// if commentLengths is an even value, average middle two elements
if (count % 2 == 0) {
result.setMedian((commentLengths.get((int) count / 2 - 1) + commentLengths
.get((int) count / 2)) / 2.0f);
} else {
// else, set median to middle value
result.setMedian(commentLengths.get((int) count / 2));
}
// calculate standard deviation
float mean = sum / count;
float sumOfSquares = 0.0f;
for (Float f : commentLengths) {
sumOfSquares += (f - mean) * (f - mean);
}
result.setStdDev((float) Math.sqrt(sumOfSquares / (count - 1)));
context.write(key, result);
}
}
public static void main(String[] args) throws Exception {
Configuration conf = new Configuration();
String[] otherArgs = new GenericOptionsParser(conf, args)
.getRemainingArgs();
if (otherArgs.length != 2) {
System.err.println("Usage: MedianStdDevDriver ");
System.exit(2);
}
Job job = new Job(conf,
"StackOverflow Comment Length Median StdDev By Hour");
job.setJarByClass(MedianStdDevDriver.class);
job.setMapperClass(SOMedianStdDevMapper.class);
job.setReducerClass(SOMedianStdDevReducer.class);
job.setMapOutputKeyClass(IntWritable.class);
job.setMapOutputValueClass(IntWritable.class);
job.setOutputKeyClass(IntWritable.class);
job.setOutputValueClass(MedianStdDevTuple.class);
FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
System.exit(job.waitForCompletion(true) ? 0 : 1);
}
public static class MedianStdDevTuple implements Writable {
private float median = 0;
private float stddev = 0f;
public float getMedian() {
return median;
}
public void setMedian(float median) {
this.median = median;
}
public float getStdDev() {
return stddev;
}
public void setStdDev(float stddev) {
this.stddev = stddev;
}
@Override
public void readFields(DataInput in) throws IOException {
median = in.readFloat();
stddev = in.readFloat();
}
@Override
public void write(DataOutput out) throws IOException {
out.writeFloat(median);
out.writeFloat(stddev);
}
@Override
public String toString() {
return median + "\t" + stddev;
}
}
}
There is a small trick in computing the median: all the comment lengths are first stored in a list, the list is sorted, and the element at the middle index is taken. Two cases are handled separately depending on whether the count is even or odd: with an even count the two middle elements are averaged, and with an odd count the middle element is used directly. Note that this approach buffers every comment length for a given hour in memory, which can become a problem when a single key has a very large number of values.
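Pulled out of the reducer as a standalone sketch (the helper name median and its standalone form are mine, not part of the original listing; it assumes the java.util imports from the code above and a non-empty list):
private static float median(ArrayList<Float> lengths) {
    Collections.sort(lengths); // order the lengths ascending
    int n = lengths.size();
    if (n % 2 == 0) {
        // even count: average the two middle elements
        return (lengths.get(n / 2 - 1) + lengths.get(n / 2)) / 2.0f;
    }
    // odd count: take the middle element directly
    return lengths.get(n / 2);
}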
The standard deviation is computed straightforwardly from its mathematical definition: take the mean of the lengths, sum the squared deviations from that mean, divide by count - 1, and take the square root, i.e. s = sqrt( Σ (x_i - mean)² / (n - 1) ). Dividing by count - 1 rather than count makes this the sample standard deviation.
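Again as a standalone sketch mirroring the reducer (the helper name sampleStdDev is mine, not part of the original listing):
private static float sampleStdDev(ArrayList<Float> lengths, float mean) {
    float sumOfSquares = 0.0f;
    for (Float f : lengths) {
        sumOfSquares += (f - mean) * (f - mean); // squared deviation from the mean
    }
    // divide by n - 1 rather than n, matching the reducer above
    return (float) Math.sqrt(sumOfSquares / (lengths.size() - 1));
}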
The results are as follows (columns: hour, median, standard deviation, tab-separated):
jpan@jpan-Beijing:~/Mywork/mapreducepatterns/testdata$ hadoop fs -cat output3/part-r-00000
0	145.5	158.66512
1	218.0	150.04599
2	139.0	148.84734
3	200.0	158.28148
4	139.5	158.62466
5	122.5	167.31377
6	199.5	160.57263
7	238.0	175.86475
8	253.5	164.08226
9	232.0	167.5952
10	200.0	157.11778
11	179.0	144.3936
12	172.0	148.96738
13	229.0	134.17366
14	207.0	147.26193
15	224.0	147.52689
16	143.0	130.6711
17	177.0	158.20508
18	199.0	159.31636
19	175.5	147.4742
20	169.0	138.74756
21	164.0	141.22824
22	152.5	122.51671
23	145.0	160.20476
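For reference, the output3 directory read above would be produced by a launch along these lines (the jar and input file names here are hypothetical):
hadoop jar mrdp.jar mrdp.ch2.MedianStdDevDriver comments.xml output3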