MapReduce 编程实例(4)——求中位数和标准差

这个实例解决问题是:计算一天的每个小时中,网站新增评论长度的中位数和这些长度之间的标准差。代码如下:

package mrdp.ch2;

import java.io.DataInput;

import java.io.DataOutput;

import java.io.IOException;

import java.text.ParseException;

import java.text.SimpleDateFormat;

import java.util.ArrayList;

import java.util.Collections;

import java.util.Date;

import java.util.Map;

import mrdp.utils.MRDPUtils;

import org.apache.hadoop.conf.Configuration;

import org.apache.hadoop.fs.Path;

import org.apache.hadoop.io.IntWritable;

import org.apache.hadoop.io.Text;

import org.apache.hadoop.io.Writable;

import org.apache.hadoop.mapreduce.Job;

import org.apache.hadoop.mapreduce.Mapper;

import org.apache.hadoop.mapreduce.Reducer;

import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;

import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import org.apache.hadoop.util.GenericOptionsParser;

public class MedianStdDevDriver {

	/**
	 * Mapper: for every StackOverflow comment record, emits
	 * (hour-of-day, comment length). Records missing a CreationDate or
	 * Text attribute are skipped.
	 *
	 * NOTE(review): the original listing lost all generic type arguments
	 * to HTML extraction (e.g. "Mapper{", "Mapparsed = ..."); they are
	 * restored here so the code compiles.
	 */
	public static class SOMedianStdDevMapper extends
			Mapper<Object, Text, IntWritable, IntWritable> {

		private IntWritable outHour = new IntWritable();
		private IntWritable outCommentLength = new IntWritable();

		// NOTE(review): SimpleDateFormat is not thread-safe; sharing a
		// static instance is safe only while each map task runs
		// single-threaded, which is the Hadoop default.
		private final static SimpleDateFormat frmt = new SimpleDateFormat(
				"yyyy-MM-dd'T'HH:mm:ss.SSS");

		@SuppressWarnings("deprecation") // Date.getHours()
		@Override
		public void map(Object key, Text value, Context context)
				throws IOException, InterruptedException {
			// Parse the XML attribute line into a key/value map.
			Map<String, String> parsed = MRDPUtils
					.transformXmlToMap(value.toString());

			// Grouping key: the timestamp the comment was created at.
			String strDate = parsed.get("CreationDate");
			// The comment body, whose length we aggregate.
			String text = parsed.get("Text");

			// .get returns null if the attribute is absent; skip such records.
			if (strDate == null || text == null) {
				return;
			}

			try {
				// Key: the hour of day this comment was posted in.
				Date creationDate = frmt.parse(strDate);
				outHour.set(creationDate.getHours());

				// Value: the length of the comment text.
				outCommentLength.set(text.length());

				// Emit (hour, comment length) for the reducer to aggregate.
				context.write(outHour, outCommentLength);
			} catch (ParseException e) {
				// Malformed date: log and drop the record.
				System.err.println(e.getMessage());
			}
		}
	}

public static class SOMedianStdDevReducer extends

Reducer{

private MedianStdDevTuple result = new MedianStdDevTuple();

private ArrayListcommentLengths = new ArrayList();

@Override

public void reduce(IntWritable key, Iterablevalues,

Context context) throws IOException, InterruptedException {

float sum = 0;

float count = 0;

commentLengths.clear();

result.setStdDev(0);

// Iterate through all input values for this key

for (IntWritable val : values) {

commentLengths.add((float) val.get());

sum += val.get();

++count;

}

// sort commentLengths to calculate median

Collections.sort(commentLengths);

// if commentLengths is an even value, average middle two elements

if (count % 2 == 0) {

result.setMedian((commentLengths.get((int) count / 2 - 1) + commentLengths

.get((int) count / 2)) / 2.0f);

} else {

// else, set median to middle value

result.setMedian(commentLengths.get((int) count / 2));

}

// calculate standard deviation

float mean = sum / count;

float sumOfSquares = 0.0f;

for (Float f : commentLengths) {

sumOfSquares += (f - mean) * (f - mean);

}

result.setStdDev((float) Math.sqrt(sumOfSquares / (count - 1)));

context.write(key, result);

}

}

/**
 * Driver: parses {@code <in> <out>} from the command line (after the
 * generic Hadoop options), configures the job, and submits it, exiting
 * with 0 on success and 1 on failure.
 */
public static void main(String[] args) throws Exception {
	Configuration conf = new Configuration();
	String[] otherArgs = new GenericOptionsParser(conf, args)
			.getRemainingArgs();
	if (otherArgs.length != 2) {
		// NOTE(review): the placeholders were lost to HTML extraction in
		// the original listing ("Usage: MedianStdDevDriver "); restored.
		System.err.println("Usage: MedianStdDevDriver <in> <out>");
		System.exit(2);
	}
	Job job = new Job(conf,
			"StackOverflow Comment Length Median StdDev By Hour");
	job.setJarByClass(MedianStdDevDriver.class);
	job.setMapperClass(SOMedianStdDevMapper.class);
	job.setReducerClass(SOMedianStdDevReducer.class);
	// Map output: (hour, comment length).
	job.setMapOutputKeyClass(IntWritable.class);
	job.setMapOutputValueClass(IntWritable.class);
	// Reduce output: (hour, median + stddev tuple).
	job.setOutputKeyClass(IntWritable.class);
	job.setOutputValueClass(MedianStdDevTuple.class);
	FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
	FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
	System.exit(job.waitForCompletion(true) ? 0 : 1);
}

/**
 * Writable value type pairing the per-hour median and standard deviation
 * of comment lengths. Serialized as two floats, median first — the order
 * in {@code write} and {@code readFields} must stay in sync.
 */
public static class MedianStdDevTuple implements Writable {

	// Written/read first in the serialized form.
	private float medianValue = 0f;
	// Written/read second in the serialized form.
	private float stdDevValue = 0f;

	public float getMedian() {
		return medianValue;
	}

	public void setMedian(float median) {
		medianValue = median;
	}

	public float getStdDev() {
		return stdDevValue;
	}

	public void setStdDev(float stddev) {
		stdDevValue = stddev;
	}

	@Override
	public void readFields(DataInput in) throws IOException {
		// Same field order as write().
		medianValue = in.readFloat();
		stdDevValue = in.readFloat();
	}

	@Override
	public void write(DataOutput out) throws IOException {
		out.writeFloat(medianValue);
		out.writeFloat(stdDevValue);
	}

	/** Tab-separated "median\tstddev", the on-disk text output format. */
	@Override
	public String toString() {
		return medianValue + "\t" + stdDevValue;
	}
}

}

这里在计算中位数时稍微有点技巧:先把所有评论的长度存入一个数组中,然后对这个数组进行排序,排序完后取中间下标对应的那个值即可。求中间下标对应的长度时,分数组长度为偶数和奇数两种情况,分别计算。

求标准差就是简单的根据数学定义求的。

计算结果如下:

jpan@jpan-Beijing:~/Mywork/mapreducepatterns/testdata$ hadoop fs -cat output3/part-r-00000

0	145.5	158.66512
1	218.0	150.04599
2	139.0	148.84734
3	200.0	158.28148
4	139.5	158.62466
5	122.5	167.31377
6	199.5	160.57263
7	238.0	175.86475
8	253.5	164.08226
9	232.0	167.5952
10	200.0	157.11778
11	179.0	144.3936
12	172.0	148.96738
13	229.0	134.17366
14	207.0	147.26193
15	224.0	147.52689
16	143.0	130.6711
17	177.0	158.20508
18	199.0	159.31636
19	175.5	147.4742
20	169.0	138.74756
21	164.0	141.22824
22	152.5	122.51671
23	145.0	160.20476

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值