MapReduce Programming: Maximum, Minimum, Average, Count, Median, Standard Deviation

The most basic MapReduce example is WordCount, and after that most courses move on to computing maximums and minimums. The textbook used in class is MapReduce Design Patterns; its first chapter introduces WordCount, followed by maximum, minimum, average, and standard deviation. The data comes from comments posted on Stack Overflow and includes the comment time, the commenting user's ID, and the comment text, supplied as an .xml input file. When a record reaches the mapper it therefore first has to be converted from XML into a map of key-value pairs, via transformXmlToMap(value.toString());

The input file takes the following form. The rows were made up by hand: only the comment time and user ID were changed, and the comment text was pasted in directly.
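Each line of Comments.xml is a single <row ... /> element; the attributes these jobs actually read are CreationDate, UserId, and Text. A few invented rows in that shape (the values are placeholders, not real Stack Overflow data):

  <row Id="1" Text="this parameter should be escaped before use" CreationDate="2019-03-01T10:15:30.000" UserId="42" />
  <row Id="2" Text="have you tried the asynchronous version of this call" CreationDate="2019-03-01T11:02:07.000" UserId="42" />
  <row Id="3" Text="the accepted answer no longer works on newer releases" CreationDate="2019-03-02T09:45:12.000" UserId="17" />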

The textbook does not show the internals of this function, but it is a basic utility that can be written by hand; its job is simply to pull the attributes out of the text and store them in a map.

public static final String[] REDIS_INSTANCES = { "p0", "p1", "p2", "p3",
        "p4", "p6" };

// This helper function parses a Stack Overflow comment row into a Map for us.
public static Map<String, String> transformXmlToMap(String xml) {
    Map<String, String> map = new HashMap<String, String>();
    try {
        // Strip the leading "<row " and trailing " />", then split on the quote
        // character so attribute names and values alternate in the token array.
        String[] tokens = xml.trim().substring(5, xml.trim().length() - 3)
                .split("\"");
        for (int i = 0; i < tokens.length - 1; i += 2) {
            String key = tokens[i].trim();                        // e.g. "UserId="
            String val = tokens[i + 1];
            map.put(key.substring(0, key.length() - 1), val);     // drop the trailing '='
        }
    } catch (StringIndexOutOfBoundsException e) {
        System.err.println(xml);    // malformed line: log it and return what we have
    }
    return map;
}
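As a quick sanity check, here is a small fragment (meant to sit in a main method next to the helper above; the row itself is made up for illustration) showing what the returned map contains:

    String row = "  <row Id=\"1\" Text=\"some comment text\" "
            + "CreationDate=\"2019-03-01T10:15:30.000\" UserId=\"42\" />";
    Map<String, String> parsed = transformXmlToMap(row);
    System.out.println(parsed.get("UserId"));        // 42
    System.out.println(parsed.get("CreationDate"));  // 2019-03-01T10:15:30.000
    System.out.println(parsed.get("Text"));          // some comment text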

Next comes the min/max count class:

This code is taken verbatim from the textbook. For the average and standard-deviation computations that follow, the book only prints the core mapper and reducer parts for reasons of space, but the boilerplate driver class can still be written by using the min/max example as a template.

First, the min/max class holds three fields: across each user's comments, the job finds the earliest comment time (the minimum) and the latest comment time (the maximum), plus a count of how many comments that user made.

It is also worth noting that toString is overridden to define the output format.

Second, MapReduce sorts as part of its normal operation: whether or not the job itself needs sorted output, the map output is automatically sorted by key during the shuffle between the map and reduce phases, because sorted data benefits most downstream processing. This is simply how MapReduce works.

package mapreduce_2019;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.*;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class MinMaxCountTuple implements Writable {

    private Date min = new Date();   // earliest comment time
    private Date max = new Date();   // latest comment time
    private long count = 0;          // number of comments

    private final static SimpleDateFormat frmt = new SimpleDateFormat(
            "yyyy-MM-dd'T'HH:mm:ss.SSS");

    public Date getMin() {
        return min;
    }

    public void setMin(Date min) {
        this.min = min;
    }

    public Date getMax() {
        return max;
    }

    public void setMax(Date max) {
        this.max = max;
    }

    public long getCount() {
        return count;
    }

    public void setCount(long count) {
        this.count = count;
    }

    public void readFields(DataInput in) throws IOException {
        min = new Date(in.readLong());
        max = new Date(in.readLong());
        count = in.readLong();
    }

    public void write(DataOutput out) throws IOException {
        out.writeLong(min.getTime());
        out.writeLong(max.getTime());
        out.writeLong(count);
    }

    public String toString() {   // overridden to define the output format
        return frmt.format(min) + "\t" + frmt.format(max) + "\t" + count;
    }

    public static final String[] REDIS_INSTANCES = { "p0", "p1", "p2", "p3",
            "p4", "p6" };

    // This helper function parses a Stack Overflow comment row into a Map for us.
    public static Map<String, String> transformXmlToMap(String xml) {
        Map<String, String> map = new HashMap<String, String>();
        try {
            String[] tokens = xml.trim().substring(5, xml.trim().length() - 3)
                    .split("\"");
            for (int i = 0; i < tokens.length - 1; i += 2) {
                String key = tokens[i].trim();
                String val = tokens[i + 1];
                map.put(key.substring(0, key.length() - 1), val);
            }
        } catch (StringIndexOutOfBoundsException e) {
            System.err.println(xml);
        }
        return map;
    }

    public static class MinMaxCountMapper
            extends Mapper<Object, Text, Text, MinMaxCountTuple> {

        private Text outUserId = new Text();
        private MinMaxCountTuple outTuple = new MinMaxCountTuple();
        private final static SimpleDateFormat frmt = new SimpleDateFormat(
                "yyyy-MM-dd'T'HH:mm:ss.SSS");

        public void map(Object key, Text value, Context context)
                throws IOException, InterruptedException {
            Map<String, String> parsed = transformXmlToMap(value.toString());
            String strDate = parsed.get("CreationDate");
            String userId = parsed.get("UserId");
            Date creationDate = null;
            try {
                creationDate = frmt.parse(strDate);
            } catch (ParseException e) {
                e.printStackTrace();
            }
            // A single record is both its own minimum and maximum, with a count of 1.
            outTuple.setMin(creationDate);
            outTuple.setMax(creationDate);
            outTuple.setCount(1);
            outUserId.set(userId);
            context.write(outUserId, outTuple);
        }
    }

    public static class MinMaxCountReducer
            extends Reducer<Text, MinMaxCountTuple, Text, MinMaxCountTuple> {

        private MinMaxCountTuple result = new MinMaxCountTuple();

        public void reduce(Text key, Iterable<MinMaxCountTuple> values, Context context)
                throws IOException, InterruptedException {
            result.setMin(null);
            result.setMax(null);
            result.setCount(0);
            long sum = 0;
            for (MinMaxCountTuple val : values) {
                if (result.getMin() == null || val.getMin().compareTo(result.getMin()) < 0) {
                    result.setMin(val.getMin());
                }
                if (result.getMax() == null || val.getMax().compareTo(result.getMax()) > 0) {
                    result.setMax(val.getMax());
                }
                sum += val.getCount();
            }
            result.setCount(sum);
            context.write(key, result);
        }
    }

    /*============================================================================================================*/

    public static void main(String[] args)
            throws IOException, ClassNotFoundException, InterruptedException {
        // Example paths: hdfs://master:9000/input/Comments.xml hdfs://master:9000/output1
        Job job = Job.getInstance();
        job.setJarByClass(MinMaxCountTuple.class);
        job.setMapperClass(MinMaxCountMapper.class);
        job.setReducerClass(MinMaxCountReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(MinMaxCountTuple.class);
        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}

The program takes two arguments: the input file path and the output path for the results.

After exporting it as a .jar, run it against the data on HDFS.
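Assuming the exported jar is named, say, minmaxcount.jar (the name is arbitrary), the min/max job can be launched roughly like this, using the HDFS paths that appear in the source; note that the output directory must not already exist:

hadoop jar minmaxcount.jar mapreduce_2019.MinMaxCountTuple hdfs://master:9000/input/Comments.xml hdfs://master:9000/output1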

Next comes the code for average and count; only the core mapper and reducer are taken verbatim from the textbook.

The driver class and main method are added by hand, following the example above.

Note that the average computed here is the average length of the comment text, grouped by the hour of day at which the comment was posted.

package mapreduce_2019;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.HashMap;
import java.util.Map;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class CountAverageTuple implements Writable {

    private float count = 0;     // number of comments
    private float average = 0;   // average comment length

    public float getCount() {
        return count;
    }

    public void setCount(float count) {
        this.count = count;
    }

    public float getAverage() {
        return average;
    }

    public void setAverage(float average) {
        this.average = average;
    }

    public void readFields(DataInput in) throws IOException {
        count = in.readFloat();
        average = in.readFloat();
    }

    public void write(DataOutput out) throws IOException {
        out.writeFloat(count);
        out.writeFloat(average);
    }

    public String toString() {   // overridden to define the output format
        return count + "\t" + average;
    }

    public static final String[] REDIS_INSTANCES = { "p0", "p1", "p2", "p3",
            "p4", "p6" };

    // This helper function parses a Stack Overflow comment row into a Map for us.
    public static Map<String, String> transformXmlToMap(String xml) {
        Map<String, String> map = new HashMap<String, String>();
        try {
            String[] tokens = xml.trim().substring(5, xml.trim().length() - 3).split("\"");
            for (int i = 0; i < tokens.length - 1; i += 2) {
                String key = tokens[i].trim();
                String val = tokens[i + 1];
                map.put(key.substring(0, key.length() - 1), val);
            }
        } catch (StringIndexOutOfBoundsException e) {
            System.err.println(xml);
        }
        return map;
    }

    public static class AverageMapper
            extends Mapper<Object, Text, IntWritable, CountAverageTuple> {

        private IntWritable outHour = new IntWritable();
        private CountAverageTuple outCountAverage = new CountAverageTuple();
        private final static SimpleDateFormat frmt = new SimpleDateFormat(
                "yyyy-MM-dd'T'HH:mm:ss.SSS");

        public void map(Object key, Text value, Context context)
                throws IOException, InterruptedException {
            Map<String, String> parsed = transformXmlToMap(value.toString());
            String strDate = parsed.get("CreationDate");
            String text = parsed.get("Text");
            Date creationDate = null;
            try {
                creationDate = frmt.parse(strDate);
            } catch (ParseException e) {
                e.printStackTrace();
            }
            outHour.set(creationDate.getHours());         // hour of day (0-23) the comment was posted
            outCountAverage.setCount(1);
            outCountAverage.setAverage(text.length());    // a single comment's "average" is its own length
            context.write(outHour, outCountAverage);
        }
    }

    public static class AverageReducer
            extends Reducer<IntWritable, CountAverageTuple, IntWritable, CountAverageTuple> {

        private CountAverageTuple result = new CountAverageTuple();

        public void reduce(IntWritable key, Iterable<CountAverageTuple> values, Context context)
                throws IOException, InterruptedException {
            float sum = 0;
            float count = 0;
            for (CountAverageTuple val : values) {
                sum += val.getCount() * val.getAverage();   // rebuild each partial sum
                count += val.getCount();
            }
            result.setCount(count);
            result.setAverage(sum / count);                 // count-weighted average
            context.write(key, result);
        }
    }

    /*============================================================================================================*/

    public static void main(String[] args)
            throws IOException, ClassNotFoundException, InterruptedException {
        // Input and output paths are hard-coded here; adjust them to your cluster.
        String[] otherargs = new String[] { "hdfs://master:9000/input/Comments.xml",
                "hdfs://master:9000/output2" };
        Job job = Job.getInstance();
        job.setJarByClass(CountAverageTuple.class);
        job.setMapperClass(AverageMapper.class);
        job.setReducerClass(AverageReducer.class);
        job.setOutputKeyClass(IntWritable.class);
        job.setOutputValueClass(CountAverageTuple.class);
        FileInputFormat.addInputPath(job, new Path(otherargs[0]));
        FileOutputFormat.setOutputPath(job, new Path(otherargs[1]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
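To spell out the arithmetic in the reducer above: each incoming tuple carries a partial count and a partial average, the reducer rebuilds each partial sum as count × average, and the combined average is the count-weighted mean of the parts. A tiny standalone check with made-up numbers (not part of the job itself):

public class WeightedAverageDemo {
    public static void main(String[] args) {
        float count1 = 3, average1 = 10.0f;   // e.g. three comments with an average length of 10
        float count2 = 1, average2 = 50.0f;   // e.g. one comment of length 50

        // Same recombination the reducer performs: rebuild the partial sums, then divide.
        float combinedCount = count1 + count2;
        float combinedAverage = (count1 * average1 + count2 * average2) / combinedCount;
        System.out.println(combinedCount + "\t" + combinedAverage);   // 4.0   20.0
    }
}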

Finally, standard deviation and the median. The code below is the plainest version, with no memory optimization. Because many comments share the same length, storing the values as key-value pairs (length → number of occurrences) instead of as a flat list saves a great deal of space: rather than walking every individual comment, the reducer can walk the aggregated counts and use each count to decide quickly whether the median falls under that key, skipping over large runs of identical values. A sketch of that memory-conscious variant appears after the full listing.

package mapreduce_2019;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Date;
import java.util.HashMap;
import java.util.Map;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class MedianStdDevTuple implements Writable {

    private float median = 0;   // median comment length
    private float stddev = 0;   // standard deviation of comment length

    private final static SimpleDateFormat frmt = new SimpleDateFormat(
            "yyyy-MM-dd'T'HH:mm:ss.SSS");   // not actually used in this class

    public float getMedian() {
        return median;
    }

    public void setMedian(float median) {
        this.median = median;
    }

    public float getStdDev() {
        return stddev;
    }

    public void setStdDev(float stddev) {
        this.stddev = stddev;
    }

    public String toString() {   // overridden to define the output format
        return median + "\t" + stddev;
    }

    @Override
    public void readFields(DataInput in) throws IOException {
        median = in.readFloat();
        stddev = in.readFloat();
    }

    @Override
    public void write(DataOutput out) throws IOException {
        out.writeFloat(median);
        out.writeFloat(stddev);
    }

    public static final String[] REDIS_INSTANCES = { "p0", "p1", "p2", "p3",
            "p4", "p6" };

    // This helper function parses a Stack Overflow comment row into a Map for us.
    public static Map<String, String> transformXmlToMap(String xml) {
        Map<String, String> map = new HashMap<String, String>();
        try {
            String[] tokens = xml.trim().substring(5, xml.trim().length() - 3).split("\"");
            for (int i = 0; i < tokens.length - 1; i += 2) {
                String key = tokens[i].trim();
                String val = tokens[i + 1];
                map.put(key.substring(0, key.length() - 1), val);
            }
        } catch (StringIndexOutOfBoundsException e) {
            System.err.println(xml);
        }
        return map;
    }

    public static class MedianStdDevMapper
            extends Mapper<Object, Text, IntWritable, IntWritable> {

        private IntWritable outHour = new IntWritable();
        private IntWritable outCommentLength = new IntWritable();
        private final static SimpleDateFormat frmt = new SimpleDateFormat(
                "yyyy-MM-dd'T'HH:mm:ss.SSS");

        public void map(Object key, Text value, Context context)
                throws IOException, InterruptedException {
            Map<String, String> parsed = transformXmlToMap(value.toString());
            String strDate = parsed.get("CreationDate");
            String text = parsed.get("Text");
            Date creationDate = null;
            try {
                creationDate = frmt.parse(strDate);
            } catch (ParseException e) {
                e.printStackTrace();
            }
            outHour.set(creationDate.getHours());   // hour of day the comment was posted
            outCommentLength.set(text.length());
            context.write(outHour, outCommentLength);
        }
    }

    public static class MedianStdDevReducer
            extends Reducer<IntWritable, IntWritable, IntWritable, MedianStdDevTuple> {

        private MedianStdDevTuple result = new MedianStdDevTuple();
        private ArrayList<Float> commentLengths = new ArrayList<Float>();

        public void reduce(IntWritable key, Iterable<IntWritable> values, Context context)
                throws IOException, InterruptedException {
            float sum = 0;
            float count = 0;
            commentLengths.clear();
            result.setStdDev(0);
            for (IntWritable val : values) {
                commentLengths.add((float) val.get());
                sum += val.get();
                ++count;
            }
            Collections.sort(commentLengths);   // sort the lengths so the median can be read off
            if (count % 2 == 0) {
                // Even count: the median is the mean of the two middle values.
                result.setMedian((commentLengths.get((int) count / 2 - 1)
                        + commentLengths.get((int) count / 2)) / 2.0f);
            } else {
                result.setMedian(commentLengths.get((int) count / 2));
            }
            float mean = sum / count;
            float sumOfSquares = 0.0f;
            for (Float f : commentLengths) {    // sum of squared deviations from the mean
                sumOfSquares += (f - mean) * (f - mean);
            }
            result.setStdDev((float) Math.sqrt(sumOfSquares / (count - 1)));
            context.write(key, result);
        }
    }

    /*============================================================================================================*/

    public static void main(String[] args)
            throws IOException, ClassNotFoundException, InterruptedException {
        // Input and output paths are hard-coded here; adjust them to your cluster.
        String[] otherargs = new String[] { "hdfs://master:9000/input/Comments.xml",
                "hdfs://master:9000/output3" };
        Job job = Job.getInstance();
        job.setJarByClass(MedianStdDevTuple.class);
        job.setMapperClass(MedianStdDevMapper.class);
        job.setReducerClass(MedianStdDevReducer.class);
        // The map output is (IntWritable, IntWritable); the reduce output value is the tuple.
        job.setMapOutputKeyClass(IntWritable.class);
        job.setMapOutputValueClass(IntWritable.class);
        job.setOutputKeyClass(IntWritable.class);
        job.setOutputValueClass(MedianStdDevTuple.class);
        FileInputFormat.addInputPath(job, new Path(otherargs[0]));
        FileOutputFormat.setOutputPath(job, new Path(otherargs[1]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
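As noted before the listing, the memory cost of this version comes from materializing every comment length in the commentLengths list. Below is a sketch of the memory-conscious idea, written as plain Java rather than as a full Hadoop job (the class, method, and variable names are my own): keep a sorted map from comment length to occurrence count, then walk the counts once to locate the median and once to accumulate the squared deviations. Wiring this into the reducer would replace the ArrayList-based computation above.

import java.util.Map;
import java.util.TreeMap;

public class MedianStdDevFromCounts {

    // Computes { median, stddev } from a sorted map of comment length -> occurrence count,
    // without expanding the counts back into a full list of values.
    static float[] medianStdDev(TreeMap<Integer, Long> counts) {
        long total = 0;
        long sum = 0;
        for (Map.Entry<Integer, Long> e : counts.entrySet()) {
            total += e.getValue();
            sum += (long) e.getKey() * e.getValue();
        }

        // Walk the sorted keys until the running count reaches the middle position.
        long medianIndex = total / 2;   // 0-based index of the upper middle element
        long seen = 0;
        float median = 0;
        Integer previousKey = null;
        for (Map.Entry<Integer, Long> e : counts.entrySet()) {
            seen += e.getValue();
            if (seen > medianIndex) {
                if (total % 2 == 0 && seen - e.getValue() == medianIndex && previousKey != null) {
                    // Even total and the two middle values straddle a key boundary.
                    median = (previousKey + e.getKey()) / 2.0f;
                } else {
                    median = e.getKey();
                }
                break;
            }
            previousKey = e.getKey();
        }

        // Sum of squared deviations from the mean, weighted by each length's count.
        float mean = (float) sum / total;
        float sumOfSquares = 0;
        for (Map.Entry<Integer, Long> e : counts.entrySet()) {
            float diff = e.getKey() - mean;
            sumOfSquares += diff * diff * e.getValue();
        }
        float stdDev = (float) Math.sqrt(sumOfSquares / (total - 1));
        return new float[] { median, stdDev };
    }

    public static void main(String[] args) {
        TreeMap<Integer, Long> counts = new TreeMap<Integer, Long>();
        counts.put(10, 2L);   // two comments of length 10
        counts.put(20, 1L);   // one comment of length 20
        counts.put(40, 1L);   // one comment of length 40
        System.out.println(java.util.Arrays.toString(medianStdDev(counts)));   // [15.0, 14.142136]
    }
}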
