The Shuffle process in MapReduce:
On the map side, each map task partitions its output by reducer, sorts within each partition, and spills to local disk, merging the spill files into one sorted file; on the reduce side, each reduce task fetches its partition from every map output, merges the sorted segments, and groups values by key before calling reduce.
For a detailed walkthrough of the mechanism, see:
http://weixiaolu.iteye.com/blog/1474172
Data types:
1. These data types all implement the Writable interface, so values of these types can be serialized for network transfer and file storage (key types additionally implement WritableComparable so they can be sorted during the shuffle); see the round-trip sketch after this list.
2. Basic data types:
BooleanWritable, ByteWritable, DoubleWritable
FloatWritable, IntWritable, LongWritable
Text: stores text in UTF-8 format
NullWritable: used when the key or value of a <key,value> pair is empty
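A minimal sketch of that serialization contract: write(DataOutput) turns a Writable into bytes and readFields(DataInput) restores it. The class name WritableRoundTrip is made up for illustration; the Writable methods themselves are the real Hadoop API.

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import org.apache.hadoop.io.IntWritable;

public class WritableRoundTrip {
    public static void main(String[] args) throws Exception {
        IntWritable original = new IntWritable(42);
        // serialize: Writable.write(DataOutput) produces the on-wire bytes
        ByteArrayOutputStream buffer = new ByteArrayOutputStream();
        original.write(new DataOutputStream(buffer));
        // deserialize: Writable.readFields(DataInput) restores the value
        IntWritable restored = new IntWritable();
        restored.readFields(new DataInputStream(new ByteArrayInputStream(buffer.toByteArray())));
        System.out.println(restored.get()); // prints 42
    }
}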
MapReduce WordCount (the MapReduce programming model follows the same structure as the program below):
package com.fb.hadoop.mapreduce;

import java.io.IOException;
import java.util.StringTokenizer;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
public class WordCount extends Configured implements Tool {

    // Map class: emits <word, 1> for every whitespace-delimited token in a line.
    // Renamed from Mapper to WordCountMapper to avoid shadowing org.apache.hadoop.mapreduce.Mapper.
    public static class WordCountMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
        private Text mapOutputKey = new Text();
        private final static IntWritable mapOutputValue = new IntWritable(1);

        @Override
        public void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            String lineValue = value.toString();
            StringTokenizer stringTokenizer = new StringTokenizer(lineValue);
            while (stringTokenizer.hasMoreTokens()) {
                String wordValue = stringTokenizer.nextToken();
                // set the word as the output key
                mapOutputKey.set(wordValue);
                // output <word, 1>
                context.write(mapOutputKey, mapOutputValue);
            }
        }
    }
    // Reduce class: sums the counts emitted for each word.
    // Renamed from Reducer to WordCountReducer to avoid shadowing org.apache.hadoop.mapreduce.Reducer.
    public static class WordCountReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
        private IntWritable outputValue = new IntWritable();

        @Override
        public void reduce(Text key, Iterable<IntWritable> values, Context context)
                throws IOException, InterruptedException {
            int sum = 0;
            // iterate over all counts for this word
            for (IntWritable value : values) {
                sum += value.get();
            }
            outputValue.set(sum);
            context.write(key, outputValue);
        }
    }
    // Driver
    public int run(String[] args) throws Exception {
        // get configuration
        Configuration conf = getConf();
        // create the Job
        Job job = Job.getInstance(conf, this.getClass().getSimpleName());
        // the jar that carries this class
        job.setJarByClass(this.getClass());
        // set up the Job: input -> map -> shuffle -> reduce -> output
        // 1. input
        Path inPath = new Path(args[0]);
        FileInputFormat.addInputPath(job, inPath);
        // 2. map
        job.setMapperClass(WordCountMapper.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);
        // 3. shuffle (optional hooks; cls is a placeholder class literal,
        //    see the WordPartitioner sketch after this listing)
        /******************shuffle****************************
        // 1. partitioner
        job.setPartitionerClass(cls);
        // 2. sort
        job.setSortComparatorClass(cls);
        // 3. combiner (optional)
        job.setCombinerClass(cls);
        // 4. group
        job.setGroupingComparatorClass(cls);
        ******************shuffle****************************/
        // 4. reduce
        job.setReducerClass(WordCountReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        // set the number of reduce tasks
        // job.setNumReduceTasks(10);
        // 5. output
        Path outPath = new Path(args[1]);
        FileOutputFormat.setOutputPath(job, outPath);
        // submit the job and wait for completion
        boolean isSuccess = job.waitForCompletion(true);
        return isSuccess ? 0 : 1;
    }
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        int status = ToolRunner.run(conf, new WordCount(), args);
        System.exit(status);
    }
}
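To run the job, package the class into a jar and launch it through ToolRunner, passing the input and output paths as args[0] and args[1]; the jar name and HDFS paths here are placeholders:

hadoop jar wordcount.jar com.fb.hadoop.mapreduce.WordCount /user/fb/input /user/fb/output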
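The shuffle hooks commented out in run() take real classes in place of the cls placeholders. As an illustration only (not from the original notes), a minimal custom partitioner might look like the sketch below; the name WordPartitioner is hypothetical, and the body simply reproduces the default hash behavior so that every occurrence of a word goes to the same reduce task:

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Partitioner;

public class WordPartitioner extends Partitioner<Text, IntWritable> {
    @Override
    public int getPartition(Text key, IntWritable value, int numPartitions) {
        // mask off the sign bit before taking the modulus, as HashPartitioner does
        return (key.hashCode() & Integer.MAX_VALUE) % numPartitions;
    }
}

It would be wired in with job.setPartitionerClass(WordPartitioner.class). For WordCount a combiner is also safe: job.setCombinerClass(WordCountReducer.class) works because summing partial counts is associative and the reducer's input and output types match.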