During the shuffle phase of MapReduce, a large amount of data can be sent to a single node, making that node busy or even bringing it down while the other nodes sit idle (data skew). There are two ways to alleviate this problem:
1. Custom key (key salting)
Take counting word occurrences as an example. Suppose a text contains the word 100 one hundred times, how one hundred times, are one hundred times, and you one hundred times. Even if the number of reducers is set to 3, the default hash partitioning sends every record with the same key to the same partition, so one reducer can end up with most of the data. To avoid this, the map output key is redesigned by appending a random suffix, for example:
100_0 : 30
100_1 : 30
100_2 : 40
With the underscore suffix appended to the key in the mapper, records for the same word can now be sent to different nodes.
Stage one: redesign the key in the mapper
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;
import java.util.Random;

public class WCMapper extends Mapper<LongWritable, Text, Text, IntWritable> {

    // random number generator used to salt the keys
    Random r = new Random();
    int i;

    @Override
    protected void setup(Context context) throws IOException, InterruptedException {
        // get the number of reduce tasks
        i = context.getNumReduceTasks();
    }

    /**
     * The map function is called once per input line.
     * It redesigns the key by appending a random suffix.
     */
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        // convert the value to a String
        String line = value.toString();
        // split the line into words
        String[] arr = line.split(" ");
        for (String word : arr) {
            // append a random suffix in [0, numReduceTasks) to form the new key
            String newWord = word + "_" + r.nextInt(i);
            // emit (salted word, 1) to the reduce stage
            context.write(new Text(newWord), new IntWritable(1));
        }
    }
}
Stage two: the reduce stage
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;

public class WCReducer extends Reducer<Text, IntWritable, Text, IntWritable> {

    /**
     * Aggregates the values of each (salted) key.
     */
    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
        int sum = 0;
        for (IntWritable value : values) {
            sum += value.get();
        }
        context.write(key, new IntWritable(sum));
    }
}
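For reference, the first job's output is written by the default TextOutputFormat as one tab-separated key and count per line, spread across the three part files. Continuing the example above, the records for the word 100 would look roughly like this (the counts are the illustrative ones from the earlier example; the actual split depends on the random suffixes):

100_0	30
100_1	30
100_2	40

The other words (how, are, you) are salted and counted the same way. This tab-separated format is exactly what the stage-three mapper below parses.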
Stage three: a mapper that splits the salted key (word_0/1/2) from its count
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;

public class WCMapper2 extends Mapper<LongWritable, Text, Text, IntWritable> {

    /**
     * The map function is called once per line of the first job's output.
     * It strips the random suffix and restores the original word as the key.
     */
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        // convert the value to a String
        String line = value.toString();
        // the first job's output lines have the form "saltedWord<TAB>count"
        String[] arr = line.split("\t");
        String word = arr[0];
        int count = Integer.parseInt(arr[1]);
        // drop the "_n" suffix to recover the original word
        String newWord = word.split("_")[0];
        context.write(new Text(newWord), new IntWritable(count));
    }
}
Stage four: a second reduce stage that sums the counts after the suffix has been stripped. It is basically identical to the stage-two reducer (a sketch of WCReducer2 is given after the driver below). The most important part is the driver, defined in the main app:
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class WCApp {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        conf.set("fs.defaultFS", "file:///");
        conf.set("student.name", "TianYe");

        // initialize the first job from the configuration
        Job job = Job.getInstance(conf);
        // set the job name
        job.setJobName("word count");
        // the main class of the job
        job.setJarByClass(WCApp.class);
        // set the mapper class
        job.setMapperClass(WCMapper.class);
        // set the reducer class
        job.setReducerClass(WCReducer.class);
        // set the map output key/value types
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);
        // set the reduce output key/value types
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        //FileInputFormat.setMaxInputSplitSize(job, 10);
        //FileInputFormat.setMinInputSplitSize(job, 10);
        // set the input path
        FileInputFormat.addInputPath(job, new Path("D:/wc/2.txt"));
        // set the output path
        FileOutputFormat.setOutputPath(job, new Path("D:/wc/out"));
        // use three reducers
        job.setNumReduceTasks(3);
        // run the first job
        boolean b = job.waitForCompletion(true);
        if (b) {
            // the second job re-aggregates the first job's output by the original word
            Job job2 = Job.getInstance(conf);
            job2.setJobName("word count2");
            job2.setJarByClass(WCApp.class);
            job2.setMapperClass(WCMapper2.class);
            job2.setReducerClass(WCReducer2.class);
            job2.setMapOutputKeyClass(Text.class);
            job2.setMapOutputValueClass(IntWritable.class);
            job2.setOutputKeyClass(Text.class);
            job2.setOutputValueClass(IntWritable.class);
            FileInputFormat.addInputPath(job2, new Path("D:/wc/out"));
            FileOutputFormat.setOutputPath(job2, new Path("D:/wc/out2"));
            job2.waitForCompletion(true);
        }
    }
}
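The driver above references WCReducer2, which is not listed separately. As noted, it is basically identical to the stage-two reducer, so a minimal sketch, assuming the exact same summation logic, would be:

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;

public class WCReducer2 extends Reducer<Text, IntWritable, Text, IntWritable> {

    /**
     * Sums the partial counts for each original (unsalted) word.
     */
    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
        int sum = 0;
        for (IntWritable value : values) {
            sum += value.get();
        }
        context.write(key, new IntWritable(sum));
    }
}

Since the logic is identical, reusing WCReducer directly in job2.setReducerClass(...) would work just as well.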
2. Random partitioning
Random partitioning assigns each record to a randomly chosen partition. It is implemented as a custom partitioner extending Partitioner<Text, IntWritable>:
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Partitioner;

import java.util.Random;

public class RandomPartitioner extends Partitioner<Text, IntWritable> {

    Random r = new Random();

    @Override
    public int getPartition(Text text, IntWritable intWritable, int numPartitions) {
        // send each record to a random partition in [0, numPartitions)
        return r.nextInt(numPartitions);
    }
}
Register this custom partitioner in the driver's main method:
job.setPartitionerClass(RandomPartitioner.class);
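Note that with a random partitioner the mapper no longer needs to salt the key itself, but the first job's reducers still produce partial counts for the same word in different part files, so a second aggregation job is still needed; the stage-three/stage-four pair above can be reused unchanged, because word.split("_")[0] simply returns the word when there is no underscore. A minimal first-stage mapper for this variant, sketched here as a hypothetical PlainWCMapper (not part of the original code), could look like this:

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;

// hypothetical mapper: emits (word, 1) without any key salting,
// because spreading the load is now the partitioner's responsibility
public class PlainWCMapper extends Mapper<LongWritable, Text, Text, IntWritable> {

    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        for (String word : value.toString().split(" ")) {
            context.write(new Text(word), new IntWritable(1));
        }
    }
}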