数据倾斜定义:数据倾斜是指大量相同的key被partition分配到同一个分区里。其他key的数据量不大,对应的task很快就完成了计算,而数据量特别大的那个key所在的task却迟迟运行不出结果,造成了“一个人累死,其他人闲死”的情况。
数据倾斜的解决
1.前面文章中论述的Combine组件利用map阶段的计算去减轻负担,但是需要注意的地方太多
2.常规有效的解决方案——打散倾斜的key
整体思路:
(1) 在key的后面加上一个随机数,方便分配task的时候均匀分配
(2) 加上随机数的key分配均匀计算出结果后传入第二个map
(3) 在第二个map中将传入的key去掉随机数后缀,还原成原来的key,再进行聚合
解决实例,还是以经典的词频分析为例
(1)第一个map
/**
 * Stage one of a two-stage word count that spreads skewed keys over reducers.
 *
 * <p>Each word is emitted with a random numeric suffix ("salt") so that the
 * records of one very frequent word no longer share a single key. The output
 * of this job is a partial count per salted word; a follow-up job has to strip
 * the salt and sum the partial counts to obtain the final per-word totals.
 */
public class SkewWordcount {

    /** Separator between a word and its salt in the emitted key. */
    private static final String SALT_SEPARATOR = "\001";

    private static final String INPUT_PATH = "f:/mrdata/wordcount/input";
    private static final String OUTPUT_PATH = "f:/mrdata/wordcount/skew-out";

    /**
     * Emits {@code (word + SALT_SEPARATOR + salt, 1)} for every word of a line.
     */
    public static class SkewWordcountMapper extends Mapper<LongWritable, Text, Text, IntWritable> {

        // Reused across map() calls to avoid allocating new objects per record.
        private final Random random = new Random();
        private final Text saltedWord = new Text();
        private final IntWritable one = new IntWritable(1);

        /** Exclusive upper bound of the salt; always at least 1. */
        private int saltRange = 1;

        /**
         * Reads the configured number of reduce tasks and uses it as the salt range.
         *
         * <p>The value is clamped to 1 because {@link Random#nextInt(int)} rejects
         * a non-positive bound, which would otherwise fail a job configured with
         * zero reduce tasks.
         */
        @Override
        protected void setup(Context context) throws IOException, InterruptedException {
            saltRange = Math.max(1, context.getNumReduceTasks());
        }

        /**
         * Splits the line into words and writes each one with a random salt.
         *
         * <p>Words are separated by any run of whitespace. Empty tokens (produced
         * by leading whitespace) are skipped, and because tabs count as separators
         * a word can never contain the tab that delimits key and count in
         * tab-separated text output.
         *
         * @param key     input key of the record (unused)
         * @param value   one line of input text
         * @param context sink for the salted word / count pairs
         */
        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            for (String word : value.toString().split("\\s+")) {
                if (word.isEmpty()) {
                    continue;
                }
                // Append the salt so equal words are spread over several keys.
                saltedWord.set(word + SALT_SEPARATOR + random.nextInt(saltRange));
                context.write(saltedWord, one);
            }
        }
    }

    /**
     * Sums the counts of one salted word. Also registered as the combiner.
     */
    public static class SkewWordcountReducer extends Reducer<Text, IntWritable, Text, IntWritable> {

        private final IntWritable total = new IntWritable();

        /**
         * Writes the salted key together with the sum of its values.
         *
         * @throws ArithmeticException if the sum does not fit into an {@code int}
         */
        @Override
        protected void reduce(Text key, Iterable<IntWritable> values, Context context)
                throws IOException, InterruptedException {
            int count = 0;
            for (IntWritable value : values) {
                // Fail loudly instead of silently wrapping to a negative count.
                count = Math.addExact(count, value.get());
            }
            total.set(count);
            context.write(key, total);
        }
    }

    /**
     * Configures and runs the job, exiting with 0 on success and 1 on failure.
     *
     * @param args command-line arguments (unused)
     */
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);

        job.setJarByClass(SkewWordcount.class);

        job.setMapperClass(SkewWordcountMapper.class);
        job.setReducerClass(SkewWordcountReducer.class);

        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);

        // Class used for local (map-side) aggregation.
        job.setCombinerClass(SkewWordcountReducer.class);

        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        FileInputFormat.setInputPaths(job, new Path(INPUT_PATH));
        FileOutputFormat.setOutputPath(job, new Path(OUTPUT_PATH));

        job.setNumReduceTasks(3);

        boolean succeeded = job.waitForCompletion(true);
        System.exit(succeeded ? 0 : 1);
    }
}
(2)第二个map
/**
 * Stage two of the skew-tolerant word count: removes the salt that stage one
 * appended to every word and sums the partial counts per original word.
 *
 * <p>Input lines are expected to look like
 * {@code <word>\001<salt>\t<count>}.
 */
public class SkewWordcount2 {

    /** Separator between key and count in an input line. */
    private static final char FIELD_SEPARATOR = '\t';

    /** Separator between the original word and its salt. */
    private static final char SALT_SEPARATOR = '\001';

    private static final String INPUT_PATH = "f:/mrdata/wordcount/skew-out";
    private static final String OUTPUT_PATH = "f:/mrdata/wordcount/skew-out2";

    /**
     * Turns a {@code salted word / partial count} line back into
     * {@code (word, partial count)}.
     */
    public static class SkewWordcountMapper extends Mapper<LongWritable, Text, Text, IntWritable> {

        // Reused across map() calls to avoid allocating new objects per record.
        private final Text word = new Text();
        private final IntWritable partialCount = new IntWritable();

        /**
         * Parses one line and emits the unsalted word with its partial count.
         *
         * <p>The line is parsed from the right: the count is whatever follows the
         * last tab, and the salt is whatever follows the last {@code \001} of the
         * key. This keeps words intact even when they themselves contain a tab or
         * a {@code \001} character. A key without any {@code \001} is passed
         * through unchanged.
         *
         * @param key     input key of the record (unused)
         * @param value   one line of stage-one output
         * @param context sink for the word / partial count pairs
         * @throws IOException if the line has no tab or its count is not an integer
         */
        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            String line = value.toString();

            int fieldEnd = line.lastIndexOf(FIELD_SEPARATOR);
            if (fieldEnd < 0) {
                throw new IOException("Expected '<key>\\t<count>' but found no tab in line: " + line);
            }

            int count;
            try {
                count = Integer.parseInt(line.substring(fieldEnd + 1));
            } catch (NumberFormatException e) {
                throw new IOException("Count is not an integer in line: " + line, e);
            }

            // Cut off the salt to restore the original key.
            String saltedWord = line.substring(0, fieldEnd);
            int saltStart = saltedWord.lastIndexOf(SALT_SEPARATOR);
            word.set(saltStart < 0 ? saltedWord : saltedWord.substring(0, saltStart));

            partialCount.set(count);
            context.write(word, partialCount);
        }
    }

    /**
     * Sums the partial counts of one word. Also registered as the combiner.
     */
    public static class SkewWordcountReducer extends Reducer<Text, IntWritable, Text, IntWritable> {

        private final IntWritable total = new IntWritable();

        /**
         * Writes the word together with the sum of its values.
         *
         * @throws ArithmeticException if the sum does not fit into an {@code int}
         */
        @Override
        protected void reduce(Text key, Iterable<IntWritable> values, Context context)
                throws IOException, InterruptedException {
            int count = 0;
            for (IntWritable value : values) {
                // Fail loudly instead of silently wrapping to a negative count.
                count = Math.addExact(count, value.get());
            }
            total.set(count);
            context.write(key, total);
        }
    }

    /**
     * Configures and runs the job, exiting with 0 on success and 1 on failure.
     *
     * @param args command-line arguments (unused)
     */
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);

        job.setJarByClass(SkewWordcount2.class);

        job.setMapperClass(SkewWordcountMapper.class);
        job.setReducerClass(SkewWordcountReducer.class);

        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);

        // Class used for local (map-side) aggregation.
        job.setCombinerClass(SkewWordcountReducer.class);

        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        FileInputFormat.setInputPaths(job, new Path(INPUT_PATH));
        FileOutputFormat.setOutputPath(job, new Path(OUTPUT_PATH));

        job.setNumReduceTasks(3);

        boolean succeeded = job.waitForCompletion(true);
        System.exit(succeeded ? 0 : 1);
    }
}