打散倾斜原理:
1、获取NumReduceTasks(reduce任务的个数)n,并以n为上界生成随机数,取值范围为[0, n)。
2、在map输出结果时,将该随机数用分隔符\001拼接到key的末尾,使同一个热点key被打散到不同的reduce任务上。
3、得出的结果再重新进行mapreduce计算,将后缀切掉,重新聚合。
第一次mapreduce:
/**
 * First pass of a two-job word count that spreads skewed (hot) keys across reducers.
 *
 * <p>Each word emitted by the mapper is suffixed with {@code \001} plus a random integer in
 * {@code [0, numReduceTasks)}, so occurrences of the same word are distributed over several
 * reduce tasks. The output therefore contains partial counts keyed by {@code word\001salt};
 * a follow-up job ({@code SkewWordcount2}) strips the suffix and merges the partial counts.
 */
public class SkewWordcount {

    /** Separator placed between a word and its random salt in the emitted key. */
    private static final String SALT_SEPARATOR = "\001";

    /** Input directory used when no command-line argument is given. */
    private static final String DEFAULT_INPUT = "f:/mrdata/wordcount/input";

    /** Output directory used when fewer than two command-line arguments are given. */
    private static final String DEFAULT_OUTPUT = "f:/mrdata/wordcount/skew-out";

    /**
     * Splits each input line on single spaces and emits {@code (word\001salt, 1)} per word.
     */
    public static class SkewWordcountMapper extends Mapper<LongWritable, Text, Text, IntWritable> {

        private final Random random = new Random();
        // Writables are reused across map() calls to avoid allocating one per record.
        private final Text outKey = new Text();
        private final IntWritable one = new IntWritable(1);
        // Exclusive upper bound for the random salt; always >= 1.
        private int saltBound = 1;

        /**
         * Reads the configured number of reduce tasks, which bounds the random salt.
         */
        @Override
        protected void setup(Context context) throws IOException, InterruptedException {
            // Random.nextInt(bound) rejects bound <= 0, which would happen for a job
            // configured with zero reducers; clamp so the mapper still works.
            saltBound = Math.max(1, context.getNumReduceTasks());
        }

        /**
         * Emits one salted key with count 1 for every non-empty space-separated token.
         *
         * @param key     input key supplied by the framework (not used)
         * @param value   one line of input text
         * @param context sink for the salted {@code (word, 1)} pairs
         */
        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            for (String word : value.toString().split(" ")) {
                // Blank lines and leading/repeated spaces yield empty tokens; they are
                // not words and must not be counted.
                if (word.isEmpty()) {
                    continue;
                }
                outKey.set(word + SALT_SEPARATOR + random.nextInt(saltBound));
                context.write(outKey, one);
            }
        }
    }

    /**
     * Sums the counts of one salted key. Also registered as the combiner in {@link #main}.
     */
    public static class SkewWordcountReducer extends Reducer<Text, IntWritable, Text, IntWritable> {

        private final IntWritable total = new IntWritable();

        /**
         * Writes {@code (key, sum of values)}.
         *
         * @throws ArithmeticException if the sum does not fit in an {@code int}
         */
        @Override
        protected void reduce(Text key, Iterable<IntWritable> values, Context context)
                throws IOException, InterruptedException {
            int sum = 0;
            for (IntWritable partial : values) {
                // Fail loudly instead of silently wrapping to a negative count.
                sum = Math.addExact(sum, partial.get());
            }
            total.set(sum);
            context.write(key, total);
        }
    }

    /**
     * Configures and runs the first (salting) job with 3 reduce tasks.
     *
     * @param args optional: {@code args[0]} input directory, {@code args[1]} output directory;
     *             missing arguments fall back to the built-in default paths
     */
    public static void main(String[] args) throws Exception {
        String inputDir = args.length > 0 ? args[0] : DEFAULT_INPUT;
        String outputDir = args.length > 1 ? args[1] : DEFAULT_OUTPUT;

        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);
        job.setJarByClass(SkewWordcount.class);

        job.setMapperClass(SkewWordcountMapper.class);
        job.setReducerClass(SkewWordcountReducer.class);
        // Map-side local aggregation: the reducer doubles as the combiner.
        job.setCombinerClass(SkewWordcountReducer.class);

        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        FileInputFormat.setInputPaths(job, new Path(inputDir));
        FileOutputFormat.setOutputPath(job, new Path(outputDir));

        job.setNumReduceTasks(3);

        boolean succeeded = job.waitForCompletion(true);
        System.exit(succeeded ? 0 : 1);
    }
}
第二次mapreduce(以第一次mapreduce的输出作为输入,切掉随机后缀后重新聚合):
/**
 * Second pass of the two-job skew-resistant word count.
 *
 * <p>Reads lines of the form {@code word\001salt<TAB>count}, removes the {@code \001salt}
 * suffix from the key, and sums the partial counts so that each original word ends up with
 * a single total.
 */
public class SkewWordcount2 {

    /** Separator between the original word and its salt inside the key. */
    private static final char SALT_SEPARATOR = '\001';

    /** Separator between key and count on each input line. */
    private static final char FIELD_SEPARATOR = '\t';

    /** Counter group / name used to report input lines that could not be parsed. */
    private static final String COUNTER_GROUP = "SkewWordcount2";
    private static final String MALFORMED_LINES = "MALFORMED_LINES";

    /** Input directory used when no command-line argument is given. */
    private static final String DEFAULT_INPUT = "f:/mrdata/wordcount/skew-out";

    /** Output directory used when fewer than two command-line arguments are given. */
    private static final String DEFAULT_OUTPUT = "f:/mrdata/wordcount/skew-out2";

    /**
     * Parses one {@code saltedWord<TAB>count} line and emits {@code (word, count)} with the
     * salt removed.
     */
    public static class SkewWordcountMapper extends Mapper<LongWritable, Text, Text, IntWritable> {

        // Writables are reused across map() calls to avoid allocating one per record.
        private final Text outKey = new Text();
        private final IntWritable outCount = new IntWritable(1);

        /**
         * Emits the unsalted word with its partial count. Lines without a tab or with a
         * non-integer count are skipped and tallied in the {@code MALFORMED_LINES} counter.
         *
         * @param key     input key supplied by the framework (not used)
         * @param value   one line of the form {@code word\001salt<TAB>count}
         * @param context sink for the {@code (word, count)} pair
         */
        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            String line = value.toString();

            // The count is always the LAST field, so split from the right: the word itself
            // may legitimately contain tab characters.
            int tabAt = line.lastIndexOf(FIELD_SEPARATOR);
            if (tabAt < 0) {
                context.getCounter(COUNTER_GROUP, MALFORMED_LINES).increment(1);
                return;
            }

            int partialCount;
            try {
                partialCount = Integer.parseInt(line.substring(tabAt + 1));
            } catch (NumberFormatException malformed) {
                // Not a valid count: record it and move on rather than failing the task.
                context.getCounter(COUNTER_GROUP, MALFORMED_LINES).increment(1);
                return;
            }

            // The salt is the LAST \001-delimited piece; cutting at the last separator keeps
            // words that themselves contain \001 intact. A key without any separator is
            // passed through unchanged.
            String saltedWord = line.substring(0, tabAt);
            int saltAt = saltedWord.lastIndexOf(SALT_SEPARATOR);
            String word = saltAt >= 0 ? saltedWord.substring(0, saltAt) : saltedWord;

            outKey.set(word);
            outCount.set(partialCount);
            context.write(outKey, outCount);
        }
    }

    /**
     * Sums the partial counts of one word. Also registered as the combiner in {@link #main}.
     */
    public static class SkewWordcountReducer extends Reducer<Text, IntWritable, Text, IntWritable> {

        private final IntWritable total = new IntWritable();

        /**
         * Writes {@code (key, sum of values)}.
         *
         * @throws ArithmeticException if the sum does not fit in an {@code int}
         */
        @Override
        protected void reduce(Text key, Iterable<IntWritable> values, Context context)
                throws IOException, InterruptedException {
            int sum = 0;
            for (IntWritable partial : values) {
                // Fail loudly instead of silently wrapping to a negative count.
                sum = Math.addExact(sum, partial.get());
            }
            total.set(sum);
            context.write(key, total);
        }
    }

    /**
     * Configures and runs the second (merging) job with 3 reduce tasks.
     *
     * @param args optional: {@code args[0]} input directory, {@code args[1]} output directory;
     *             missing arguments fall back to the built-in default paths
     */
    public static void main(String[] args) throws Exception {
        String inputDir = args.length > 0 ? args[0] : DEFAULT_INPUT;
        String outputDir = args.length > 1 ? args[1] : DEFAULT_OUTPUT;

        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);
        job.setJarByClass(SkewWordcount2.class);

        job.setMapperClass(SkewWordcountMapper.class);
        job.setReducerClass(SkewWordcountReducer.class);
        // Map-side local aggregation: the reducer doubles as the combiner.
        job.setCombinerClass(SkewWordcountReducer.class);

        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        FileInputFormat.setInputPaths(job, new Path(inputDir));
        FileOutputFormat.setOutputPath(job, new Path(outputDir));

        job.setNumReduceTasks(3);

        boolean succeeded = job.waitForCompletion(true);
        System.exit(succeeded ? 0 : 1);
    }
}