wordcount为例
以MR程序的WC为例 如果处理的数据只有四个单词 a b c d
a 和 c 出现的次数特别多
(a/c.hashCode % 2 = 1) --> 分区1 数据多
b 和 d 出现的次数很少 (b/d.hashCode%2=0) -->分区0 少
a c -->reduce1 a: iterator<1,1,1,1,1,1,1,1,1,1,1,1,1,1> 时间久(排序 内存)
b d -->reduce0 b: iterator<1,1> 时间短
现象:数据倾斜导致整个 job 执行很久,甚至无法执行完毕
解决方案:将 key 打散(代价是会拉长 MapReduce 的任务链,需要两个 job)
/**
 * Stage 1 of a two-job data-skew mitigation for word count.
 *
 * Hot keys are scattered by appending a random suffix to each word
 * ("word" -> "word-N", N in [0, numReduceTasks)), so the counts of a
 * single hot word are spread evenly across all reducers instead of
 * overloading one. Stage 2 ({@code Skew2}) strips the suffix and merges
 * the partial counts back into per-word totals.
 */
public class Skew {

    /** Splits each input line into words and emits ("word-&lt;random&gt;", 1). */
    static class SkewMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
        // Number of reduce tasks; upper bound (exclusive) for the random
        // suffix so every suffixed key still targets a valid reducer.
        int numReduceTasks = 0;
        Random r = new Random();
        // Reused output key/value objects (standard Hadoop allocation pattern).
        Text k = new Text();
        IntWritable v = new IntWritable(1);

        @Override
        protected void setup(Context context) throws IOException, InterruptedException {
            // Read the configured reduce-task count once per mapper.
            numReduceTasks = context.getNumReduceTasks();
        }

        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            String[] words = value.toString().split("\\s+");
            for (String word : words) {
                if (word.isEmpty()) {
                    continue; // split("\\s+") yields "" for lines with leading whitespace
                }
                // Random suffix in [0, numReduceTasks) spreads one word's
                // records across all reducers.
                int i = r.nextInt(numReduceTasks);
                k.set(word + "-" + i);
                context.write(k, v);
            }
        }
    }

    /** Sums the partial counts for each suffixed key. */
    static class SkewReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
        IntWritable v = new IntWritable();

        @Override
        protected void reduce(Text key, Iterable<IntWritable> values, Context context)
                throws IOException, InterruptedException {
            // Sum the emitted values rather than counting iterations:
            // iteration-counting silently breaks if a combiner pre-aggregates.
            int count = 0;
            for (IntWritable value : values) {
                count += value.get();
            }
            v.set(count);
            context.write(key, v);
        }
    }

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf, "skew");
        // Required so the framework can locate the job jar on a cluster.
        job.setJarByClass(Skew.class);
        // Map/reduce logic classes.
        job.setMapperClass(SkewMapper.class);
        job.setReducerClass(SkewReducer.class);
        // Output key/value classes for both map and reduce stages.
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        job.setNumReduceTasks(2);
        // Paths may be overridden on the command line; defaults preserve the
        // original local-Windows behavior.
        String input = args.length > 0 ? args[0] : "D:\\MR\\skew\\input";
        String output = args.length > 1 ? args[1] : "D:\\MR\\skew\\output";
        FileInputFormat.setInputPaths(job, new Path(input));
        FileOutputFormat.setOutputPath(job, new Path(output));
        // Propagate job success/failure as the process exit code.
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
/**
 * Stage 2 of the data-skew mitigation: reads the "word-N&lt;TAB&gt;count"
 * output of the {@code Skew} job, strips the random suffix to recover the
 * original word, and sums the partial counts per word.
 */
public class Skew2 {

    /** Parses "word-N&lt;TAB&gt;count" lines and emits (word, partialCount). */
    static class Skew2Mapper extends Mapper<LongWritable, Text, Text, IntWritable> {
        Text k = new Text();
        IntWritable v = new IntWritable();

        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            // Input line example: "a-1\t896" — suffixed key, then count.
            String line = value.toString();
            String[] parts = line.split("\\s+");
            if (parts.length < 2) {
                return; // skip blank/malformed lines instead of crashing the task
            }
            String suffixedWord = parts[0];
            // Strip at the LAST '-': the random suffix was appended by Skew,
            // so words that themselves contain '-' are preserved intact.
            int dash = suffixedWord.lastIndexOf('-');
            String word = dash >= 0 ? suffixedWord.substring(0, dash) : suffixedWord;
            int count = Integer.parseInt(parts[1]);
            k.set(word);
            v.set(count);
            context.write(k, v);
        }
    }

    /** Adds up all partial counts for one word. */
    static class Skew2Reducer extends Reducer<Text, IntWritable, Text, IntWritable> {
        IntWritable v = new IntWritable();

        @Override
        protected void reduce(Text key, Iterable<IntWritable> values, Context context)
                throws IOException, InterruptedException {
            // Merge the per-suffix partial counts into the word's total.
            int count = 0;
            for (IntWritable value : values) {
                count += value.get();
            }
            v.set(count);
            context.write(key, v);
        }
    }

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf, "skew");
        // Required so the framework can locate the job jar on a cluster.
        job.setJarByClass(Skew2.class);
        // Map/reduce logic classes.
        job.setMapperClass(Skew2Mapper.class);
        job.setReducerClass(Skew2Reducer.class);
        // Output key/value classes for both map and reduce stages.
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        // Paths may be overridden on the command line; the default input is
        // the default output directory of the Skew job.
        String input = args.length > 0 ? args[0] : "D:\\MR\\skew\\output";
        String output = args.length > 1 ? args[1] : "D:\\MR\\skew\\output2";
        FileInputFormat.setInputPaths(job, new Path(input));
        FileOutputFormat.setOutputPath(job, new Path(output));
        // Propagate job success/failure as the process exit code.
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}