Hadoop MapReduce Data Skew
Cause: when upstream data has to be grouped and sent downstream, the nature of the keys can leave one partition with far more data than another. This uneven distribution can make a single node run slowly or hang outright. The map side splits its tasks by input data size, so skew rarely arises there; solving the problem therefore means fixing the uneven key-based grouping that feeds the reducers.
Solutions:
1. Avoid partitioning altogether (join and aggregate on the map side, then write the output directly)
2. Use a Combiner to pre-aggregate on the map side, cutting the data volume sent to the reducers
3. Raise reduce parallelism: assign more reduce tasks so each one carries a lighter load
4. Add memory to the nodes
5. Salt the keys on the map side before writing them out, scattering the hot keys
6. Customize how keys map to partitions (see the Partitioner sketch after this list)
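Option 6 is terse as written; one common reading is a custom Partitioner that routes known-hot keys away from their default hash bucket. The sketch below is only illustrative: the class name SkewAwarePartitioner and the hard-coded hot word are hypothetical, and because it spreads a single key over several reducers, each reducer emits only a partial count that a follow-up job must merge, exactly like the salting example further down.
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Partitioner;

public class SkewAwarePartitioner extends Partitioner<Text, IntWritable> {
    private int roundRobin = 0;

    @Override
    public int getPartition(Text key, IntWritable value, int numPartitions) {
        // Hypothetical hot key; in practice the hot set would come from sampling the input.
        if ("the".equals(key.toString())) {
            // Spread the hot key round-robin over all partitions. Its counts
            // become partial and must be merged by a second aggregation job.
            return roundRobin++ % numPartitions;
        }
        // Everything else keeps the default hash-style routing.
        return (key.hashCode() & Integer.MAX_VALUE) % numPartitions;
    }
}
It would be registered on the driver with job.setPartitionerClass(SkewAwarePartitioner.class).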
Example of salting keys on the map side:
In the map method, append a random suffix to each key so the hot keys scatter across three Reducer tasks for a first round of aggregation; a second job then strips the suffix and merges the partial results, producing the word counts for a file with skewed data.
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import java.io.IOException;
import java.util.Random;
/**
 * Two-stage word count that defeats data skew by salting keys:
 * stage 1 appends a random suffix and computes partial counts,
 * stage 2 strips the suffix and merges them into final totals.
 */
public class TextDemo {
    static class TextMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
        int reduceTasks = 0;
        Text k = new Text();
        IntWritable v = new IntWritable(1);
        Random random = new Random();
        @Override
        protected void setup(Context context) throws IOException, InterruptedException {
            // Number of reduce tasks for this job; doubles as the salt range.
            reduceTasks = context.getNumReduceTasks();
        }
        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            try {
                String[] split = value.toString().split("\\s+");
                for (String word : split) {
                    // Append a random suffix in [0, reduceTasks) so a hot key
                    // is scattered across all reducers instead of hitting one.
                    String salted = word + "-" + random.nextInt(reduceTasks);
                    k.set(salted);
                    context.write(k, v);
                }
            } catch (Exception e) {
                e.printStackTrace();
            }
        }
    }
    static class TextReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
        IntWritable v = new IntWritable();
        @Override
        protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
            // Sum the 1s for each salted key to get its partial count.
            int sum = 0;
            for (IntWritable value : values) {
                sum += value.get();
            }
            v.set(sum);
            context.write(key, v);
        }
    }
    static class TextMapper2 extends Mapper<LongWritable, Text, Text, IntWritable> {
        Text k = new Text();
        IntWritable v = new IntWritable(1);
        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            try {
                // Stage-1 output lines look like "word-N<TAB>count".
                String[] split = value.toString().split("\\s+");
                // Strip the salt suffix; lastIndexOf keeps words that
                // themselves contain '-' intact.
                String word = split[0].substring(0, split[0].lastIndexOf('-'));
                k.set(word);
                v.set(Integer.parseInt(split[1]));
                context.write(k, v);
            } catch (Exception e) {
                e.printStackTrace();
            }
        }
    }
    static class TextReducer2 extends Reducer<Text, IntWritable, Text, IntWritable> {
        IntWritable v = new IntWritable();
        @Override
        protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
            // Merge the partial counts of the now-unsalted key.
            int sum = 0;
            for (IntWritable value : values) {
                sum += value.get();
            }
            v.set(sum);
            context.write(key, v);
        }
    }
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        // Stage 1: salt the keys and compute partial counts with 3 reduce tasks.
        // (The raw-input path below is an assumed example; adjust to your data.)
        Job job1 = Job.getInstance(conf, "skew-stage1");
        job1.setMapperClass(TextMapper.class);
        job1.setReducerClass(TextReducer.class);
        // When the map output types match the final output types,
        // setMapOutputKeyClass/setMapOutputValueClass can be omitted.
        job1.setOutputKeyClass(Text.class);
        job1.setOutputValueClass(IntWritable.class);
        job1.setNumReduceTasks(3);
        FileInputFormat.setInputPaths(job1, new Path("D:\\txt\\mrdata\\skew\\input"));
        FileOutputFormat.setOutputPath(job1, new Path("D:\\txt\\mrdata\\skew\\output3"));
        if (!job1.waitForCompletion(true)) {
            System.exit(1);
        }
        // Stage 2: strip the salt suffix and merge the partial counts.
        Job job2 = Job.getInstance(conf, "skew-stage2");
        job2.setMapperClass(TextMapper2.class);
        job2.setReducerClass(TextReducer2.class);
        job2.setOutputKeyClass(Text.class);
        job2.setOutputValueClass(IntWritable.class);
        FileInputFormat.setInputPaths(job2, new Path("D:\\txt\\mrdata\\skew\\output3"));
        FileOutputFormat.setOutputPath(job2, new Path("D:\\txt\\mrdata\\skew\\output5"));
        System.exit(job2.waitForCompletion(true) ? 0 : 1);
    }
}
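To trace the two-stage flow with a hypothetical hot word hello: after stage 1, output3 contains hello-0, hello-1, and hello-2, each paired with a partial count, because the salt range equals the three reduce tasks; stage 2 strips the -N suffix and sums the partials into one total per word. Reading the salt range from getNumReduceTasks() keeps the number of salted variants matched to the parallelism available.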
Using a Combiner to pre-aggregate on the map side and avoid data skew:
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import java.io.IOException;
/**
 * Word count that uses a Combiner to pre-aggregate on the map side,
 * shrinking the shuffled data volume and easing skew on the reducers.
 */
public class MRSmallFile {
    static class SmallFileMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
        Text k = new Text();
        IntWritable one = new IntWritable(1);
        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            String[] split = value.toString().split("\\s+");
            for (String word : split) {
                // Reuse the Writable instances instead of allocating one per word.
                k.set(word);
                context.write(k, one);
            }
        }
    }
    static class SmallFileCombiner extends Reducer<Text, IntWritable, Text, IntWritable> {
        IntWritable v = new IntWritable();
        @Override
        protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
            // Sum the values instead of counting them: a combiner can be fed
            // already-combined partial sums, so each value may be greater than 1.
            int sum = 0;
            for (IntWritable value : values) {
                sum += value.get();
            }
            v.set(sum);
            context.write(key, v);
        }
    }
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf, "a");
        // Mapper
        job.setMapperClass(SmallFileMapper.class);
        // Combiner: pre-aggregate on the map side before the shuffle.
        job.setCombinerClass(SmallFileCombiner.class);
        // The same summing class serves as the reducer for the final merge;
        // without it, the default identity reducer would emit one line per
        // partial count instead of one total per word.
        job.setReducerClass(SmallFileCombiner.class);
        // Number of reduce tasks:
        // - 0 makes this a map-only job: there is no shuffle, so neither the
        //   combiner nor any reducer runs, and each map task writes raw output.
        // - 1 funnels all combined partial counts into a single reducer and a
        //   single output file, which is also how small files get merged into one.
        job.setNumReduceTasks(1);
        // Final output key type
        job.setOutputKeyClass(Text.class);
        // Final output value type
        job.setOutputValueClass(IntWritable.class);
        // Input path
        FileInputFormat.setInputPaths(job, new Path("D:\\txt\\mrdata\\wordcount - 副本\\input"));
        // Output path
        FileOutputFormat.setOutputPath(job, new Path("D:\\txt\\mrdata\\wordcount - 副本\\intput10"));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
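One caveat worth stating about the combiner contract: Hadoop may run a combiner zero, one, or several times per map task, so its logic must be associative and commutative and must not assume it sees raw map output. That is why the reduce method above sums value.get() rather than counting elements; after one combiner pass, a value can already be a partial sum greater than 1.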