Big Data: Hadoop MapReduce Data Skew Problems and Solutions

Hadoop MapReduce Data Skew

Cause: when upstream data is grouped and sent downstream, differences in the key distribution can leave one partition with far more data than the others. This uneven distribution can make a single node run very slowly or even hang. On the map side, tasks are divided by input size (input splits), so skew rarely appears there; the real problem is the uneven grouping on the reduce side, and that is what has to be fixed.

Solutions:
1. Avoid partitioning altogether (do a map-side join/aggregation and write the result directly).
2. Use a Combiner to pre-aggregate on the map side and shrink the amount of data sent to the reducers.
3. Increase reduce parallelism: allocate more reduce tasks so each one handles less data (a custom Partitioner can also help isolate hot keys; see the sketch after this list).
4. Add memory to the nodes.
5. Salt (scatter) the keys on the map side before writing them out.
6. Design custom keys.
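
As a minimal sketch of options 3 and 6, the driver can raise parallelism with job.setNumReduceTasks(n) and plug in a custom Partitioner that sends known hot keys to a reducer of their own. The class below and its hot-key list are hypothetical and only illustrate the idea; in practice the hot keys would be found by sampling the input beforehand.

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Partitioner;

import java.util.Arrays;
import java.util.HashSet;
import java.util.Set;

public class HotKeyPartitioner extends Partitioner<Text, IntWritable> {
    // hypothetical hot keys, e.g. discovered by sampling the input
    private static final Set<String> HOT_KEYS =
            new HashSet<>(Arrays.asList("the", "and"));

    @Override
    public int getPartition(Text key, IntWritable value, int numPartitions) {
        if (numPartitions <= 1) {
            return 0;
        }
        // hot keys get the last partition to themselves
        if (HOT_KEYS.contains(key.toString())) {
            return numPartitions - 1;
        }
        // all other keys are hashed over the remaining partitions
        return (key.hashCode() & Integer.MAX_VALUE) % (numPartitions - 1);
    }
}

In the driver this would be enabled with job.setNumReduceTasks(4) and job.setPartitionerClass(HotKeyPartitioner.class); the counts stay correct because every occurrence of a key still goes to exactly one reducer.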

Example: salting the keys on the map side before output:

In the map method, a random number is appended to each key to scatter it; the salted keys are aggregated across three reduce tasks in a first pass, and a second pass then aggregates those partial results, producing the word counts for an input file that suffers from data skew.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;
import java.util.Random;

/**
 * Two-stage word count for skewed input: stage 1 salts each key with a random
 * suffix and pre-aggregates across several reducers; stage 2 strips the salt
 * and merges the partial counts.
 */
public class TextDemo {
    static class TextMapper extends Mapper<LongWritable, Text,Text, IntWritable> {
        int reduceTasks = 0;
        @Override
        protected void setup(Context context) throws IOException, InterruptedException {
            // get the number of reduce tasks configured for this job
            reduceTasks = context.getNumReduceTasks();
        }

        Text k = new Text();
        IntWritable v = new IntWritable(1);
        Random random = new Random();
        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            try {
                String s = value.toString();
                String[] split = s.split("\\s+");
                for (String word : split) {
                    // salt: append a random number in [0, reduceTasks) so that a hot
                    // key is spread evenly across all reduce partitions
                    String kk = word + "-" + random.nextInt(reduceTasks);
                    k.set(kk);
                    context.write(k,v);
                }
            } catch (Exception e) {
                e.printStackTrace();
            }
        }
    }
    // stage-1 reducer: counts the occurrences of each salted key (word-N);
    // every incoming value is 1, so counting the entries equals summing them
    static class TextRedcer extends Reducer<Text, IntWritable,Text, IntWritable>{
        IntWritable v = new IntWritable(1);
        @Override
        protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
            int count = 0;
            for (IntWritable value : values) {
                count++;
            }
            v.set(count);
            context.write(key,v);
        }
    }

    // stage-2 mapper: reads the stage-1 output, strips the salt from "word-N"
    // and re-emits (word, partialCount)
    static class TextMapper2 extends Mapper<LongWritable, Text,Text, IntWritable>{
        Text k = new Text();
        IntWritable v = new IntWritable(1);
        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            try {
                String s = value.toString();
                String[] split = s.split("\\s+");
                // stage-1 output lines look like "word-salt<TAB>count";
                // strip the salt appended after the last '-'
                int cut = split[0].lastIndexOf("-");
                k.set(split[0].substring(0, cut));
                v.set(Integer.parseInt(split[1]));
                context.write(k,v);
            } catch (Exception e) {
                e.printStackTrace();
            }
        }
    }
    // stage-2 reducer: sums the partial counts to get the final count per word
    static class TextRedcer2 extends Reducer<Text, IntWritable,Text, IntWritable>{
        IntWritable v = new IntWritable();
        @Override
        protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
            int sum = 0;
            for (IntWritable value : values) {
                sum+=value.get();
            }
            v.set(sum);
            context.write(key,v);
        }
    }

    public static void main(String[] args) throws Exception {

        // this driver submits only the second (merge) stage; a sketch of the
        // stage-1 driver (TextMapper/TextRedcer with three reduce tasks) follows
        // after this class
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf, "m");
        job.setMapperClass(TextMapper2.class);
        job.setReducerClass(TextRedcer2.class);
        // when the map output types match the job's final output types, these two lines can be omitted
        //job.setMapOutputKeyClass(Text.class);
        //job.setMapOutputValueClass(IntWritable.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        //job.setNumReduceTasks(3);
        FileInputFormat.setInputPaths(job,new Path("D:\\txt\\mrdata\\skew\\output3"));
        FileOutputFormat.setOutputPath(job,new Path("D:\\txt\\mrdata\\skew\\output5"));
        job.waitForCompletion(true);
    }
}
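
The main method above runs only the second (merge) job. A minimal sketch of the stage-1 driver, assuming the mapper and reducer classes defined above and a placeholder input path (only the intermediate output3 path comes from the original driver), might look like this:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class TextDemoStage1 {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf, "stage1-salted-count");
        job.setMapperClass(TextDemo.TextMapper.class);
        job.setReducerClass(TextDemo.TextRedcer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        // three reduce tasks, matching the salt range generated in TextMapper
        job.setNumReduceTasks(3);
        // hypothetical input path holding the skewed file
        FileInputFormat.setInputPaths(job, new Path("D:\\txt\\mrdata\\skew\\input"));
        // intermediate output that the stage-2 job reads
        FileOutputFormat.setOutputPath(job, new Path("D:\\txt\\mrdata\\skew\\output3"));
        job.waitForCompletion(true);
    }
}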

Example: using a Combiner for map-side local aggregation to mitigate data skew

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;

/**
 * Uses a Combiner to perform local aggregation on the map side and mitigate data skew.
 */
public class MRSmallFile {

    // Mapper: tokenizes each line and emits (word, 1)
    static class SmallFileMapper extends Mapper<LongWritable, Text,Text, IntWritable>{
        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {

            String s = value.toString();
            String[] split = s.split("\\s+");
            for (String s1 : split) {
                context.write(new Text(s1),new IntWritable(1));
            }
        }
    }

    // Combiner: partial word count performed inside each map task before the shuffle
    static class SmallFileCombier extends Reducer<Text, IntWritable,Text, IntWritable>{
        @Override
        protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
            int count = 0;
            for (IntWritable value : values) {
                // sum the values rather than just counting them: the combiner may
                // run more than once, so a value can already be a partial sum
                count += value.get();
            }
            context.write(key,new IntWritable(count));
        }
    }

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf, "a");

        // set the Mapper class
        job.setMapperClass(SmallFileMapper.class);

        // use a Combiner to pre-aggregate on the map side
        job.setCombinerClass(SmallFileCombier.class);
        //job.setReducerClass(SmallFileCombier.class);

        // number of reduce (aggregation) tasks:
        // - 0 reduce tasks (map-only job): the combiner never runs, nothing is
        //   aggregated on the reduce side, and each map task writes its own file
        // - 1 reduce task: the combiner aggregates locally within each map task,
        //   and the single reduce task merges everything into one output file
        // Since no Reducer class is set here, one reduce task is still needed to
        // collect the partial results into a single file.
        job.setNumReduceTasks(1);

        // key type of the final output
        job.setOutputKeyClass(Text.class);
        // value type of the final output
        job.setOutputValueClass(IntWritable.class);
        // input path
        FileInputFormat.setInputPaths(job,new Path("D:\\txt\\mrdata\\wordcount - 副本\\input"));
        // output path
        FileOutputFormat.setOutputPath(job,new Path("D:\\txt\\mrdata\\wordcount - 副本\\intput10"));
        // block until the job completes; the return value indicates success
        boolean b = job.waitForCompletion(true);
    }
}
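
To check that the Combiner is actually shrinking the shuffle, the job's built-in task counters can be read after it finishes. The helper below is a minimal sketch (the class name CombinerStats is hypothetical); calling CombinerStats.print(job) right after waitForCompletion(true) would show how many records went into the combiner versus how many reached the reducer.

import org.apache.hadoop.mapreduce.Counters;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.TaskCounter;

/** Hypothetical helper: prints combiner/reducer record counters for a finished job. */
public class CombinerStats {
    public static void print(Job job) throws Exception {
        Counters counters = job.getCounters();
        // records fed into the combiner vs. records it emitted:
        // a large drop means the map-side pre-aggregation is working
        long combineIn  = counters.findCounter(TaskCounter.COMBINE_INPUT_RECORDS).getValue();
        long combineOut = counters.findCounter(TaskCounter.COMBINE_OUTPUT_RECORDS).getValue();
        // records actually shuffled to and read by the reducers
        long reduceIn   = counters.findCounter(TaskCounter.REDUCE_INPUT_RECORDS).getValue();
        System.out.println("Combine input records : " + combineIn);
        System.out.println("Combine output records: " + combineOut);
        System.out.println("Reduce input records  : " + reduceIn);
    }
}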