MapReduce编程小案例.11th—数据倾斜场景part2

MapReduce编程小案例.11th—数据倾斜场景

数据:

a a a a a a b b b a a a

a a a a c c b c a a a c

a b b c a a d d e e f f

f g a a a b a b h h g j

 

需求:

需要做wordcount

但是,会有一个问题存在:

a特别多,

负责处理a这个单词数据的reduce worker就会很累(负载不均衡,过大)

思考:如何处理?会让整个数据处理过程中,数据倾斜的状况得到缓解。


思路:

key进行打散(具体方案:给key拼接一个随机字符串),以均衡reduce端worker的负载;


数据倾斜场景part2-解决代码方法:

package cn.edu360.mr.wc.skew;

import java.io.IOException;
import java.util.Random;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class SkewWordcount {
	
	
	
	public static class SkewWordcountMapper extends Mapper<LongWritable, Text, Text, IntWritable>{
		
		Random random = new Random();
		Text k = new Text();
		IntWritable v = new IntWritable(1);
		int numReduceTasks = 0;
		
		@Override
		protected void setup(Mapper<LongWritable, Text, Text, IntWritable>.Context context)
				throws IOException, InterruptedException {
			numReduceTasks = context.getNumReduceTasks();
		}
		
		@Override
		protected void map(LongWritable key, Text value, Mapper<LongWritable, Text, Text, IntWritable>.Context context)
				throws IOException, InterruptedException {
			String[] words = value.toString().split(" ");
			for (String w : words) {
				k.set(w + "\001" + random.nextInt(numReduceTasks));
				context.write(k, v);
			}
		
		}
		
	}
	
	public static class SkewWordcountReducer extends Reducer<Text, IntWritable, Text, IntWritable>{
		
		IntWritable v = new IntWritable();
		
		
		@Override
		protected void reduce(Text key, Iterable<IntWritable> values,
				Reducer<Text, IntWritable, Text, IntWritable>.Context context) throws IOException, InterruptedException {
			int count = 0;
			for (IntWritable value : values) {
				count +=value.get();
			}
			v.set(count);
			context.write(key, v);
		}
	}
	
	
	public static void main(String[] args) throws Exception {
		
		Configuration conf = new Configuration();
		
		Job job = Job.getInstance(conf);
		
		job.setJarByClass(SkewWordcount.class);
		
		job.setMapperClass(SkewWordcountMapper.class);
		job.setReducerClass(SkewWordcountReducer.class);
		
		job.setMapOutputKeyClass(Text.class);
		job.setMapOutputValueClass(IntWritable.class);
		job.setOutputKeyClass(Text.class);
		job.setOutputValueClass(IntWritable.class);
		
		FileInputFormat.setInputPaths(job, new Path("f://mrdata/wordcount/input"));
		FileOutputFormat.setOutputPath(job, new Path("f://mrdata/wordcount/skew-out"));
		
		job.setNumReduceTasks(3);
		
		boolean res = job.waitForCompletion(true);
		
		System.exit(res?0:1);
		
	}
	

}
package cn.edu360.mr.wc.skew;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class SkewWordcount2 {
	
	public static class SkewWordcount2Mapper extends Mapper<LongWritable, Text, Text, IntWritable>{
		
		Text k = new Text();
		IntWritable v = new IntWritable();
		
		@Override
		protected void map(LongWritable key, Text value, Mapper<LongWritable, Text, Text, IntWritable>.Context context)
				throws IOException, InterruptedException {
			String[] wordAndCount = value.toString().split("\t");
			v.set(Integer.parseInt(wordAndCount[1]));
			k.set(wordAndCount[0].split("\001")[0]);
			context.write(k, v);
		}

	}
	
	
	public static class SkewWordcount2Reducer extends Reducer<Text, IntWritable, Text, IntWritable> {
		IntWritable v = new IntWritable();

		@Override
		protected void reduce(Text key, Iterable<IntWritable> values, Context context)
				throws IOException, InterruptedException {
			int count = 0;
			for (IntWritable value : values) {
				count += value.get();
			}
			v.set(count);
			context.write(key, v);
		}

	}
	
	
	public static void main(String[] args) throws Exception {

		
		Configuration conf = new Configuration();

		Job job = Job.getInstance(conf);
		
		job.setJarByClass(SkewWordcount2.class);
		
		job.setMapperClass(SkewWordcount2Mapper.class);
		job.setReducerClass(SkewWordcount2Reducer.class);
		
		job.setMapOutputKeyClass(Text.class);
		job.setMapOutputValueClass(IntWritable.class);
		
		
		
		job.setOutputKeyClass(Text.class);
		job.setOutputValueClass(IntWritable.class);
		
		FileInputFormat.setInputPaths(job, new Path("f:/mrdata/wordcount/skew-out"));
		FileOutputFormat.setOutputPath(job, new Path("f:/mrdata/wordcount/skew-out2"));
		
		job.setNumReduceTasks(3);
		
		boolean res = job.waitForCompletion(true);
		System.exit(res?0:1);
	
	}	

}


阅读更多
文章标签: MapReduce 大数据
想对作者说点什么? 我来说一句

没有更多推荐了,返回首页

不良信息举报

MapReduce编程小案例.11th—数据倾斜场景part2

最多只允许输入30个字

加入CSDN,享受更精准的内容推荐,与500万程序员共同成长!
关闭
关闭