MapReduce编程小案例.11th—数据倾斜场景part1

MapReduce编程小案例.11th—数据倾斜场景

数据:

a a a a a a b b b a a a

a a a a c c b c a a a c

a b b c a a d d e e f f

f g a a a b a b h h g j

 

需求:

需要做wordcount

但是,会有一个问题存在:

a特别多,

负责处理a这个单词数据的reduce worker就会很累(负载不均衡,过大)

思考:应该如何处理,才能让整个数据处理过程中数据倾斜的状况得到缓解?


数据倾斜场景part1-解决方法:在map task端使用Combiner先做一次局部聚合,减少发送给reduce task的数据量。代码如下:

WordcountCombiner类实现

package cn.edu360.mr.wc;

import java.io.IOException;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

/**
 * Pre-aggregates word counts: for each key it receives, adds up all of the
 * partial counts and emits a single (word, sum) pair.
 */
public class WordcountCombiner extends Reducer<Text, IntWritable, Text, IntWritable> {

	/**
	 * Sums the partial counts of one word and writes the combined count.
	 *
	 * @param key     the word being aggregated
	 * @param values  the partial counts collected for that word
	 * @param context sink that receives the (word, sum) pair
	 * @throws IOException          if writing the pair fails
	 * @throws InterruptedException if the task is interrupted while writing
	 */
	@Override
	protected void reduce(Text key, Iterable<IntWritable> values, Context context)
			throws IOException, InterruptedException {
		int partialSum = 0;
		for (IntWritable partial : values) {
			partialSum = partialSum + partial.get();
		}

		IntWritable combined = new IntWritable(partialSum);
		context.write(key, combined);
	}

}

WordcountMapper类实现

package cn.edu360.mr.wc;

import java.io.IOException;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

/**
 * Wordcount mapper: splits every input line into words and emits (word, 1).
 *
 * <p>Type parameters:
 * <ul>
 *   <li>KEYIN: type of the key read by the map task; the starting byte offset
 *       of the line (conceptually a Long).</li>
 *   <li>VALUEIN: type of the value read by the map task; the content of the
 *       line (conceptually a String).</li>
 *   <li>KEYOUT: type of the key returned by the user-defined map method; in
 *       wordcount this is the word (conceptually a String).</li>
 *   <li>VALUEOUT: type of the value returned by the user-defined map method;
 *       in wordcount this is an integer (conceptually an Integer).</li>
 * </ul>
 *
 * <p>Data produced by map has to be serialized, shipped to reduce and
 * deserialized again. The native JDK serialization produces fairly redundant
 * data, which would make transfers inefficient, so Hadoop defines its own
 * serialization mechanism and every type moved through MapReduce must
 * implement Hadoop's serialization interface. For the common JDK types Long,
 * String, Integer and Float, Hadoop ships the wrappers LongWritable, Text,
 * IntWritable and FloatWritable.
 */
public class WordcountMapper extends Mapper<LongWritable, Text, Text, IntWritable> {

	/** Constant count of one emitted for every word occurrence; never mutated. */
	private static final IntWritable ONE = new IntWritable(1);

	/** Reused output key, so that no new Text is allocated per token. */
	private final Text outputWord = new Text();

	/**
	 * Splits one line on whitespace and emits (word, 1) for every non-empty token.
	 *
	 * @param key     offset of the line within the input (unused)
	 * @param value   content of the line
	 * @param context sink that receives the (word, 1) pairs
	 * @throws IOException          if writing a pair fails
	 * @throws InterruptedException if the task is interrupted while writing
	 */
	@Override
	protected void map(LongWritable key, Text value, Context context)
			throws IOException, InterruptedException {
		String line = value.toString();
		// Split on runs of whitespace so repeated separators do not create empty tokens.
		for (String word : line.split("\\s+")) {
			// A blank line, or a line with leading whitespace, still yields one empty
			// token; skip it so that "" is never counted as a word.
			if (word.isEmpty()) {
				continue;
			}
			outputWord.set(word);
			context.write(outputWord, ONE);
		}
	}

}

WordcountReducer类实现

package cn.edu360.mr.wc;

import java.io.IOException;
import java.util.Iterator;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

/**
 * Wordcount reducer: adds up every count received for a word and emits the
 * pair (word, total).
 */
public class WordcountReducer extends Reducer<Text, IntWritable, Text, IntWritable> {

	/**
	 * Sums the counts of one word and writes the total.
	 *
	 * @param key     the word being counted
	 * @param values  the counts received for that word
	 * @param context sink that receives the (word, total) pair
	 * @throws IOException          if writing the pair fails
	 * @throws InterruptedException if the task is interrupted while writing
	 */
	@Override
	protected void reduce(Text key, Iterable<IntWritable> values, Context context)
			throws IOException, InterruptedException {
		int total = 0;
		for (IntWritable occurrences : values) {
			total += occurrences.get();
		}

		IntWritable result = new IntWritable(total);
		context.write(key, result);
	}

}

JobSubmitterWindowsLocal类实现

package cn.edu360.mr.wc;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

/**
 * Driver that configures the wordcount job (mapper, combiner, reducer) and runs
 * it with the "local" MapReduce framework against the local file system.
 */
public class JobSubmitterWindowsLocal {

	/** Input directory used when no paths are passed on the command line. */
	private static final String DEFAULT_INPUT_DIR = "f:/mrdata/wordcount/input";

	/** Output directory used when no paths are passed on the command line. */
	private static final String DEFAULT_OUTPUT_DIR = "f:/mrdata/wordcount/output2";

	/**
	 * Builds and runs the job, then exits with 0 on success and 1 on failure.
	 *
	 * @param args optional: {@code args[0]} is the input directory and
	 *             {@code args[1]} the output directory; when fewer than two
	 *             arguments are given the built-in default paths are used
	 * @throws Exception if the job cannot be configured or executed
	 */
	public static void main(String[] args) throws Exception {
		boolean pathsGiven = args != null && args.length >= 2;
		String inputDir = pathsGiven ? args[0] : DEFAULT_INPUT_DIR;
		String outputDir = pathsGiven ? args[1] : DEFAULT_OUTPUT_DIR;

		Configuration conf = new Configuration();

		// Set the default file system explicitly to the local file system.
		conf.set("fs.defaultFS", "file:///");

		// Set explicitly where the MapReduce job is submitted: the local runner.
		conf.set("mapreduce.framework.name", "local");

		Job job = Job.getInstance(conf);

		// Locate the job jar through this driver class itself.
		job.setJarByClass(JobSubmitterWindowsLocal.class);

		job.setMapperClass(WordcountMapper.class);
		job.setReducerClass(WordcountReducer.class);

		// Types of the intermediate (map output) key/value pairs.
		job.setMapOutputKeyClass(Text.class);
		job.setMapOutputValueClass(IntWritable.class);

		// Local aggregation logic applied on the map task side.
		job.setCombinerClass(WordcountCombiner.class);

		// Types of the final (reduce output) key/value pairs.
		job.setOutputKeyClass(Text.class);
		job.setOutputValueClass(IntWritable.class);

		FileInputFormat.setInputPaths(job, new Path(inputDir));
		FileOutputFormat.setOutputPath(job, new Path(outputDir));

		job.setNumReduceTasks(3);

		boolean succeeded = job.waitForCompletion(true);

		System.exit(succeeded ? 0 : 1);
	}
}

阅读更多
文章标签: MapReduce 大数据
想对作者说点什么? 我来说一句

没有更多推荐了,返回首页

不良信息举报

MapReduce编程小案例.11th—数据倾斜场景part1

最多只允许输入30个字

加入CSDN,享受更精准的内容推荐,与500万程序员共同成长!
关闭
关闭