Data skew: the bulk of the records flood into one or a few reduce tasks, leaving the remaining reducers largely idle.
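Why this happens: Hadoop's default HashPartitioner routes every occurrence of a key to the same reduce task, computed as (key.hashCode() & Integer.MAX_VALUE) % numReduceTasks, so a hot key piles its entire volume onto one partition. A minimal local sketch of that assignment (the sample words are made up):

import org.apache.hadoop.io.Text;

public class HashSkewDemo {
    public static void main(String[] args) {
        int numReduceTasks = 3;
        // "hello" stands in for a hypothetical hot key.
        String[] words = {"hello", "hello", "hello", "world", "hi"};
        for (String w : words) {
            // Same formula Hadoop's HashPartitioner uses: every "hello"
            // lands on the same reduce task.
            int partition = (new Text(w).hashCode() & Integer.MAX_VALUE) % numReduceTasks;
            System.out.println(w + " -> reduce " + partition);
        }
    }
}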
Solution 2 for data skew: a custom partitioner plus a second job. The first job scatters records across the reducers at random and produces partial counts; the second job re-aggregates those partial counts with ordinary hash partitioning. The walkthrough below uses word count as the example:
1. DataLeanMapper1

package hadoop.lean.partitioner;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;

/**
 * DataLeanMapper1: standard word-count mapper for the first job.
 */
public class DataLeanMapper1 extends Mapper<LongWritable, Text, Text, IntWritable> {

    /**
     * Called once per input line: split on spaces and emit (word, 1).
     */
    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        String line = value.toString();
        String[] arr = line.split(" ");
        Text keyOut = new Text();
        IntWritable valueOut = new IntWritable(1);
        for (String word : arr) {
            keyOut.set(word);
            context.write(keyOut, valueOut);
        }
    }
}
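For example, a hypothetical input line "hello world hello" makes this mapper emit (hello,1), (world,1), (hello,1). Nothing skew-specific happens here; the scattering is done by the RandomPartitioner registered in step 4.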
2. DataLeanMapper2

package hadoop.lean.partitioner;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;

/**
 * DataLeanMapper2: mapper for the second job. With KeyValueTextInputFormat,
 * the key is a word and the value is a partial count from the first job.
 */
public class DataLeanMapper2 extends Mapper<Text, Text, Text, IntWritable> {

    /**
     * Called once per input line: forward (word, partialCount) unchanged.
     */
    @Override
    protected void map(Text key, Text value, Context context)
            throws IOException, InterruptedException {
        context.write(key, new IntWritable(Integer.parseInt(value.toString())));
    }
}
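KeyValueTextInputFormat turns each line of the first job's output back into a (key, value) pair by splitting at the first tab character, which is exactly the separator the first job's reducer wrote between word and count. A minimal sketch of that split on a made-up intermediate line:

public class KvSplitDemo {
    public static void main(String[] args) {
        // A hypothetical line from the first job's output: word, tab, partial count.
        String line = "hello\t3";
        int sep = line.indexOf('\t');
        System.out.println("key   = " + line.substring(0, sep));  // hello
        System.out.println("value = " + line.substring(sep + 1)); // 3
    }
}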
3. DataLeanReducer1

package hadoop.lean.partitioner;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;

/**
 * DataLeanReducer1: sums the counts for each key; reused by both jobs.
 */
public class DataLeanReducer1 extends Reducer<Text, IntWritable, Text, IntWritable> {

    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Context context)
            throws IOException, InterruptedException {
        int count = 0;
        for (IntWritable iw : values) {
            count += iw.get();
        }
        context.write(key, new IntWritable(count));
    }
}
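Because this summing logic is commutative and associative, the same class could also serve as a combiner. The original driver does not register one, but adding the line below to either job in App would pre-aggregate map output locally and shrink the shuffled volume (an optional tweak, not part of the original code):

// Optional in App: combine partial sums on the map side before the shuffle.
job.setCombinerClass(DataLeanReducer1.class);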
4. RandomPartitioner (random partitioning)

package hadoop.lean.partitioner;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Partitioner;

import java.util.Random;

/**
 * RandomPartitioner: sends each record to a uniformly random reduce task,
 * ignoring the key entirely.
 */
public class RandomPartitioner extends Partitioner<Text, IntWritable> {

    private final Random r = new Random();

    @Override
    public int getPartition(Text text, IntWritable intWritable, int numPartitions) {
        return r.nextInt(numPartitions);
    }
}
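The trade-off of ignoring the key: occurrences of the same word now land on different reducers, so the first job produces up to numReduceTasks partial counts per word, and the second job is needed to merge them. A quick local check (hypothetical hot key, no cluster required):

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;

public class RandomPartitionDemo {
    public static void main(String[] args) {
        RandomPartitioner p = new RandomPartitioner();
        Text hot = new Text("hello"); // hypothetical hot key
        // The same key can land on a different partition on every call,
        // which is exactly why the partial counts must be re-merged in job 2.
        for (int i = 0; i < 5; i++) {
            System.out.println("hello -> reduce " + p.getPartition(hot, new IntWritable(1), 3));
        }
    }
}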
5. App
package hadoop.lean.partitioner;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.KeyValueTextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.partition.HashPartitioner;

/**
 * App: driver for the two-job data-skew solution with a custom partitioner.
 */
public class App {
    public static void main(String[] args) throws Exception {
        // Hard-coded paths for local testing: input, first-job output, final output.
        args = new String[]{"d:/java/mr/data/1.txt", "d:/java/mr/out1", "d:/java/mr/out2"};

        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf);
        // Remove stale output directories so the jobs do not fail on startup.
        if (fs.exists(new Path(args[1]))) {
            fs.delete(new Path(args[1]), true);
        }
        if (fs.exists(new Path(args[2]))) {
            fs.delete(new Path(args[2]), true);
        }

        // First job: scatter words randomly and produce partial counts.
        Job job = Job.getInstance(conf);
        job.setJobName("WordCount-1");
        job.setJarByClass(App.class);
        job.setMapperClass(DataLeanMapper1.class);
        job.setReducerClass(DataLeanReducer1.class);

        // Input and output paths for the first job.
        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        // Random partitioning spreads the hot keys across all reducers.
        job.setPartitionerClass(RandomPartitioner.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        job.setNumReduceTasks(3);

        // Second job runs only if the first one succeeds.
        if (job.waitForCompletion(true)) {
            job = Job.getInstance(conf);
            job.setJobName("WordCount-2");
            job.setJarByClass(App.class);
            job.setMapperClass(DataLeanMapper2.class);
            job.setReducerClass(DataLeanReducer1.class);

            // The first job's output is the second job's input.
            FileInputFormat.addInputPath(job, new Path(args[1]));
            FileOutputFormat.setOutputPath(job, new Path(args[2]));

            // The first job wrote "word \t count" lines, so read them back
            // as key-value pairs rather than raw text.
            job.setInputFormatClass(KeyValueTextInputFormat.class);

            // Hash partitioning now brings all partial counts of a word
            // to the same reducer for the final sum.
            job.setPartitionerClass(HashPartitioner.class);
            job.setOutputKeyClass(Text.class);
            job.setOutputValueClass(IntWritable.class);
            job.setNumReduceTasks(3);
            job.waitForCompletion(true);
        }
    }
}
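To make the data flow concrete, here is a hypothetical trace (made-up input; the actual spread varies from run to run because the first job's partitions are chosen at random). With 1.txt containing the single line "hello hello hello world":

First job (RandomPartitioner, 3 reduce tasks) — partial counts scattered over the files in d:/java/mr/out1:
    part-r-00000: hello 2
    part-r-00001: hello 1
    part-r-00002: world 1

Second job (HashPartitioner over out1) — all partial counts of a word meet on one reducer and are summed into d:/java/mr/out2:
    hello 3
    world 1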