Sample input data (three files under g:/comp/skew):
1.txt:
hello tom1
hello tom2
hello tom3
hello tom4
hello tom5
hello tom6
hello tom7
hello tom8
hello tom9
hello tom10
2.txt:
hello tom11
hello tom12
hello tom13
hello tom14
hello tom15
hello tom16
hello tom17
hello tom18
hello tom19
hello tom20
3.txt:
hello tom21
hello tom22
hello tom23
hello tom24
hello tom25
hello tom26
hello tom27
hello tom28
hello tom29
hello tom30
First, write a MapReduce job that produces data skew, as follows:
Mapper:
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;

/**
 * Created on 2017/3/16.
 */
public class WCSkewMapper extends Mapper<LongWritable, Text, Text, IntWritable> {

    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        // Split each line on spaces and emit (word, 1) for every token.
        String[] arr = value.toString().split(" ");
        Text keyOut = new Text();
        IntWritable valueOut = new IntWritable();
        for (String s : arr) {
            keyOut.set(s);
            valueOut.set(1);
            context.write(keyOut, valueOut);
        }
    }
}
Reducer:
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;

/**
 * Created on 2017/3/16.
 */
public class WCSkewReducer extends Reducer<Text, IntWritable, Text, IntWritable> {

    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
        // Sum all counts for this key.
        int count = 0;
        for (IntWritable iw : values) {
            count = count + iw.get();
        }
        context.write(key, new IntWritable(count));
    }
}
App (driver):
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

/**
 * Created on 2017/3/16.
 */
public class WCSkewApp {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        conf.set("fs.defaultFS", "file:///");

        Job job = Job.getInstance(conf);

        // Set the job's properties.
        job.setJobName("WCSkewApp");
        job.setJarByClass(WCSkewApp.class);
        job.setInputFormatClass(TextInputFormat.class);

        // Input and output paths.
        FileInputFormat.addInputPath(job, new Path("g:/comp/skew"));
        FileOutputFormat.setOutputPath(job, new Path("g:/comp/out"));

        // Combiner class.
        job.setCombinerClass(WCSkewReducer.class);

        // Mapper and reducer classes.
        job.setMapperClass(WCSkewMapper.class);
        job.setReducerClass(WCSkewReducer.class);

        // Number of reduce tasks.
        job.setNumReduceTasks(4);

        // Key/value types.
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        job.waitForCompletion(true);
    }
}
Running this job writes skewed output under g:/comp/out/: all 30 "hello" records go to a single reduce task. The fix is described below.
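The skew comes from the default partitioner: Hadoop's HashPartitioner sends every record with a given key to the same reduce task, so all "hello" records land in one partition no matter how many reduce tasks the job has. A minimal sketch of that default behavior for our key/value types (the class name DefaultLikePartitioner is only for illustration; the real class is org.apache.hadoop.mapreduce.lib.partition.HashPartitioner):

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Partitioner;

// Illustration of the default partitioning logic: identical keys always
// hash to the same partition, so a hot key overloads a single reducer.
public class DefaultLikePartitioner extends Partitioner<Text, IntWritable> {
    @Override
    public int getPartition(Text key, IntWritable value, int numReduceTasks) {
        // "hello" always hashes to the same value, hence the same reduce task.
        return (key.hashCode() & Integer.MAX_VALUE) % numReduceTasks;
    }
}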
Solution: random partitioning.
Mapper:
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;

/**
 * Created on 2017/3/16.
 */
public class WCSkewMapper extends Mapper<LongWritable, Text, Text, IntWritable> {

    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        String[] arr = value.toString().split(" ");
        Text keyOut = new Text();
        IntWritable valueOut = new IntWritable();
        for (String s : arr) {
            keyOut.set(s);
            valueOut.set(1);
            context.write(keyOut, valueOut);
        }
    }
}
Reducer:
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;

/**
 * Created on 2017/3/16.
 */
public class WCSkewReducer extends Reducer<Text, IntWritable, Text, IntWritable> {

    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
        int count = 0;
        for (IntWritable iw : values) {
            count = count + iw.get();
        }
        context.write(key, new IntWritable(count));
    }
}
RandomPartitioner:
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Partitioner;

import java.util.Random;

/**
 * Created on 2017/3/18.
 */
public class RandomPartitioner extends Partitioner<Text, IntWritable> {

    @Override
    public int getPartition(Text text, IntWritable intWritable, int numPartitioner) {
        // Send each record to a randomly chosen reduce task,
        // so a hot key is spread across all reducers.
        return new Random().nextInt(numPartitioner);
    }
}
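A small optional refinement, not part of the original code: allocating a new Random on every getPartition call works, but it creates garbage on a hot path; a thread-local random gives the same spreading without the per-record allocation. A sketch under that assumption (the class name RandomPartitioner2 is made up here):

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Partitioner;

import java.util.concurrent.ThreadLocalRandom;

// Variant sketch: same random spreading of keys across reducers,
// but without creating a new Random object for every record.
public class RandomPartitioner2 extends Partitioner<Text, IntWritable> {
    @Override
    public int getPartition(Text key, IntWritable value, int numPartitions) {
        return ThreadLocalRandom.current().nextInt(numPartitions);
    }
}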
App (driver):
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

/**
 * Created on 2017/3/16.
 */
public class WCSkewApp {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        conf.set("fs.defaultFS", "file:///");

        Job job = Job.getInstance(conf);

        // Set the job's properties.
        job.setJobName("WCSkewApp");
        job.setJarByClass(WCSkewApp.class);
        job.setInputFormatClass(TextInputFormat.class);

        // Input and output paths.
        FileInputFormat.addInputPath(job, new Path("g:/comp/skew"));
        FileOutputFormat.setOutputPath(job, new Path("g:/comp/out"));

        // Partitioner and combiner classes.
        job.setPartitionerClass(RandomPartitioner.class);
        job.setCombinerClass(WCSkewReducer.class);

        // Mapper and reducer classes.
        job.setMapperClass(WCSkewMapper.class);
        job.setReducerClass(WCSkewReducer.class);

        // Number of reduce tasks.
        job.setNumReduceTasks(4);

        // Key/value types.
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        job.waitForCompletion(true);
    }
}
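One practical note: FileOutputFormat refuses to start a job whose output directory already exists, and g:/comp/out was already created by the first run. Before rerunning with the random partitioner, the directory has to be removed (or a fresh path used). A minimal sketch of deleting it from inside main(), assuming the local file system configured above:

import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

// Inside main(), after creating conf: remove the old output directory
// so FileOutputFormat does not fail with FileAlreadyExistsException.
Path out = new Path("g:/comp/out");
FileSystem fs = FileSystem.get(conf);
if (fs.exists(out)) {
    fs.delete(out, true); // true = delete recursively
}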
Random partitioning spreads the "hello" records across different reduce tasks for computation, but this is not the end of the story: the result at this point is not yet what we want, because the counts for "hello" are split across the different part-r-* files. What we actually want is the total count for each word, so we run a second MapReduce job over the first job's output. TextOutputFormat wrote that output as word<TAB>count lines, which is why the mapper below splits on a tab:
Mapper:
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;

/**
 * Created on 2017/3/18.
 */
public class WCSkewMapper extends Mapper<LongWritable, Text, Text, IntWritable> {

    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        // Each input line is "word<TAB>partialCount" from the first job's output.
        String[] arr = value.toString().split("\t");
        context.write(new Text(arr[0]), new IntWritable(Integer.parseInt(arr[1])));
    }
}
Reducer:
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;

/**
 * Created on 2017/3/16.
 */
public class WCSkewReducer extends Reducer<Text, IntWritable, Text, IntWritable> {

    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
        // Add up the partial counts produced by the first job.
        int count = 0;
        for (IntWritable iw : values) {
            count = count + iw.get();
        }
        context.write(key, new IntWritable(count));
    }
}
After this second job runs, "hello" appears once with its total count, which is exactly the result we wanted; the data skew problem is solved.
App (driver):
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

/**
 * Second job: aggregates the partial counts to resolve the data skew.
 */
public class WCSkewApp {
    public static void main(String[] args) throws Exception {
        // Load the configuration and use the local file system.
        Configuration conf = new Configuration();
        conf.set("fs.defaultFS", "file:///");

        // Create the job object and set its properties.
        Job job = Job.getInstance(conf);
        job.setJobName("WCSkewApp");
        job.setJarByClass(WCSkewApp.class);

        // Input format.
        job.setInputFormatClass(TextInputFormat.class);

        // Mapper and reducer classes.
        job.setMapperClass(WCSkewMapper.class);
        job.setReducerClass(WCSkewReducer.class);

        // Key/value output types for map and reduce.
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        // Input paths (output of the first job) and output path.
        FileInputFormat.addInputPath(job, new Path("g:/comp/out/part-r-00000"));
        FileInputFormat.addInputPath(job, new Path("g:/comp/out/part-r-00001"));
        FileInputFormat.addInputPath(job, new Path("g:/comp/out/part-r-00002"));
        FileInputFormat.addInputPath(job, new Path("g:/comp/out/part-r-00003"));
        FileOutputFormat.setOutputPath(job, new Path("g:/comp/out8"));

        // Number of reduce tasks.
        job.setNumReduceTasks(4);

        job.waitForCompletion(true);
    }
}
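As a side note, the four addInputPath calls can usually be collapsed into one, because FileInputFormat expands glob patterns in its input paths (worth verifying on your Hadoop version); a sketch:

// Assumption: input-path globs are expanded, so this matches
// part-r-00000 .. part-r-00003 produced by the first job.
FileInputFormat.addInputPath(job, new Path("g:/comp/out/part-r-*"));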
In the second job above, the driver's input format can also be set to KeyValueTextInputFormat instead of TextInputFormat. Its generic key/value types are <Text, Text>, so note that the mapper's input key and value are then both of type Text.
The code is as follows:
Mapper:
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;

/**
 * Created on 2017/3/18.
 */
public class WCSkewMapper extends Mapper<Text, Text, Text, IntWritable> {

    @Override
    protected void map(Text key, Text value, Context context) throws IOException, InterruptedException {
        // KeyValueTextInputFormat already splits each line into word (key) and count (value).
        context.write(key, new IntWritable(Integer.parseInt(value.toString())));
    }
}
Reducer:
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;

/**
 * Created on 2017/3/16.
 */
public class WCSkewReducer extends Reducer<Text, IntWritable, Text, IntWritable> {

    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
        int count = 0;
        for (IntWritable iw : values) {
            count = count + iw.get();
        }
        context.write(key, new IntWritable(count));
    }
}
App (driver):
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.KeyValueTextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

/**
 * Second job (KeyValueTextInputFormat variant) for resolving the data skew.
 */
public class WCSkewApp {
    public static void main(String[] args) throws Exception {
        // Load the configuration and use the local file system.
        Configuration conf = new Configuration();
        conf.set("fs.defaultFS", "file:///");

        // Create the job object and set its properties.
        Job job = Job.getInstance(conf);
        job.setJobName("WCSkewApp");
        job.setJarByClass(WCSkewApp.class);

        // Input format: splits each line into key and value at the tab.
        job.setInputFormatClass(KeyValueTextInputFormat.class);

        // Mapper and reducer classes.
        job.setMapperClass(WCSkewMapper.class);
        job.setReducerClass(WCSkewReducer.class);

        // Key/value output types for map and reduce.
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        // Input paths (output of the first job) and output path.
        FileInputFormat.addInputPath(job, new Path("g:/comp/out/part-r-00000"));
        FileInputFormat.addInputPath(job, new Path("g:/comp/out/part-r-00001"));
        FileInputFormat.addInputPath(job, new Path("g:/comp/out/part-r-00002"));
        FileInputFormat.addInputPath(job, new Path("g:/comp/out/part-r-00003"));
        FileOutputFormat.setOutputPath(job, new Path("g:/comp/out8"));

        // Number of reduce tasks.
        job.setNumReduceTasks(4);

        job.waitForCompletion(true);
    }
}
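KeyValueTextInputFormat splits each input line into key and value at the first separator, which is a tab by default; since TextOutputFormat in the first job also writes word<TAB>count, no extra configuration is needed here. If the first job had used a different separator, it could be set explicitly; a sketch, assuming the Hadoop 2.x property name:

// Only needed if the separator is not the default tab character.
conf.set("mapreduce.input.keyvaluelinerecordreader.key.value.separator", "\t");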