I. Why Partition?
1. The default partitioning rule
By default, a record's partition is computed from the key's hashCode modulo the number of ReduceTasks (Hadoop's HashPartitioner). The user therefore has no control over which partition a given key lands in.
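For reference, the default partitioner is essentially the one-liner below (a minimal sketch of org.apache.hadoop.mapreduce.lib.partition.HashPartitioner; the bitmask keeps the result non-negative even when hashCode() is negative):

import org.apache.hadoop.mapreduce.Partitioner;

public class HashPartitioner<K, V> extends Partitioner<K, V> {
    @Override
    public int getPartition(K key, V value, int numReduceTasks) {
        // Mask off the sign bit so a negative hashCode cannot produce a negative partition index
        return (key.hashCode() & Integer.MAX_VALUE) % numReduceTasks;
    }
}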
2. Summary of partitioning rules
(1) If the number of ReduceTasks is greater than the number of distinct values getPartition can return, the extra ReduceTasks simply produce empty output files (part-r-000xx).
(2) If 1 < number of ReduceTasks < number of getPartition results, some records have no ReduceTask to go to and the job fails at runtime (typically with an IOException along the lines of "Illegal partition for ...").
(3) If the number of ReduceTasks is 1, then no matter how many partitions the MapTask side computes, all output goes to that single ReduceTask, so exactly one result file (part-r-00000) is produced.
(4) Partition numbers must start at zero and increase consecutively (0, 1, 2, ...).
3. Case analysis
Suppose the custom partitioner defines 5 partitions (partition numbers 0-4). Then:
(1) job.setNumReduceTasks(1); runs normally, but produces only one output file.
(2) job.setNumReduceTasks(2); fails, since 2 is between 1 and 5.
(3) job.setNumReduceTasks(6); runs normally, since 6 > 5, but produces an empty output file.
4. Steps to define a custom partitioner
(1) Write a class that extends Partitioner and overrides getPartition():
public class CustomPartitioner extends Partitioner<Text, FlowBean> {
    @Override
    public int getPartition(Text key, FlowBean value, int numPartitions) {
        // partition-control logic goes here
        … …
        return partition;
    }
}
(2) In the Job driver, register the custom Partitioner:
job.setPartitionerClass(CustomPartitioner.class);
(3) After defining the Partitioner, set a number of ReduceTasks that matches its logic:
job.setNumReduceTasks(5);
II. Hands-On Example
1. Requirement analysis
Given the data below, records must be partitioned by name so that all records with the same name are written to the same output file. The default hash-based partitioning rule clearly cannot guarantee this, so a custom Partitioner is needed.
id name course score
1 zhangsan yw 99
2 lisi yw 98
3 wangwu yw 77
4 zhangsan sx 78
5 lisi sx 94
6 wangwu sx 56
7 zhangsan yy 89
8 lisi yy 43
9 wangwu yy 23
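The partitioning plan: route zhangsan to partition 0, lisi to partition 1, and any other name to partition 2, then run three ReduceTasks so that each partition becomes its own output file. The Partitioner in the next step implements exactly this mapping.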
2. Write the Partitioner that defines the partitioning rule
package com.cjy.mr.partition;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Partitioner;

public class ProvincePartitioner extends Partitioner<Text, IntWritable> {
    @Override
    public int getPartition(Text text, IntWritable intWritable, int numPartitions) {
        // The map output key is the name
        String s = text.toString();
        // Route each name to a fixed partition
        if ("zhangsan".equals(s)) {
            return 0;
        } else if ("lisi".equals(s)) {
            return 1;
        } else {
            return 2;
        }
    }
}
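Note that this implementation ignores the numPartitions argument: the indices it returns (0-2) are only valid if the job is configured with at least 3 ReduceTasks, which is why the driver below calls job.setNumReduceTasks(3).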
3. Write the Mapper
package com.cjy.mr.partition;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;

/**
 * Mapper: reads the input and emits (name, score) pairs.
 *
 * The default FileInputFormat implementation is TextInputFormat, so input arrives
 * line by line: the key is the byte offset of the line and the value is the line text.
 */
public class PartitionMapper extends Mapper<LongWritable, Text, Text, IntWritable> {

    // Reusable output key/value objects; this is the format the reducer receives
    Text k = new Text();
    IntWritable v = new IntWritable();

    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        // Get the current line of text
        String line = value.toString();
        // Split on \t
        String[] words = line.split("\t");
        // words[1] is the name, words[3] is the score
        k.set(words[1]);
        v.set(Integer.parseInt(words[3]));
        context.write(k, v);
    }
}
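One caveat: the mapper splits on a tab character, so the input file must actually be tab-separated. If the columns were separated by spaces instead (as the table above might suggest), the split pattern would need to change, e.g. line.split("\\s+").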
4. Write the Reducer
package com.cjy.mr.partition;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;

public class PartitionReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
        // No aggregation needed: write out every (name, score) pair as-is
        for (IntWritable count : values) {
            context.write(key, count);
        }
    }
}
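Each reduce() call receives all scores for one name, and since partitioning already routed each name to its own ReduceTask, simply writing the pairs back out produces one file per name group.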
5. Write the Driver
package com.cjy.mr.partition;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;

public class PartitionDriver {
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        // 1 Get configuration info and create the job
        Configuration configuration = new Configuration();
        Job job = Job.getInstance(configuration);

        // 2 Set the jar load path
        job.setJarByClass(PartitionDriver.class);

        // 3 Set the map and reduce classes
        job.setMapperClass(PartitionMapper.class);
        job.setReducerClass(PartitionReducer.class);

        // 4 Set the map output key/value types
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);

        // 5 Set the final output key/value types
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        // 6 Set the input and output paths
        FileInputFormat.setInputPaths(job, new Path("/Users/chenjunying/Downloads/input/c2.txt"));
        FileOutputFormat.setOutputPath(job, new Path("/Users/chenjunying/Downloads/wcoutput1/"));

        // 7 Register the custom partitioner
        job.setPartitionerClass(ProvincePartitioner.class);

        // 8 Set a matching number of ReduceTasks (one per partition index 0-2)
        job.setNumReduceTasks(3);

        // 9 Submit the job
        boolean result = job.waitForCompletion(true);
        System.exit(result ? 0 : 1);
    }
}
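To run the job outside the IDE, it would be packaged into a jar and submitted roughly as follows (partition-demo.jar is a placeholder name; the hard-coded local paths above would normally be replaced with HDFS paths):

hadoop jar partition-demo.jar com.cjy.mr.partition.PartitionDriver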
6. Test results
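With three ReduceTasks, the output directory should contain three result files, one per partition. Given the sample input and the mapping in ProvincePartitioner, the expected contents are (key and value are tab-separated by default; value order within a file may vary):

part-r-00000 (partition 0, zhangsan):
zhangsan	99
zhangsan	78
zhangsan	89

part-r-00001 (partition 1, lisi):
lisi	98
lisi	94
lisi	43

part-r-00002 (partition 2, all other names, here wangwu):
wangwu	77
wangwu	56
wangwu	23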