1、Partitioner的作用
Partitioner的作用主要是对map输出的数据进行分区,可以用来缓解数据倾斜的问题。
2、如果没有定义partitioner,那数据在被送达reducer前是如何被分区的?
hadoop有一个默认的分区类HashPartitioner,它通过对map输出的k2取hash值来确定这条k2,v2记录被送到哪一个reduce中去执行。
3、代码体现
/**
 * Routes each (phone, FlowBean) record to a reduce task based on the first
 * three digits of the phone number (the "province" prefix).
 *
 * Prefixes 135-139 map to partitions 0-4; any other prefix (or a key too
 * short to carry a prefix) falls into the catch-all partition 5. The job
 * must therefore be configured with 6 reduce tasks.
 */
public class ProvincePartitioner extends Partitioner<Text, FlowBean> {

	/** Partition index reserved for prefixes not present in the lookup table. */
	private static final int OTHER_PARTITION = 5;

	/** Phone-number prefix -> partition index, built once at class load. */
	private static final HashMap<String, Integer> provinceMap = new HashMap<String, Integer>();
	static {
		provinceMap.put("135", 0);
		provinceMap.put("136", 1);
		provinceMap.put("137", 2);
		provinceMap.put("138", 3);
		provinceMap.put("139", 4);
	}

	/**
	 * @param key           map output key (phone number as Text)
	 * @param value         map output value (not used for partitioning)
	 * @param numPartitions number of reduce tasks configured on the job
	 * @return partition index in the range [0, 5]
	 */
	@Override
	public int getPartition(Text key, FlowBean value, int numPartitions) {
		String phone = key.toString();
		// Guard against malformed keys shorter than the 3-digit prefix,
		// which would otherwise throw StringIndexOutOfBoundsException.
		if (phone.length() < 3) {
			return OTHER_PARTITION;
		}
		Integer partition = provinceMap.get(phone.substring(0, 3));
		// Unknown prefixes go to the catch-all bucket.
		return partition == null ? OTHER_PARTITION : partition;
	}
}
public class FlowBeanRunner {
static class FlowBeanMapper extends Mapper<LongWritable , Text , Text, FlowBean>{
protected void map(LongWritable key , Text value , Context context) throws IOException, InterruptedException{
String line = value.toString();
String[] fields = StringUtils.split(line , "\t");
String phone = fields[1];
//拿到上行流量字段值
long up_flow = Long.parseLong(fields[fields.length-3]);
//拿到下行流量字段值
long d_flow = Long.parseLong(fields[fields.length-2]);
//将上下行流量封装到flowBean中去
FlowBean flowBean = new FlowBean(up_flow, d_flow);
context.write(new Text(phone),flowBean);
}
}
static class FlowBeanReduce extends Reducer<Text, FlowBean,Text, FlowBean>{
protected void reduce (Text key , Iterable<FlowBean> values , Context context) throws IOException, InterruptedException{
long sum_upflow = 0;
long sum_dflow = 0;
for(FlowBean bean : values){
sum_upflow += bean.getUpflow();
sum_dflow += bean.getDflow();
}
FlowBean resultBean = new FlowBean(sum_upflow,sum_dflow);
context.write(key, resultBean);
}
}
public static void main(String[] args) throws Exception, IOException, InterruptedException {
Configuration conf = new Configuration();
Job job = Job.getInstance(conf);
job.setJarByClass(FlowBeanRunner.class);
job.setMapperClass(FlowBeanMapper.class);
job.setReducerClass(FlowBeanReduce.class);
//指定自定义的partitioner类,替换掉框架默认的HashPartitioner
job.setPartitionerClass(ProvincePartitioner.class);
//指定reduce task数量,跟ProvincePartitioner的分区数匹配
job.setNumReduceTasks(6);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(FlowBean.class);
//要处理的数据所在的path
//指定文件夹即可,该文件夹下的所有文件都会被处理
FileInputFormat.setInputPaths(job, new Path("/home/hadoop/Desktop/inputflow"));
//处理完得到的结果输出的path
FileOutputFormat.setOutputPath(job, new Path("/home/hadoop/Desktop/outputflow"));
job.waitForCompletion(true);
}
}