1、Partitioner的作用
Partitioner的作用主要是对map输出的数据进行分区,可以用来缓解数据倾斜的问题。
2、如果没有定义partitioner,那数据在被送达reducer前是如何被分区的?
hadoop有一个默认的分区类HashPartitioner,它通过对map输出的k2取hash值来确定这条k2,v2记录被送到哪一个reduce中去执行。
3、代码体现
/**
 * Routes each (phone, FlowBean) record to a reduce task based on the first
 * three digits of the phone number (the "province" prefix).
 *
 * Prefixes 135-139 map to partitions 0-4; any other prefix (or a key too
 * short to carry a prefix) falls into the catch-all partition 5. The job
 * must therefore be configured with 6 reduce tasks.
 */
public class ProvincePartitioner extends Partitioner<Text, FlowBean> {

	/** Partition index reserved for prefixes not present in the lookup table. */
	private static final int OTHER_PARTITION = 5;

	/** Phone-number prefix -> partition index, built once at class load. */
	private static final HashMap<String, Integer> provinceMap = new HashMap<String, Integer>();
	static {
		provinceMap.put("135", 0);
		provinceMap.put("136", 1);
		provinceMap.put("137", 2);
		provinceMap.put("138", 3);
		provinceMap.put("139", 4);
	}

	/**
	 * @param key           map output key (phone number as Text)
	 * @param value         map output value (not used for partitioning)
	 * @param numPartitions number of reduce tasks configured on the job
	 * @return partition index in the range [0, 5]
	 */
	@Override
	public int getPartition(Text key, FlowBean value, int numPartitions) {
		String phone = key.toString();
		// Guard against malformed keys shorter than the 3-digit prefix,
		// which would otherwise throw StringIndexOutOfBoundsException.
		if (phone.length() < 3) {
			return OTHER_PARTITION;
		}
		Integer partition = provinceMap.get(phone.substring(0, 3));
		// Unknown prefixes go to the catch-all bucket.
		return partition == null ? OTHER_PARTITION : partition;
	}
}
public class FlowBeanRunner {
static class FlowBeanMapper extends Mapper<LongWritable , Text , Text, FlowBean>{
protected void map(LongWritable key , Text value , Context context) throws IOException, InterruptedException{
String line = value.toString();
String[] fields = StringUtils.split(line , "\t");
String phone = fields[1];
//拿到上行流量字段值
long up_flow = Long.parseLong(fields[fields.length-3]);
//拿到下行流量字段值
long d_flow = Long.parseLong(fields[fields.length-2]);
//将上下行流量封装到flowBean中去
FlowBean flowBean = new FlowBean(up_flow, d_flow);
context.write(new Text(phone),flowBean);
}
}
static class FlowBeanReduce extends Reducer<Text, FlowBean,Text, FlowBean>{
protected void reduce (Text key , Iterable<FlowBean> values , Context context) throws IOException, InterruptedException{
long sum_upflow = 0;
long sum_dflow = 0;
for(FlowBean bean : values){
sum_upflow += bean.getUpflow();
sum_dflow += bean.getDflow();
}
FlowBean resultBean = new FlowBean(sum_upflow,sum_dflow);
context.write(key, resultBean);
}
}
public static void main(String[] args) throws Exception, IOException, InterruptedException {
Configuration conf = new Configuration();
Job job = Job.getInstance(conf);
job.setJarByClass(FlowBeanRunner.class);
job.setMapperClass(FlowBeanMapper.class);
job.setReducerClass(FlowBeanReduce.class);
//指定自定义的partitioner类,替换掉框架默认的HashPartitioner
job.setPartitionerClass(ProvincePartitioner.class);
//指定reduce task数量,跟ProvincePartitioner的分区数匹配
job.setNumReduceTasks(6);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(FlowBean.class);
//要处理的数据所在的path
//指定文件夹即可,该文件夹下的所有文件都会被处理
FileInputFormat.setInputPaths(job, new Path("/home/hadoop/Desktop/inputflow"));
//处理完得到的结果输出的path
FileOutputFormat.setOutputPath(job, new Path("/home/hadoop/Desktop/outputflow"));
job.waitForCompletion(true);
}
}