Hadoop: Partitioner Programming

The default partitioner in MapReduce is HashPartitioner. Besides this one, MapReduce ships with several other partitioner implementations. Partitioner is the base class of all partitioners; a custom partitioner must extend this class as well.
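
For reference, the default hash-based partitioning simply hashes the key and takes it modulo the number of reduce tasks. A simplified sketch of that behaviour (condensed for illustration, not code from this post) looks like this:

import org.apache.hadoop.mapreduce.Partitioner;

// Simplified sketch of hash-based partitioning: the key's hash is masked to a
// non-negative int and taken modulo the reducer count, so equal keys always
// land in the same partition.
public class SimpleHashPartitioner<K, V> extends Partitioner<K, V> {

	@Override
	public int getPartition(K key, V value, int numReduceTasks) {
		return (key.hashCode() & Integer.MAX_VALUE) % numReduceTasks;
	}
}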

1. Steps to implement partitioning:

1.1 Analyze the concrete business logic first (e.g., partitioning by region) and decide roughly how many partitions are needed;
1.2 Write a class that extends org.apache.hadoop.mapreduce.Partitioner;
1.3 Override the public int getPartition method: based on the business logic (looked up from a database or configuration), return the same partition number for keys that belong to the same partition, and different numbers for different partitions;
1.4 In the main method, set the partitioner class: job.setPartitionerClass(DataPartitioner.class);
1.5 Set the number of reducers: job.setNumReduceTasks(6).

2. Sorting: MapReduce sorts by k2 (the map output key) by default. To define a custom sort order, the object being sorted must implement the WritableComparable interface and encode the ordering rule in its compareTo method; using that object as k2 is then enough to get the custom sort (see the sketch below).
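
A minimal sketch of such a sort key follows. It assumes we want to order records by total traffic in descending order; the class name TrafficBean and its single field are illustrative and not part of the example later in this post:

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

import org.apache.hadoop.io.WritableComparable;

public class TrafficBean implements WritableComparable<TrafficBean> {

	private long totalPayLoad; // total traffic, used as the sort field

	public TrafficBean() {}

	public TrafficBean(long totalPayLoad) {
		this.totalPayLoad = totalPayLoad;
	}

	@Override
	public void write(DataOutput out) throws IOException {
		out.writeLong(totalPayLoad);
	}

	@Override
	public void readFields(DataInput in) throws IOException {
		this.totalPayLoad = in.readLong();
	}

	// compareTo defines the shuffle-phase sort order applied to k2
	@Override
	public int compareTo(TrafficBean other) {
		return Long.compare(other.totalPayLoad, this.totalPayLoad); // descending
	}
}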


Requirement analysis:

Write phone numbers into different partitions according to the carrier they belong to (determined by the number prefix).


Code example:

Entity class:

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

import org.apache.hadoop.io.Writable;

public class DataInfo implements Writable{

	private String tel;          // phone number
	private long upPayLoad;      // upload traffic
	private long downPayLoad;    // download traffic
	private long totalPayLoad;   // total traffic
	
	public DataInfo(){}
	
	public DataInfo(String tel, long upPayLoad, long downPayLoad) {
		this.tel = tel;
		this.upPayLoad = upPayLoad;
		this.downPayLoad = downPayLoad;
		this.totalPayLoad = upPayLoad + downPayLoad;
	}

	@Override
	public void write(DataOutput out) throws IOException {
		out.writeUTF(tel);
		out.writeLong(upPayLoad);
		out.writeLong(downPayLoad);
		out.writeLong(totalPayLoad);
	}

	@Override
	public void readFields(DataInput in) throws IOException {
		this.tel = in.readUTF();
		this.upPayLoad = in.readLong();
		this.downPayLoad = in.readLong();
		this.totalPayLoad = in.readLong();
		
	}

	@Override
	public String toString() {
		return upPayLoad + "\t" + downPayLoad + "\t" + totalPayLoad;
	}

	public String getTel() {
		return tel;
	}

	public void setTel(String tel) {
		this.tel = tel;
	}

	public long getUpPayLoad() {
		return upPayLoad;
	}

	public void setUpPayLoad(long upPayLoad) {
		this.upPayLoad = upPayLoad;
	}

	public long getDownPayLoad() {
		return downPayLoad;
	}

	public void setDownPayLoad(long downPayLoad) {
		this.downPayLoad = downPayLoad;
	}

	public long getTotalPayLoad() {
		return totalPayLoad;
	}

	public void setTotalPayLoad(long totalPayLoad) {
		this.totalPayLoad = totalPayLoad;
	}

}

Core class:

import java.io.IOException;
import java.util.HashMap;
import java.util.Map;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Partitioner;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class DataCount {

	public static void main(String[] args) throws Exception {
		Configuration conf = new Configuration();
		
		Job job = Job.getInstance(conf);
		
		job.setJarByClass(DataCount.class);
		
		job.setMapperClass(DCMapper.class);
		job.setMapOutputKeyClass(Text.class);
		job.setMapOutputValueClass(DataInfo.class);
		
		job.setReducerClass(DCReducer.class);
		job.setOutputKeyClass(Text.class);
		job.setOutputValueClass(DataInfo.class);
		
		FileInputFormat.setInputPaths(job, new Path(args[0]));
		
		FileOutputFormat.setOutputPath(job, new Path(args[1]));
		
		job.setPartitionerClass(DCPartitioner.class);
		
		job.setNumReduceTasks(Integer.parseInt(args[2])); // set the number of reduce tasks
		
		
		job.waitForCompletion(true);

	}
	//Map
	public static class DCMapper extends Mapper<LongWritable, Text, Text, DataInfo>{
		
		private Text k = new Text();
		
		@Override
		protected void map(LongWritable key, Text value,
				Mapper<LongWritable, Text, Text, DataInfo>.Context context)
				throws IOException, InterruptedException {
			String line = value.toString();
			String[] fields = line.split("\t");
			// the input record is tab-separated; field 1 is the phone number,
			// fields 8 and 9 hold the upload and download traffic
			String tel = fields[1];
			long up = Long.parseLong(fields[8]);
			long down = Long.parseLong(fields[9]);
			DataInfo dataInfo = new DataInfo(tel,up,down);
			k.set(tel);
			context.write(k, dataInfo);

		}
		
	}
	public static class DCReducer extends Reducer<Text, DataInfo, Text, DataInfo>{
		
		@Override
		protected void reduce(Text key, Iterable<DataInfo> values,
				Reducer<Text, DataInfo, Text, DataInfo>.Context context)
				throws IOException, InterruptedException {
			long up_sum = 0;
			long down_sum = 0;
			for(DataInfo d : values){
				up_sum += d.getUpPayLoad();
				down_sum += d.getDownPayLoad();
			}
			DataInfo dataInfo = new DataInfo("",up_sum,down_sum);
			
			context.write(key, dataInfo);
		}
		
	}
	// custom partitioner: routes keys to partitions by phone-number prefix
	public static class DCPartitioner extends  Partitioner<Text, DataInfo>{
		
		// maps phone-number prefixes to partition numbers;
		// unknown prefixes fall back to partition 0 in getPartition
		private static Map<String,Integer> provider = new HashMap<String,Integer>();
		
		static{
			provider.put("138", 1);
			provider.put("139", 1);
			provider.put("152", 2);
			provider.put("153", 2);
			provider.put("182", 3);
			provider.put("183", 3);
		}
		@Override
		public int getPartition(Text key, DataInfo value, int numPartitions) {
			// in a real job this prefix-to-partition mapping could be read from a database or a configuration file
			String tel_sub = key.toString().substring(0,3);
			Integer count = provider.get(tel_sub);
			if(count == null){
				count = 0;
			}
			return count;
		}
		
	}

}


Package the code into a jar and run it:

hadoop jar <jar path> <main class> <input path (arg 1)> <output path (arg 2)> <number of reduce tasks (arg 3)>
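
For example, assuming the job was packaged as datacount.jar with DataCount as the main class (the jar name and paths below are hypothetical):

hadoop jar datacount.jar DataCount /data/flow/input /data/flow/output 4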

Summary:

1. If the number of reducers started is smaller than the number of partitions, the job fails with an "Illegal partition" error (except in the special case of a single reducer, where the partitioner is effectively bypassed and everything goes to partition 0);

2. If the number of reducers started is larger than the number of partitions, the number of output files equals the number of reducers; the extra output files are simply empty (0 KB).
