http://blog.csdn.net/u014432433/article/details/51104026
1. In MapReduce programming the key is typically used for grouping and sorting. When Hadoop's built-in key types cannot express what you need, or when a data type tailored to the use case would perform better, you can define your own type by implementing the org.apache.hadoop.io.WritableComparable interface and use it as the key type of a MapReduce job. 2. Custom Hadoop key types.
1. Hadoop MapReduce keys are compared with one another, and that comparison is what drives sorting.
2. Hadoop's comparable key types implement the WritableComparable<T> interface, which extends Writable and adds the compareTo() method.
compareTo() returns one of three kinds of values: a negative integer, zero, or a positive integer, meaning this object is less than, equal to, or greater than the object it is compared with.
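A quick illustration of that return-value convention, using java.lang.Long (this snippet is only for illustration and is not part of the original example):
public class CompareToDemo {
    public static void main(String[] args) {
        Long a = 100L, b = 200L;
        System.out.println(a.compareTo(b));    // negative: a is less than b
        System.out.println(a.compareTo(100L)); // 0: a equals 100
        System.out.println(b.compareTo(a));    // positive: b is greater than a
    }
}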
3. Example: first compute each phone number's uplink, downlink, and total traffic, then sort the results by total traffic.
Log file *.dat
1363157985066 13726230503 00-FD-07-A4-72-B8:CMCC 120.196.100.82 i02.c.aliimg.com 24 27 2481 24681 200
1363157995052 13826544101 5C-0E-8B-C7-F1-E0:CMCC 120.197.40.4 4 0 264 0 200
1363157991076 13926435656 20-10-7A-28-CC-0A:CMCC 120.196.100.99 2 4 132 1512 200
1363154400022 13926251106 5C-0E-8B-8B-B1-50:CMCC 120.197.40.4 4 0 240 0 200
1363157993044 18211575961 94-71-AC-CD-E6-18:CMCC-EASY 120.196.100.99 iface.qiyi.com 视频网站 15 12 1527 2106 200
1363157995074 84138413 5C-0E-8B-8C-E8-20:7DaysInn 120.197.40.4 122.72.52.12 20 16 4116 1432 200
1363157993055 13560439658 C4-17-FE-BA-DE-D9:CMCC 120.196.100.99 18 15 1116 954 200
1363157995033 15920133257 5C-0E-8B-C7-BA-20:CMCC 120.197.40.4 sug.so.360.cn 信息安全 20 20 3156 2936 200
1363157983019 13719199419 68-A1-B7-03-07-B1:CMCC-EASY 120.196.100.82 4 0 240 0 200
1363157984041 13660577991 5C-0E-8B-92-5C-20:CMCC-EASY 120.197.40.4 s19.cnzz.com 站点统计 24 9 6960 690 200
1363157973098 15013685858 5C-0E-8B-C7-F7-90:CMCC 120.197.40.4 rank.ie.sogou.com 搜索引擎 28 27 3659 3538 200
1363157986029 15989002119 E8-99-C4-4E-93-E0:CMCC-EASY 120.196.100.99 www.umeng.com 站点统计 3 3 1938 180 200
1363157992093 13560439658 C4-17-FE-BA-DE-D9:CMCC 120.196.100.99 15 9 918 4938 200
1363157986041 13480253104 5C-0E-8B-C7-FC-80:CMCC-EASY 120.197.40.4 3 3 180 180 200
1363157984040 13602846565 5C-0E-8B-8B-B6-00:CMCC 120.197.40.4 2052.flash2-http.qq.com 综合门户 15 12 1938 2910 200
1363157995093 13922314466 00-FD-07-A2-EC-BA:CMCC 120.196.100.82 img.qfc.cn 12 12 3008 3720 200
1363157982040 13502468823 5C-0A-5B-6A-0B-D4:CMCC-EASY 120.196.100.99 y0.ifengimg.com 综合门户 57 102 7335 110349 200
1363157986072 18320173382 84-25-DB-4F-10-1A:CMCC-EASY 120.196.100.99 input.shouji.sogou.com 搜索引擎 21 18 9531 2412 200
1363157990043 13925057413 00-1F-64-E1-E6-9A:CMCC 120.196.100.55 t3.baidu.com 搜索引擎 69 63 11058 48243 200
1363157988072 13760778710 00-FD-07-A4-7B-08:CMCC 120.196.100.82 2 2 120 120 200
1363157985066 13726238888 00-FD-07-A4-72-B8:CMCC 120.196.100.82 i02.c.aliimg.com 24 27 2481 24681 200
1363157993055 13560436666 C4-17-FE-BA-DE-D9:CMCC 120.196.100.99 18 15 1116 954 200
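Note that the records above do not all have the same number of columns (some have no hostname/category field), which is why the mapper shown below takes the phone number from the second field but reads the traffic counters from the end of the line rather than by fixed index. A minimal sketch of that extraction, assuming tab-separated fields as the mapper does (the sample line is the first record above):
public class FieldLayoutDemo {
    public static void main(String[] args) {
        String line = "1363157985066\t13726230503\t00-FD-07-A4-72-B8:CMCC\t120.196.100.82"
                + "\ti02.c.aliimg.com\t24\t27\t2481\t24681\t200";
        String[] fields = line.split("\t");
        System.out.println(fields[1]);                  // phone number:  13726230503
        System.out.println(fields[fields.length - 3]);  // uplink bytes:   2481
        System.out.println(fields[fields.length - 2]);  // downlink bytes: 24681
    }
}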
FlowBean.java — the custom WritableComparable type
package com.kevin.model;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import org.apache.hadoop.io.WritableComparable;
public class FlowBean implements WritableComparable<FlowBean> {
private String phoneNbr;
private long up_flow;
private long d_flow;
private long sum_flow;
public void set(String phoneNbr, long up_flow, long d_flow){
this.phoneNbr = phoneNbr;
this.up_flow = up_flow;
this.d_flow = d_flow;
this.sum_flow = up_flow + d_flow;
}
/**
* Serialization: write the data fields out as a byte stream
*/
@Override
public void write(DataOutput out) throws IOException {
out.writeUTF(this.phoneNbr);
out.writeLong(this.up_flow);
out.writeLong(this.d_flow);
out.writeLong(this.sum_flow);
}
/**
* Deserialization: read each field back from the byte stream; the read order must match the order used in write()
*/
@Override
public void readFields(DataInput in) throws IOException {
this.phoneNbr = in.readUTF();
this.up_flow = in.readLong();
this.d_flow = in.readLong();
this.sum_flow = in.readLong();
}
@Override
public int compareTo(FlowBean o) {
return Long.compare(o.getSum_flow(), this.sum_flow); // sort by total flow, descending; returns 0 when totals are equal
}
@Override
public String toString() {
return up_flow + "\t" + d_flow + "\t" + sum_flow;
}
public String getPhoneNbr() {
return phoneNbr;
}
public void setPhoneNbr(String phoneNbr) {
this.phoneNbr = phoneNbr;
}
public long getUp_flow() {
return up_flow;
}
public void setUp_flow(long up_flow) {
this.up_flow = up_flow;
}
public long getD_flow() {
return d_flow;
}
public void setD_flow(long d_flow) {
this.d_flow = d_flow;
}
public long getSum_flow() {
return sum_flow;
}
public void setSum_flow(long sum_flow) {
this.sum_flow = sum_flow;
}
}
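A minimal local sketch, not part of the original post, that round-trips a FlowBean through write()/readFields() and checks the ordering produced by compareTo():
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.IOException;
import com.kevin.model.FlowBean;

public class FlowBeanDemo {
    public static void main(String[] args) throws IOException {
        FlowBean a = new FlowBean();
        a.set("13726230503", 2481, 24681);

        // serialize to a byte array, the same way the framework moves keys and values around
        ByteArrayOutputStream buffer = new ByteArrayOutputStream();
        a.write(new DataOutputStream(buffer));

        // deserialize into a fresh bean; readFields() must consume fields in write() order
        FlowBean b = new FlowBean();
        b.readFields(new DataInputStream(new ByteArrayInputStream(buffer.toByteArray())));
        System.out.println(b); // 2481	24681	27162 (tab-separated, from toString())

        // compareTo() orders beans by sum_flow in descending order
        FlowBean c = new FlowBean();
        c.set("13826544101", 264, 0);
        System.out.println(a.compareTo(c)); // negative: a (the larger total) sorts first
    }
}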
AreaPartitioner.java — custom partitioner: extends Partitioner and overrides getPartition(), partitioning records by the first three digits of the phone number
package com.kevin.partitioner;
import java.util.HashMap;
import org.apache.hadoop.mapreduce.Partitioner;
public class AreaPartitioner<KEY, VALUE> extends Partitioner<KEY, VALUE>{
private static HashMap<String, Integer> areaMap = new HashMap<>();
static {
areaMap.put("136", 0);
areaMap.put("137", 1);
areaMap.put("138", 2);
areaMap.put("139", 3);
}
@Override
public int getPartition(KEY key, VALUE value, int numPartitions) {
Integer provinceCode = areaMap.get(key.toString().substring(0,3));
return provinceCode==null?4:provinceCode;
}
}
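A small sketch, not part of the original post, showing which partition each phone prefix is sent to (keys whose prefix is not in areaMap fall into partition 4):
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import com.kevin.partitioner.AreaPartitioner;

public class PartitionerDemo {
    public static void main(String[] args) {
        AreaPartitioner<Text, NullWritable> partitioner = new AreaPartitioner<>();
        String[] phones = {"13602846565", "13726230503", "13826544101", "13926435656", "15013685858"};
        for (String phone : phones) {
            // 5 matches job.setNumReduceTasks(5) in the FlowCount driver below
            int p = partitioner.getPartition(new Text(phone), NullWritable.get(), 5);
            System.out.println(phone + " -> partition " + p); // 0, 1, 2, 3, 4 respectively
        }
    }
}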
FlowCount.java — computes the traffic statistics per phone number
package com.kevin.mapreducedemo2;
import java.io.IOException;
import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import com.kevin.model.FlowBean;
import com.kevin.partitioner.AreaPartitioner;
// Hadoop's own serialization mechanism differs from the JDK's: it is more compact
public class FlowCount {
public static class FlowCountMapper extends Mapper<LongWritable, Text, Text, FlowBean>{
private FlowBean flowBean = new FlowBean();
@Override
protected void map(LongWritable key, Text value,Context context)throws IOException, InterruptedException {
// get one line of input
String line = value.toString();
// split the line into fields
String[] fields = StringUtils.split(line, "\t");
// pick out the fields we need
String phoneNbr = fields[1];
long up_flow = Long.parseLong(fields[fields.length - 3]);
long d_flow = Long.parseLong(fields[fields.length - 2]);
// pack the values into a FlowBean
flowBean.set(phoneNbr, up_flow, d_flow);
context.write(new Text(phoneNbr), flowBean);
}
}
public static class FlowCountReducer extends Reducer<Text, FlowBean, Text, FlowBean>{
private FlowBean flowBean = new FlowBean();
@Override
protected void reduce(Text key, Iterable<FlowBean> values,Context context) throws IOException, InterruptedException {
long up_flow_sum = 0;
long d_flow_sum = 0;
for(FlowBean bean : values){
up_flow_sum += bean.getUp_flow();
d_flow_sum += bean.getD_flow();
}
flowBean.set(key.toString(), up_flow_sum, d_flow_sum);
context.write(key, flowBean);
}
}
public static void main(String[] args) throws Exception {
Configuration conf = new Configuration();
Job job = Job.getInstance(conf,"flowjob");
job.setJarByClass(FlowCount.class);
job.setMapperClass(FlowCountMapper.class);
job.setReducerClass(FlowCountReducer.class);
/**
* Register the custom partitioner: AreaPartitioner
*/
job.setPartitionerClass(AreaPartitioner.class);
/**
* Set the number of reduce tasks; it has to match the number of partitions AreaPartitioner can return.
* If there are more reduce tasks than partitions, the extra tasks simply produce empty output files.
* If there are fewer reduce tasks than partitions, the job throws an exception, because some keys have no reduce task to receive them.
* (With a single reduce task the job still runs normally: every key goes to that one task.)
* "Reduce task" and "map task" here mean the reducer and mapper instances running on the cluster.
*/
job.setNumReduceTasks(5);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(FlowBean.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(FlowBean.class);
job.setInputFormatClass(TextInputFormat.class);
job.setOutputFormatClass(TextOutputFormat.class);
FileInputFormat.setInputPaths(job, new Path("hdfs://192.168.1.108:9000/flow_count_demo/data-files/"));
FileOutputFormat.setOutputPath(job, new Path("hdfs://192.168.1.108:9000/flow_count_demo/data-out/"));
job.waitForCompletion(true);
}
}
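Assuming the classes are packaged into a jar (the jar name below is made up), the job can be submitted with the standard hadoop jar command:
hadoop jar flow-count.jar com.kevin.mapreducedemo2.FlowCount
With 5 reduce tasks and the AreaPartitioner, data-out/ should contain part-r-00000 through part-r-00004, one file per partition. TextOutputFormat writes each record as the key, a tab, then FlowBean.toString(), i.e. phone number, up_flow, d_flow and sum_flow separated by tabs; for instance 13560439658 appears twice in the sample data above, so its line should read 13560439658 2034 5892 7926.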
FlowCountSort.java — sorts the results by total traffic
package com.kevin.mapreducedemo2;
import java.io.IOException;
import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import com.kevin.model.FlowBean;
public class FlowCountSort {
public static class FlowCountSortMapper extends Mapper<LongWritable, Text, FlowBean, NullWritable>{
private FlowBean bean = new FlowBean();
@Override
protected void map(LongWritable key, Text value,Context context)throws IOException, InterruptedException {
String line = value.toString();
String[] fields = StringUtils.split(line, "\t");
String phoneNbr = fields[0];
long up_flow = Long.parseLong(fields[1]);
long d_flow = Long.parseLong(fields[2]);
bean.set(phoneNbr, up_flow, d_flow);
context.write(bean, NullWritable.get());
}
}
public static class FlowCountSortReducer extends Reducer<FlowBean, NullWritable, Text, FlowBean>{
@Override
protected void reduce(FlowBean bean, Iterable<NullWritable> values,Context context) throws IOException, InterruptedException {
context.write(new Text(bean.getPhoneNbr()), bean);
}
}
public static void main(String[] args) throws Exception {
Configuration conf = new Configuration();
Job job = Job.getInstance(conf,"flowjob");
job.setJarByClass(FlowCountSort.class);
job.setMapperClass(FlowCountSortMapper.class);
job.setReducerClass(FlowCountSortReducer.class);
job.setMapOutputKeyClass(FlowBean.class);
job.setMapOutputValueClass(NullWritable.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(FlowBean.class);
job.setInputFormatClass(TextInputFormat.class);
job.setOutputFormatClass(TextOutputFormat.class);
FileInputFormat.setInputPaths(job, new Path("hdfs://192.168.1.108:9000/flow_count_demo/data-out/"));
FileOutputFormat.setOutputPath(job, new Path("hdfs://192.168.1.108:9000/flow_count_demo/data-out2/"));
job.waitForCompletion(true);
}
}
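Since FlowCountSort reads the directory that FlowCount writes, the two jobs can also be chained from a single driver. A minimal sketch (the class name is made up, and a real driver should check the boolean returned by waitForCompletion before starting the second job):
package com.kevin.mapreducedemo2;

public class FlowDriver {
    public static void main(String[] args) throws Exception {
        FlowCount.main(args);     // job 1: per-phone up/down/total traffic, written to data-out/
        FlowCountSort.main(args); // job 2: read data-out/ and re-sort by total traffic, descending
    }
}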