pom.xml
<properties>
    <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
    <hadoop.version>2.7.3</hadoop.version>
</properties>
<dependencies>
    <dependency>
        <groupId>org.apache.hadoop</groupId>
        <artifactId>hadoop-client</artifactId>
        <version>${hadoop.version}</version>
    </dependency>
    <dependency>
        <groupId>org.apache.hadoop</groupId>
        <artifactId>hadoop-common</artifactId>
        <version>${hadoop.version}</version>
    </dependency>
    <dependency>
        <groupId>org.apache.hadoop</groupId>
        <artifactId>hadoop-hdfs</artifactId>
        <version>${hadoop.version}</version>
    </dependency>
    <dependency>
        <groupId>org.apache.hadoop</groupId>
        <artifactId>hadoop-mapreduce-client-core</artifactId>
        <version>${hadoop.version}</version>
    </dependency>
</dependencies>
Traffic statistics (flow count)
Sample input data; the mapper below uses the first field (the phone number) and the third- and second-to-last fields (the upstream and downstream byte counts):
13726230503 00-FD-07-A4-72-B8:CMCC 120.196.100.82 i02.c.aliimg.com 24 27 2481 24681 200
13826544101 5C-0E-8B-C7-F1-E0:CMCC 120.197.40.4 4 0 264 0 200
13926435656 20-10-7A-28-CC-0A:CMCC 120.196.100.99 2 4 132 1512 200
13926251106 5C-0E-8B-8B-B1-50:CMCC 120.197.40.4 4 0 240 0 200
18211575961 94-71-AC-CD-E6-18:CMCC-EASY 120.196.100.99 iface.qiyi.com 视频网站 15 12 1527 2106 200
84138413 5C-0E-8B-8C-E8-20:7DaysInn 120.197.40.4 122.72.52.12 20 16 4116 1432 200
13560439658 C4-17-FE-BA-DE-D9:CMCC 120.196.100.99 18 15 1116 954 200
15920133257 5C-0E-8B-C7-BA-20:CMCC 120.197.40.4 sug.so.360.cn 信息安全 20 20 3156 2936 200
13719199419 68-A1-B7-03-07-B1:CMCC-EASY 120.196.100.82 4 0 240 0 200
13660577991 5C-0E-8B-92-5C-20:CMCC-EASY 120.197.40.4 s19.cnzz.com 站点统计 24 9 6960 690 200
15013685858 5C-0E-8B-C7-F7-90:CMCC 120.197.40.4 rank.ie.sogou.com 搜索引擎 28 27 3659 3538 200
15989002119 E8-99-C4-4E-93-E0:CMCC-EASY 120.196.100.99 www.umeng.com 站点统计 3 3 1938 180 200
13560439658 C4-17-FE-BA-DE-D9:CMCC 120.196.100.99 15 9 918 4938 200
13480253104 5C-0E-8B-C7-FC-80:CMCC-EASY 120.197.40.4 3 3 180 180 200
13602846565 5C-0E-8B-8B-B6-00:CMCC 120.197.40.4 2052.flash2-http.qq.com 综合门户 15 12 1938 2910 200
13922314466 00-FD-07-A2-EC-BA:CMCC 120.196.100.82 img.qfc.cn 12 12 3008 3720 200
13502468823 5C-0A-5B-6A-0B-D4:CMCC-EASY 120.196.100.99 y0.ifengimg.com 综合门户 57 102 7335 110349 200
18320173382 84-25-DB-4F-10-1A:CMCC-EASY 120.196.100.99 input.shouji.sogou.com 搜索引擎 21 18 9531 2412 200
13925057413 00-1F-64-E1-E6-9A:CMCC 120.196.100.55 t3.baidu.com 搜索引擎 69 63 11058 48243 200
13760778710 00-FD-07-A4-7B-08:CMCC 120.196.100.82 2 2 120 120 200
13823070001 20-7C-8F-70-68-1F:CMCC 120.196.100.99 6 3 360 180 200
13600217502 00-1F-64-E2-E8-B1:CMCC 120.196.100.55 18 138 1080 186852 200
In MapReduce programming, when a JavaBean is used as the key or value type of a map or reduce input/output, it must use Hadoop's serialization mechanism.
Hadoop's serialization mechanism
Compared with the JDK's built-in serialization, it is more compact: only the object's data fields are transferred, not extra information such as the inheritance hierarchy.
For a custom data type to be passed around a Hadoop cluster, it must implement Hadoop's serialization interface Writable or WritableComparable<T>.
After a custom bean implements the Writable interface, two methods must be implemented:
public void write(DataOutput out) throws IOException ---- serialization: write the fields to the byte stream
and
public void readFields(DataInput in) throws IOException ---- deserialization: read the fields back from the byte stream
Note:
The order and types used when reading must match exactly the order and types used when writing (a small round-trip check follows the FlowBean code below).
FlowBean.java
package com.test.hadoop.mr.flowcount;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

import org.apache.hadoop.io.WritableComparable;

public class FlowBean implements WritableComparable<FlowBean> {

    private String phoneNbr;
    private long up_flow;
    private long d_flow;
    private long sum_flow;

    // a no-arg constructor is required so Hadoop can create the bean by reflection during deserialization
    public FlowBean() {
    }

    public void set(String phoneNbr, long up_flow, long d_flow) {
        this.phoneNbr = phoneNbr;
        this.up_flow = up_flow;
        this.d_flow = d_flow;
        this.sum_flow = up_flow + d_flow;
    }

    public String getPhoneNbr() {
        return phoneNbr;
    }

    public void setPhoneNbr(String phoneNbr) {
        this.phoneNbr = phoneNbr;
    }

    public long getUp_flow() {
        return up_flow;
    }

    public void setUp_flow(long up_flow) {
        this.up_flow = up_flow;
    }

    public long getD_flow() {
        return d_flow;
    }

    public void setD_flow(long d_flow) {
        this.d_flow = d_flow;
    }

    public long getSum_flow() {
        return sum_flow;
    }

    public void setSum_flow(long sum_flow) {
        this.sum_flow = sum_flow;
    }

    /**
     * Serialization: write the data fields out to the byte stream.
     */
    @Override
    public void write(DataOutput output) throws IOException {
        output.writeUTF(phoneNbr);
        output.writeLong(up_flow);
        output.writeLong(d_flow);
        output.writeLong(sum_flow);
    }

    /**
     * Deserialization: read the data fields back from the byte stream.
     * The read order must match the order in which the fields were written.
     */
    @Override
    public void readFields(DataInput input) throws IOException {
        phoneNbr = input.readUTF();
        up_flow = input.readLong();
        d_flow = input.readLong();
        sum_flow = input.readLong();
    }

    @Override
    public String toString() {
        return up_flow + "\t" + d_flow + "\t" + sum_flow;
    }

    @Override
    public int compareTo(FlowBean o) {
        // sort by total flow in descending order; break ties on the phone number so that
        // compareTo never reports two different records as equal
        int cmp = Long.compare(o.getSum_flow(), this.sum_flow);
        return cmp != 0 ? cmp : this.phoneNbr.compareTo(o.getPhoneNbr());
    }
}
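To convince yourself that write and readFields agree on the field order, the bean can be round-tripped through an in-memory byte stream. The following is a minimal local sketch, not part of the job; the class name SerializationCheck is made up for illustration, and the values are taken from the first sample record above.
SerializationCheck.java
package com.test.hadoop.mr.flowcount;

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.IOException;

public class SerializationCheck {
    public static void main(String[] args) throws IOException {
        FlowBean original = new FlowBean();
        original.set("13726230503", 2481, 24681);

        // serialize: write the bean's fields into an in-memory byte stream
        ByteArrayOutputStream bytes = new ByteArrayOutputStream();
        original.write(new DataOutputStream(bytes));

        // deserialize: read the fields back in the same order they were written
        FlowBean copy = new FlowBean();
        copy.readFields(new DataInputStream(new ByteArrayInputStream(bytes.toByteArray())));

        // prints the same phone number and "2481  24681   27162" twice if the round trip worked
        System.out.println(original.getPhoneNbr() + "\t" + original);
        System.out.println(copy.getPhoneNbr() + "\t" + copy);
    }
}
FlowCount.java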
package com.test.hadoop.mr.flowcount;

import java.io.IOException;

import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;

public class FlowCount {

    public static class FlowCountMapper extends Mapper<LongWritable, Text, Text, FlowBean> {

        private FlowBean flowBean = new FlowBean();

        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            try {
                // take one line of input
                String line = value.toString();
                // split it into fields
                String[] fields = StringUtils.split(line, "\t");
                // pick out the fields we need
                String phoneNbr = fields[0];
                long up_flow = Long.parseLong(fields[fields.length - 3]);
                long d_flow = Long.parseLong(fields[fields.length - 2]);
                // wrap the data in a FlowBean
                flowBean.set(phoneNbr, up_flow, d_flow);
                // emit the flow data with the phone number as the key
                context.write(new Text(phoneNbr), flowBean);
            } catch (Exception e) {
                e.printStackTrace();
            }
        }
    }

    public static class FlowCountReducer extends Reducer<Text, FlowBean, Text, FlowBean> {

        private FlowBean flowBean = new FlowBean();

        @Override
        protected void reduce(Text key, Iterable<FlowBean> values, Context context)
                throws IOException, InterruptedException {
            long up_flow_sum = 0;
            long d_flow_sum = 0;
            for (FlowBean bean : values) {
                up_flow_sum += bean.getUp_flow();
                d_flow_sum += bean.getD_flow();
            }
            flowBean.set(key.toString(), up_flow_sum, d_flow_sum);
            context.write(key, flowBean);
        }
    }

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);
        job.setJarByClass(FlowCount.class);
        job.setMapperClass(FlowCountMapper.class);
        job.setReducerClass(FlowCountReducer.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(FlowBean.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(FlowBean.class);
        job.setInputFormatClass(TextInputFormat.class);
        job.setOutputFormatClass(TextOutputFormat.class);
        FileInputFormat.setInputPaths(job, new Path("D:/BaiduYunDownload/flow/srcData"));
        FileOutputFormat.setOutputPath(job, new Path("D:/BaiduYunDownload/flow/out"));
        job.waitForCompletion(true);
    }
}
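With TextOutputFormat, every output line is the reduce output key followed by a tab and FlowBean.toString(), i.e. phone, up_flow, d_flow, sum_flow separated by tabs. As a rough illustration, the single sample record for 13726230503 above would come out as:
13726230503	2481	24681	27162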
Custom sorting
Hadoop's sorting is done during the shuffle phase.
The sort is based on the key emitted by the map.
To implement a custom sort, the data to be sorted must therefore be packed into the key, and the key type must implement the WritableComparable interface.
Sorting by total flow
The output of the flow-count job above is used as the input of the sorting job.
FlowBean already implements the compareTo method (descending by sum_flow), so it can be used directly as the map output key.
FlowCountSort.java
package com.test.hadoop.mr.flowcount;

import java.io.IOException;

import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;

public class FlowCountSort {

    public static class FlowCountSortMapper extends Mapper<LongWritable, Text, FlowBean, NullWritable> {

        FlowBean flowBean = new FlowBean();

        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            try {
                // input lines are the flow-count output: phone, up_flow, d_flow, sum_flow
                String line = value.toString();
                String[] fields = StringUtils.split(line, "\t");
                String phoneNbr = fields[0];
                long up_flow = Long.parseLong(fields[1]);
                long d_flow = Long.parseLong(fields[2]);
                // the whole bean becomes the map output key, so the shuffle sorts records by FlowBean.compareTo
                flowBean.set(phoneNbr, up_flow, d_flow);
                context.write(flowBean, NullWritable.get());
            } catch (Exception e) {
                e.printStackTrace();
            }
        }
    }

    public static class FlowCountSortReducer extends Reducer<FlowBean, NullWritable, Text, FlowBean> {

        @Override
        protected void reduce(FlowBean bean, Iterable<NullWritable> values, Context context)
                throws IOException, InterruptedException {
            // keys arrive already sorted; unpack the bean back into phone-number / flow form
            context.write(new Text(bean.getPhoneNbr()), bean);
        }
    }

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);
        job.setJarByClass(FlowCountSort.class);
        job.setMapperClass(FlowCountSortMapper.class);
        job.setReducerClass(FlowCountSortReducer.class);
        job.setMapOutputKeyClass(FlowBean.class);
        job.setMapOutputValueClass(NullWritable.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(FlowBean.class);
        job.setInputFormatClass(TextInputFormat.class);
        job.setOutputFormatClass(TextOutputFormat.class);
        FileInputFormat.setInputPaths(job, new Path("D:/BaiduYunDownload/flow/out"));
        FileOutputFormat.setOutputPath(job, new Path("D:/BaiduYunDownload/flow/out2"));
        job.waitForCompletion(true);
    }
}
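Because the ordering lives entirely in FlowBean.compareTo, the descending sort can be sanity-checked locally without running a job. A minimal sketch; the class name SortCheck and the flow values are made up for illustration:
SortCheck.java
package com.test.hadoop.mr.flowcount;

import java.util.Arrays;

public class SortCheck {
    public static void main(String[] args) {
        FlowBean a = new FlowBean();
        a.set("13726230503", 100, 200);   // sum_flow = 300
        FlowBean b = new FlowBean();
        b.set("13826544101", 400, 100);   // sum_flow = 500
        FlowBean c = new FlowBean();
        c.set("13926435656", 50, 50);     // sum_flow = 100

        FlowBean[] beans = { a, b, c };
        // Arrays.sort uses compareTo, so the result is ordered by sum_flow, largest first
        Arrays.sort(beans);
        for (FlowBean bean : beans) {
            System.out.println(bean.getPhoneNbr() + "\t" + bean);
        }
        // expected order: 13826544101 (500), 13726230503 (300), 13926435656 (100)
    }
}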
Steps to implement a custom partition:
1.1 First analyse the concrete business logic and decide roughly how many partitions are needed.
1.2 Write a class that extends org.apache.hadoop.mapreduce.Partitioner.
1.3 Override the public int getPartition method; based on the business logic (looked up from a database, a configuration file, or a hard-coded map), return the same partition number for keys that belong together.
1.4 Register the partitioner in the main method: job.setPartitionerClass(DataPartitioner.class);
1.5 Set the number of reduce tasks: job.setNumReduceTasks(6);
The AreaPartitioner below partitions phone numbers by their three-digit prefix; a quick local check follows it.
AreaPartitioner.java
// placed in the same package as the flow-count classes so it can be referenced directly
package com.test.hadoop.mr.flowcount;

import java.util.HashMap;

import org.apache.hadoop.mapreduce.Partitioner;

public class AreaPartitioner<KEY, VALUE> extends Partitioner<KEY, VALUE> {

    // phone-number prefix -> partition number; every other prefix falls into partition 4
    private static HashMap<String, Integer> areaMap = new HashMap<>();

    static {
        areaMap.put("136", 0);
        areaMap.put("137", 1);
        areaMap.put("138", 2);
        areaMap.put("139", 3);
    }

    @Override
    public int getPartition(KEY key, VALUE value, int numPartitions) {
        Integer provinceCode = areaMap.get(key.toString().substring(0, 3));
        return provinceCode == null ? 4 : provinceCode;
    }
}
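A quick local check of the prefix-to-partition mapping; PartitionCheck is a made-up helper name, the phone numbers come from the sample data above, and AreaPartitioner is assumed to live in the flowcount package:
PartitionCheck.java
package com.test.hadoop.mr.flowcount;

import org.apache.hadoop.io.Text;

public class PartitionCheck {
    public static void main(String[] args) {
        AreaPartitioner<Text, FlowBean> partitioner = new AreaPartitioner<>();
        // prefix 137 -> partition 1, prefix 139 -> partition 3, unknown prefix 159 -> partition 4
        System.out.println(partitioner.getPartition(new Text("13726230503"), null, 5)); // 1
        System.out.println(partitioner.getPartition(new Text("13926435656"), null, 5)); // 3
        System.out.println(partitioner.getPartition(new Text("15920133257"), null, 5)); // 4
    }
}
FlowCountPartition.java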
package com.test.hadoop.mr.flowcount;

import java.io.IOException;

import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;

public class FlowCountPartition {

    public static class FlowCountPartitionMapper extends Mapper<LongWritable, Text, Text, FlowBean> {

        private FlowBean flowBean = new FlowBean();

        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            // take one line of input
            String line = value.toString();
            // split it into fields
            String[] fields = StringUtils.split(line, "\t");
            // pick out the fields we need
            String phoneNbr = fields[0];
            long up_flow = Long.parseLong(fields[fields.length - 3]);
            long d_flow = Long.parseLong(fields[fields.length - 2]);
            // wrap the data in a FlowBean
            flowBean.set(phoneNbr, up_flow, d_flow);
            // emit the flow data with the phone number as the key
            context.write(new Text(phoneNbr), flowBean);
        }
    }

    public static class FlowCountPartitionReducer extends Reducer<Text, FlowBean, Text, FlowBean> {

        private FlowBean flowBean = new FlowBean();

        @Override
        protected void reduce(Text key, Iterable<FlowBean> values, Context context)
                throws IOException, InterruptedException {
            long up_flow_sum = 0;
            long d_flow_sum = 0;
            for (FlowBean bean : values) {
                up_flow_sum += bean.getUp_flow();
                d_flow_sum += bean.getD_flow();
            }
            flowBean.set(key.toString(), up_flow_sum, d_flow_sum);
            context.write(key, flowBean);
        }
    }

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf, "flowpartjob");
        job.setJarByClass(FlowCountPartition.class);
        job.setMapperClass(FlowCountPartitionMapper.class);
        job.setReducerClass(FlowCountPartitionReducer.class);
        /**
         * Register the custom partitioner: AreaPartitioner
         */
        job.setPartitionerClass(AreaPartitioner.class);
        /**
         * The number of reduce tasks should match the number of partitions produced by AreaPartitioner.
         * If there are more reduce tasks than partitions, the extra ones just produce empty output files.
         * If there are fewer reduce tasks than partitions, the job fails with an exception,
         * because some keys are assigned partition numbers that no reduce task receives.
         * (A single reduce task also works: every key is then handed to that one task.)
         * A reduce task or map task is simply an instance of the Reducer or Mapper running on the cluster.
         */
        job.setNumReduceTasks(1);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(FlowBean.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(FlowBean.class);
        job.setInputFormatClass(TextInputFormat.class);
        job.setOutputFormatClass(TextOutputFormat.class);
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        job.waitForCompletion(true);
    }
}
The number of reduce tasks should match the number of partitions returned by AreaPartitioner.
If the number of reduce tasks is larger than the number of partitions, the extra reduce tasks simply produce empty output files.
If it is smaller, the job fails with an exception, because some keys are assigned partition numbers that no reduce task receives.
(Setting the number of reduce tasks to 1 also works: all keys are then handed to that single reduce task, so the custom partitioning has no visible effect.)
A reduce task or map task is an instance of the Reducer or Mapper running on the cluster.
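For the AreaPartitioner above (partitions 0 through 4), the matching configuration would look like this sketch:
// five partitions (0-4) defined by AreaPartitioner -> five reduce tasks,
// producing one output file per area: part-r-00000 ... part-r-00004
job.setPartitionerClass(AreaPartitioner.class);
job.setNumReduceTasks(5);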