The first thing to be clear about is that under the Hadoop framework, a key must implement the WritableComparable interface, while a value only needs to implement the Writable interface. The two custom data types below illustrate this.
Suppose we need to compute traffic statistics over a per-port traffic log file; for that we define a traffic class.
package definyType;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import org.apache.hadoop.io.Writable;

// Custom value type: a traffic record holding upstream/downstream packet
// and payload counters. As a value it only needs to implement Writable.
public class LiuliangTongji implements Writable {
    long upPackNum, downPackNum, upPayLoad, downPayLoad;

    public LiuliangTongji() {
    }

    public LiuliangTongji(String upPackNum, String downPackNum, String upPayLoad,
            String downPayLoad) {
        this.upPackNum = Long.parseLong(upPackNum);
        this.downPackNum = Long.parseLong(downPackNum);
        this.upPayLoad = Long.parseLong(upPayLoad);
        this.downPayLoad = Long.parseLong(downPayLoad);
    }

    @Override
    public String toString() {
        return "LiuliangTongji [upPackNum=" + upPackNum + "\tdownPackNum="
                + downPackNum + "\tupPayLoad=" + upPayLoad + "\tdownPayLoad="
                + downPayLoad + "]";
    }

    // Deserialization: read the fields in exactly the order they were written
    @Override
    public void readFields(DataInput in) throws IOException {
        this.upPackNum = in.readLong();
        this.downPackNum = in.readLong();
        this.upPayLoad = in.readLong();
        this.downPayLoad = in.readLong();
    }

    // Serialization
    @Override
    public void write(DataOutput out) throws IOException {
        out.writeLong(upPackNum);
        out.writeLong(downPackNum);
        out.writeLong(upPayLoad);
        out.writeLong(downPayLoad);
    }
}
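Because readFields must consume the fields in exactly the order write emitted them, a quick round-trip check is cheap insurance. Here is a minimal sketch using only java.io; the class name RoundTripTest and the sample numbers are mine, not part of the job:

package definyType;

import java.io.*;

public class RoundTripTest {
    public static void main(String[] args) throws IOException {
        LiuliangTongji before = new LiuliangTongji("3", "5", "200", "1400");

        // Serialize to an in-memory buffer, much as Hadoop would to the wire
        ByteArrayOutputStream buf = new ByteArrayOutputStream();
        before.write(new DataOutputStream(buf));

        // Deserialize into a fresh instance and compare by eye
        LiuliangTongji after = new LiuliangTongji();
        after.readFields(new DataInputStream(new ByteArrayInputStream(buf.toByteArray())));
        System.out.println(before);
        System.out.println(after); // should print identical field values
    }
}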
The MapReduce job that uses this class as the value of its key-value pairs looks like this:
package definyType;

import java.io.IOException;
import java.net.URI;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

public class LiuliangCount extends Configured implements Tool {

    // Be clear about Mapper's four type parameters: k1/v1 are the raw input
    // types, and k2/v2 are the intermediate types handed to the reducer.
    public static class Map extends
            Mapper<LongWritable, Text, Text, LiuliangTongji> {
        @Override
        public void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            // key/value here are what the framework read from the input file;
            // map() turns them into intermediate key-value pairs for the Reducer
            String[] splits = value.toString().split("\t");
            LiuliangTongji lilTj = new LiuliangTongji(splits[1],
                    splits[2], splits[3], splits[4]);
            Text key2 = new Text(splits[0]);
            context.write(key2, lilTj);
        }
    }

    public static class Reduce extends
            Reducer<Text, LiuliangTongji, Text, LiuliangTongji> {
        @Override
        public void reduce(Text key, Iterable<LiuliangTongji> values,
                Context context) throws IOException, InterruptedException {
            long upPackNum = 0L, downPackNum = 0L,
                    upPayLoad = 0L, downPayLoad = 0L;
            for (LiuliangTongji val : values) {
                upPackNum += val.upPackNum;
                downPackNum += val.downPackNum;
                upPayLoad += val.upPayLoad;
                downPayLoad += val.downPayLoad;
            }
            LiuliangTongji v3 = new LiuliangTongji(upPackNum + "",
                    downPackNum + "", upPayLoad + "", downPayLoad + "");
            context.write(key, v3);
        }
    }

    @Override
    public int run(String[] args) throws Exception {
        Configuration conf = getConf(); // use the conf ToolRunner injected
        Path outpath = new Path(args[1]);
        FileSystem fileSystem = FileSystem.get(new URI(args[1]), conf);
        if (fileSystem.exists(outpath))
            fileSystem.delete(outpath, true);
        Job job = new Job(conf, "LiuliangCount");
        job.setJarByClass(LiuliangCount.class); // required when run from a jar
        FileInputFormat.setInputPaths(job, args[0]);
        FileOutputFormat.setOutputPath(job, outpath);
        job.setMapperClass(Map.class);
        job.setReducerClass(Reduce.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(LiuliangTongji.class);
        boolean success = job.waitForCompletion(true);
        return success ? 0 : 1;
    }

    public static void main(String[] args) throws Exception {
        int ret = ToolRunner.run(new LiuliangCount(), args);
        System.exit(ret);
    }
}
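For concreteness, the mapper above assumes tab-separated input whose first column is the grouping key (say, a port number) followed by the four counters. The sample lines below are made up purely for illustration:

8080	3	5	200	1400
8080	1	2	80	512
9090	7	0	960	0

With this input, the reducer would emit one line per port, with the element-wise sums formatted by toString(): for key 8080 that is upPackNum=4, downPackNum=7, upPayLoad=280, downPayLoad=1912.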
Now suppose we need to sort rectangles by area. This time the custom type serves as the key, and a key type must implement the WritableComparable interface:
package keySortedDemo;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Partitioner;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;

public class RectangleSort {
    static final String Input_Path = "hdfs://localhost:9000/user/huruzun/input1/data1";
    static final String Output_Path = "hdfs://localhost:9000/user/huruzun/output";

    public static void main(String[] args) throws IOException,
            URISyntaxException, InterruptedException, ClassNotFoundException {
        Configuration conf = new Configuration();
        FileSystem fileSystem = FileSystem.get(new URI(Input_Path), conf);
        Path outpath = new Path(Output_Path);
        if (fileSystem.exists(outpath)) {
            fileSystem.delete(outpath, true);
        }
        Job job = new Job(conf, "RectangleSort");
        job.setJarByClass(RectangleSort.class);
        FileInputFormat.setInputPaths(job, Input_Path);
        job.setInputFormatClass(TextInputFormat.class);
        job.setMapperClass(MyMapper.class);
        job.setMapOutputKeyClass(RectangleWritable.class);
        job.setMapOutputValueClass(NullWritable.class);
        job.setReducerClass(MyReducer.class);
        job.setOutputKeyClass(IntWritable.class);
        job.setOutputValueClass(IntWritable.class);
        FileOutputFormat.setOutputPath(job, new Path(Output_Path));
        job.setOutputFormatClass(TextOutputFormat.class);
        // Route keys to reduce tasks by our own partitioning strategy.
        // A job with a custom partitioner must be packaged as a jar and run
        // from the command line; launching it directly from the IDE fails.
        job.setPartitionerClass(MyPartitioner.class);
        job.setNumReduceTasks(2);
        job.waitForCompletion(true);
    }

    static class MyMapper extends
            Mapper<LongWritable, Text, RectangleWritable, NullWritable> {
        @Override
        protected void map(LongWritable k1, Text v1, Context context)
                throws IOException, InterruptedException {
            String[] splits = v1.toString().split("\t");
            RectangleWritable k2 = new RectangleWritable(
                    Integer.parseInt(splits[0]), Integer.parseInt(splits[1]));
            context.write(k2, NullWritable.get());
        }
    }

    // Rectangles with equal area compare as equal, so they are grouped
    // together and only one representative per area reaches the output.
    static class MyReducer extends
            Reducer<RectangleWritable, NullWritable, IntWritable, IntWritable> {
        @Override
        protected void reduce(RectangleWritable k2, Iterable<NullWritable> v2s,
                Context context) throws IOException, InterruptedException {
            context.write(new IntWritable(k2.getLength()),
                    new IntWritable(k2.getWidth()));
        }
    }
}
class RectangleWritable implements WritableComparable<RectangleWritable> {
    int length, width;

    public RectangleWritable() {
    }

    public RectangleWritable(int length, int width) {
        this.length = length;
        this.width = width;
    }

    public int getLength() {
        return length;
    }

    public void setLength(int length) {
        this.length = length;
    }

    public int getWidth() {
        return width;
    }

    public void setWidth(int width) {
        this.width = width;
    }

    @Override
    public void readFields(DataInput in) throws IOException {
        this.length = in.readInt();
        this.width = in.readInt();
    }

    @Override
    public void write(DataOutput out) throws IOException {
        out.writeInt(length);
        out.writeInt(width);
    }

    // Sort order for the shuffle: compare rectangles by area
    @Override
    public int compareTo(RectangleWritable other) {
        int thisArea = this.length * this.width;
        int otherArea = other.length * other.width;
        if (thisArea > otherArea)
            return 1;
        else if (thisArea < otherArea)
            return -1;
        else
            return 0;
    }
}
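One caveat: when a custom type is used as a key, it is good practice to also override hashCode() (and, for consistency, equals()), because the default HashPartitioner dispatches records by hashCode(). This job supplies its own partitioner, so it works without them; still, a sketch of overrides consistent with the compareTo above, which one could add to RectangleWritable, might look like this:

@Override
public int hashCode() {
    // consistent with compareTo: equal-area rectangles hash alike
    return length * width;
}

@Override
public boolean equals(Object o) {
    if (!(o instanceof RectangleWritable))
        return false;
    RectangleWritable other = (RectangleWritable) o;
    return this.length * this.width == other.length * other.width;
}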
class MyPartitioner extends Partitioner<RectangleWritable, NullWritable> {
    @Override
    public int getPartition(RectangleWritable k2, NullWritable v2,
            int numReduceTask) {
        if (k2.getLength() == k2.getWidth())
            return 0; // squares go to this reduce task
        else
            return 1; // non-square rectangles go to this one
    }
}
You may have noticed the MyPartitioner class, which extends Partitioner. In the earlier example we did not set a partitioner at all; in that case the framework falls back on its default, which is a hash-based partitioner (HashPartitioner).
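For reference, the default HashPartitioner is essentially just the following; this is a sketch from memory of the Hadoop source, shown to make the fallback behavior concrete:

public class HashPartitioner<K, V> extends Partitioner<K, V> {
    public int getPartition(K key, V value, int numReduceTasks) {
        // mask off the sign bit so the modulo result is never negative
        return (key.hashCode() & Integer.MAX_VALUE) % numReduceTasks;
    }
}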
Also note that a job using a custom partitioner must be packaged as a jar and run from the command line.
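A typical invocation would look like the line below; the jar name is hypothetical, while the class name comes from the code above (RectangleSort hardcodes its input and output paths, so no arguments are needed):

hadoop jar rectangle-sort.jar keySortedDemo.RectangleSort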