The first thing to be clear about is that under the Hadoop framework, a key must implement the WritableComparable interface, while a value only needs to implement the Writable interface. The two custom data types below illustrate this.
Suppose we need to compute traffic statistics over a per-port traffic log file; for that we define a traffic class.
package definyType;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import org.apache.hadoop.io.Writable;

// Custom value type: a traffic record holding upstream/downstream packet
// and payload counters. As a value it only needs to implement Writable.
public class LiuliangTongji implements Writable {
    long upPackNum, downPackNum, upPayLoad, downPayLoad;

    public LiuliangTongji() {
    }

    public LiuliangTongji(String upPackNum, String downPackNum, String upPayLoad,
            String downPayLoad) {
        this.upPackNum = Long.parseLong(upPackNum);
        this.downPackNum = Long.parseLong(downPackNum);
        this.upPayLoad = Long.parseLong(upPayLoad);
        this.downPayLoad = Long.parseLong(downPayLoad);
    }

    @Override
    public String toString() {
        return "LiuliangTongji [upPackNum=" + upPackNum + "\tdownPackNum="
                + downPackNum + "\tupPayLoad=" + upPayLoad + "\tdownPayLoad="
                + downPayLoad + "]";
    }

    // Deserialization: read the fields in exactly the order they were written
    @Override
    public void readFields(DataInput in) throws IOException {
        this.upPackNum = in.readLong();
        this.downPackNum = in.readLong();
        this.upPayLoad = in.readLong();
        this.downPayLoad = in.readLong();
    }

    // Serialization
    @Override
    public void write(DataOutput out) throws IOException {
        out.writeLong(upPackNum);
        out.writeLong(downPackNum);
        out.writeLong(upPayLoad);
        out.writeLong(downPayLoad);
    }
}
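Because readFields must consume the fields in exactly the order write emitted them, a quick round-trip check is cheap insurance. Here is a minimal sketch using only java.io; the class name RoundTripTest and the sample numbers are mine, not part of the job:

package definyType;

import java.io.*;

public class RoundTripTest {
    public static void main(String[] args) throws IOException {
        LiuliangTongji before = new LiuliangTongji("3", "5", "200", "1400");

        // Serialize to an in-memory buffer, much as Hadoop would to the wire
        ByteArrayOutputStream buf = new ByteArrayOutputStream();
        before.write(new DataOutputStream(buf));

        // Deserialize into a fresh instance and compare by eye
        LiuliangTongji after = new LiuliangTongji();
        after.readFields(new DataInputStream(new ByteArrayInputStream(buf.toByteArray())));
        System.out.println(before);
        System.out.println(after); // should print identical field values
    }
}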
The MapReduce job that uses this class as the value of its key-value pairs looks like this:
package definyType;

import java.io.IOException;
import java.net.URI;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

public class LiuliangCount extends Configured implements Tool {

    // Be clear about Mapper's four type parameters: k1/v1 are the raw input
    // types, and k2/v2 are the intermediate types handed to the reducer.
    public static class Map extends
            Mapper<LongWritable, Text, Text, LiuliangTongji> {
        @Override
        public void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            // key/value here are what the framework read from the input file;
            // map() turns them into intermediate key-value pairs for the Reducer
            String[] splits = value.toString().split("\t");
            LiuliangTongji lilTj = new LiuliangTongji(splits[1],
                    splits[2], splits[3], splits[4]);
            Text key2 = new Text(splits[0]);
            context.write(key2, lilTj);
        }
    }

    public static class Reduce extends
            Reducer<Text, LiuliangTongji, Text, LiuliangTongji> {
        @Override
        public void reduce(Text key, Iterable<LiuliangTongji> values,
                Context context) throws IOException, InterruptedException {
            long upPackNum = 0L, downPackNum = 0L,
                    upPayLoad = 0L, downPayLoad = 0L;
            for (LiuliangTongji val : values) {
                upPackNum += val.upPackNum;
                downPackNum += val.downPackNum;
                upPayLoad += val.upPayLoad;
                downPayLoad += val.downPayLoad;
            }
            LiuliangTongji v3 = new LiuliangTongji(upPackNum + "",
                    downPackNum + "", upPayLoad + "", downPayLoad + "");
            context.write(key, v3);
        }
    }

    @Override
    public int run(String[] args) throws Exception {
        Configuration conf = getConf(); // use the conf ToolRunner injected
        Path outpath = new Path(args[1]);
        FileSystem fileSystem = FileSystem.get(new URI(args[1]), conf);
        if (fileSystem.exists(outpath))
            fileSystem.delete(outpath, true);
        Job job = new Job(conf, "LiuliangCount");
        job.setJarByClass(LiuliangCount.class); // required when run from a jar
        FileInputFormat.setInputPaths(job, args[0]);
        FileOutputFormat.setOutputPath(job, outpath);
        job.setMapperClass(Map.class);
        job.setReducerClass(Reduce.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(LiuliangTongji.class);
        boolean success = job.waitForCompletion(true);
        return success ? 0 : 1;
    }

    public static void main(String[] args) throws Exception {
        int ret = ToolRunner.run(new LiuliangCount(), args);
        System.exit(ret);
    }
}
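For concreteness, the mapper above assumes tab-separated input whose first column is the grouping key (say, a port number) followed by the four counters. The sample lines below are made up purely for illustration:

8080	3	5	200	1400
8080	1	2	80	512
9090	7	0	960	0

With this input, the reducer would emit one line per port, with the element-wise sums formatted by toString(): for key 8080 that is upPackNum=4, downPackNum=7, upPayLoad=280, downPayLoad=1912.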
Now suppose we need to sort rectangles by area. This time the custom type serves as the key, and a key type must implement the WritableComparable interface:
package keySortedDemo;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Partitioner;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;

public class RectangleSort {
    static final String Input_Path = "hdfs://localhost:9000/user/huruzun/input1/data1";
    static final String Output_Path = "hdfs://localhost:9000/user/huruzun/output";

    public static void main(String[] args) throws IOException,
            URISyntaxException, InterruptedException, ClassNotFoundException {
        Configuration conf = new Configuration();
        FileSystem fileSystem = FileSystem.get(new URI(Input_Path), conf);
        Path outpath = new Path(Output_Path);
        if (fileSystem.exists(outpath)) {
            fileSystem.delete(outpath, true);
        }
        Job job = new Job(conf, "RectangleSort");
        job.setJarByClass(RectangleSort.class);
        FileInputFormat.setInputPaths(job, Input_Path);
        job.setInputFormatClass(TextInputFormat.class);
        job.setMapperClass(MyMapper.class);
        job.setMapOutputKeyClass(RectangleWritable.class);
        job.setMapOutputValueClass(NullWritable.class);
        job.setReducerClass(MyReducer.class);
        job.setOutputKeyClass(IntWritable.class);
        job.setOutputValueClass(IntWritable.class);
        FileOutputFormat.setOutputPath(job, new Path(Output_Path));
        job.setOutputFormatClass(TextOutputFormat.class);
        // Route keys to reduce tasks by our own partitioning strategy.
        // A job with a custom partitioner must be packaged as a jar and run
        // from the command line; launching it directly from the IDE fails.
        job.setPartitionerClass(MyPartitioner.class);
        job.setNumReduceTasks(2);
        job.waitForCompletion(true);
    }

    static class MyMapper extends
            Mapper<LongWritable, Text, RectangleWritable, NullWritable> {
        @Override
        protected void map(LongWritable k1, Text v1, Context context)
                throws IOException, InterruptedException {
            String[] splits = v1.toString().split("\t");
            RectangleWritable k2 = new RectangleWritable(
                    Integer.parseInt(splits[0]), Integer.parseInt(splits[1]));
            context.write(k2, NullWritable.get());
        }
    }

    // Rectangles with equal area compare as equal, so they are grouped
    // together and only one representative per area reaches the output.
    static class MyReducer extends
            Reducer<RectangleWritable, NullWritable, IntWritable, IntWritable> {
        @Override
        protected void reduce(RectangleWritable k2, Iterable<NullWritable> v2s,
                Context context) throws IOException, InterruptedException {
            context.write(new IntWritable(k2.getLength()),
                    new IntWritable(k2.getWidth()));
        }
    }
}
class RectangleWritable implements WritableComparable<RectangleWritable> {
    int length, width;

    public RectangleWritable() {
    }

    public RectangleWritable(int length, int width) {
        this.length = length;
        this.width = width;
    }

    public int getLength() {
        return length;
    }

    public void setLength(int length) {
        this.length = length;
    }

    public int getWidth() {
        return width;
    }

    public void setWidth(int width) {
        this.width = width;
    }

    @Override
    public void readFields(DataInput in) throws IOException {
        this.length = in.readInt();
        this.width = in.readInt();
    }

    @Override
    public void write(DataOutput out) throws IOException {
        out.writeInt(length);
        out.writeInt(width);
    }

    // Sort order for the shuffle: compare rectangles by area
    @Override
    public int compareTo(RectangleWritable other) {
        int thisArea = this.length * this.width;
        int otherArea = other.length * other.width;
        if (thisArea > otherArea)
            return 1;
        else if (thisArea < otherArea)
            return -1;
        else
            return 0;
    }
}
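One caveat: when a custom type is used as a key, it is good practice to also override hashCode() (and, for consistency, equals()), because the default HashPartitioner dispatches records by hashCode(). This job supplies its own partitioner, so it works without them; still, a sketch of overrides consistent with the compareTo above, which one could add to RectangleWritable, might look like this:

@Override
public int hashCode() {
    // consistent with compareTo: equal-area rectangles hash alike
    return length * width;
}

@Override
public boolean equals(Object o) {
    if (!(o instanceof RectangleWritable))
        return false;
    RectangleWritable other = (RectangleWritable) o;
    return this.length * this.width == other.length * other.width;
}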
class MyPartitioner extends Partitioner<RectangleWritable, NullWritable> {
    @Override
    public int getPartition(RectangleWritable k2, NullWritable v2,
            int numReduceTask) {
        if (k2.getLength() == k2.getWidth())
            return 0; // squares go to this reduce task
        else
            return 1; // non-square rectangles go to this one
    }
}
You may have noticed the MyPartitioner class, which extends Partitioner. In the earlier example we did not set a partitioner at all; in that case the framework falls back on its default, which is a hash-based partitioner (HashPartitioner).
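For reference, the default HashPartitioner is essentially just the following; this is a sketch from memory of the Hadoop source, shown to make the fallback behavior concrete:

public class HashPartitioner<K, V> extends Partitioner<K, V> {
    public int getPartition(K key, V value, int numReduceTasks) {
        // mask off the sign bit so the modulo result is never negative
        return (key.hashCode() & Integer.MAX_VALUE) % numReduceTasks;
    }
}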
Also note that a job using a custom partitioner must be packaged as a jar and run from the command line.
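A typical invocation would look like the line below; the jar name is hypothetical, while the class name comes from the code above (RectangleSort hardcodes its input and output paths, so no arguments are needed):

hadoop jar rectangle-sort.jar keySortedDemo.RectangleSort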