MapReduce -- Custom Global Sort and Partition Sort
1 Sort
- Sort happens after Map and before Reduce.
- Data processed by MapReduce first goes through the Map phase and then through the Shuffle phase.
- In the Shuffle phase, the data is spilled (spill), partitioned (partitioner), sorted (sort), combined (combine), compressed (compress), and grouped (group) before it reaches the Reduce side.
- A global sort is similar to ORDER BY in SQL: there is only one reduce output, which guarantees a total order.
- A partition (local) sort guarantees order within each partition.
- A secondary sort orders records by multiple criteria, as sketched right after this list.
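As a quick illustration of a multi-criteria comparison, here is a minimal plain-Java sketch; the TrafficKey class and its fields are hypothetical and not part of the walkthrough below. The first criterion decides unless it ties, then the next one breaks the tie:

public class TrafficKey implements Comparable<TrafficKey> {

    private final String phone;
    private final long sum;

    public TrafficKey(String phone, long sum) {
        this.phone = phone;
        this.sum = sum;
    }

    @Override
    public int compareTo(TrafficKey o) {
        // Primary criterion: total traffic, descending
        int bySum = Long.compare(o.sum, this.sum);
        if (bySum != 0) {
            return bySum;
        }
        // Secondary criterion: phone number, ascending
        return this.phone.compareTo(o.phone);
    }
}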
2 Custom MapReduce Global Sort
2.1 Requirement
Field 1: phone number
Field 2: upstream traffic
Field 3: downstream traffic
Field 4: total traffic
Sort the records by total traffic (descending, as the results below show).
2.2 Data
13480253104 180 180 360
13502468823 7335 110349 117684
13560436666 1116 954 2070
13560439658 2034 5892 7926
13602846565 1938 2910 4848
13660577991 6960 690 7650
13719199419 240 0 240
13726230503 2481 24681 27162
13726238888 12481 44681 57162
13760778710 120 120 240
13826544101 264 0 264
13922314466 3008 3720 6728
13925057413 11058 48243 59301
13926251106 240 0 240
13926435656 132 1512 1644
15013685858 3659 3538 7197
15920133257 3156 2936 6092
15989002119 1938 180 2118
18211575961 1527 2106 3633
18320173382 9531 2412 11943
84138413 4116 1432 5548
2.3 Code
2.3.1 AllSortDriver Code
package com.xk.bigata.hadoop.mapreduce.sort;

import com.xk.bigata.hadoop.mapreduce.domain.AccessAllSortDomain;
import com.xk.bigata.hadoop.utils.FileUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;

public class AllSortDriver {

    public static void main(String[] args) throws Exception {
        String input = "mapreduce-basic/data/sort.data";
        String output = "mapreduce-basic/out";

        // 1 Create the MapReduce job
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);

        // Delete the output path if it already exists
        FileUtils.deleteFile(job.getConfiguration(), output);

        // 2 Set the driver class
        job.setJarByClass(AllSortDriver.class);

        // 3 Set the Mapper and Reducer classes
        job.setMapperClass(MyMapper.class);
        job.setReducerClass(MyReducer.class);

        // 4 Set the Map output KEY and VALUE types
        job.setMapOutputKeyClass(AccessAllSortDomain.class);
        job.setMapOutputValueClass(Text.class);

        // 5 Set the Reduce output KEY and VALUE types
        job.setOutputKeyClass(AccessAllSortDomain.class);
        job.setOutputValueClass(NullWritable.class);

        // 6 Set the input and output paths
        FileInputFormat.setInputPaths(job, new Path(input));
        FileOutputFormat.setOutputPath(job, new Path(output));

        // A single reduce task is what makes the sort global
        job.setNumReduceTasks(1);

        // 7 Submit the job
        boolean result = job.waitForCompletion(true);
        System.exit(result ? 0 : 1);
    }

    public static class MyMapper extends Mapper<LongWritable, Text, AccessAllSortDomain, Text> {

        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            String[] splits = value.toString().split("\t");
            String phone = splits[0];
            long up = Long.parseLong(splits[1]);
            long down = Long.parseLong(splits[2]);
            long sum = Long.parseLong(splits[3]);
            // Emit the whole record as the key so the shuffle sorts by its compareTo
            context.write(new AccessAllSortDomain(phone, up, down, sum), new Text(phone));
        }
    }

    public static class MyReducer extends Reducer<AccessAllSortDomain, Text, AccessAllSortDomain, NullWritable> {

        @Override
        protected void reduce(AccessAllSortDomain key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
            for (Text value : values) {
                context.write(key, NullWritable.get());
            }
        }
    }
}
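FileUtils is the author's own helper from com.xk.bigata.hadoop.utils and its source is not shown in this section. Below is a minimal sketch of what deleteFile presumably does, assuming it simply removes the output directory through the Hadoop FileSystem API so the job does not abort with an existing-output error:

package com.xk.bigata.hadoop.utils;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class FileUtils {

    // Assumed behavior: recursively delete the output path if it exists
    public static void deleteFile(Configuration conf, String output) throws Exception {
        FileSystem fs = FileSystem.get(conf);
        Path outputPath = new Path(output);
        if (fs.exists(outputPath)) {
            fs.delete(outputPath, true);
        }
    }
}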
2.3.2 AccessAllSortDomain Code
package com.xk.bigata.hadoop.mapreduce.domain;

import org.apache.hadoop.io.WritableComparable;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

/**
 * Steps for a custom data type:
 * 1. Implement the WritableComparable/Writable interface
 * 2. Override the write, readFields, and compareTo methods
 * 3. Provide a no-argument constructor
 * 4. write and readFields must handle the fields in the same order
 * 5. Override toString to control the output format (optional)
 */
public class AccessAllSortDomain implements WritableComparable<AccessAllSortDomain> {

    private String phone;
    private Long up;
    private Long down;
    private Long sum;

    public String getPhone() {
        return phone;
    }

    public void setPhone(String phone) {
        this.phone = phone;
    }

    public Long getUp() {
        return up;
    }

    public void setUp(Long up) {
        this.up = up;
    }

    public Long getDown() {
        return down;
    }

    public void setDown(Long down) {
        this.down = down;
    }

    public Long getSum() {
        return sum;
    }

    public void setSum(Long sum) {
        this.sum = sum;
    }

    public AccessAllSortDomain() {
    }

    public AccessAllSortDomain(String phone, Long up, Long down, Long sum) {
        this.phone = phone;
        this.up = up;
        this.down = down;
        this.sum = sum;
    }

    @Override
    public String toString() {
        return phone + '\t' +
                up + '\t' +
                down + '\t' +
                sum;
    }

    @Override
    public void write(DataOutput out) throws IOException {
        out.writeUTF(phone);
        out.writeLong(up);
        out.writeLong(down);
        out.writeLong(sum);
    }

    @Override
    public void readFields(DataInput in) throws IOException {
        phone = in.readUTF();
        up = in.readLong();
        down = in.readLong();
        sum = in.readLong();
    }

    @Override
    public int compareTo(AccessAllSortDomain o) {
        // Descending by total traffic. Deliberately never returns 0: the
        // shuffle groups keys that compare equal, so returning 0 on ties
        // would merge different phones with the same total into one key.
        return this.getSum() > o.getSum() ? -1 : 1;
    }
}
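Why compareTo never returns 0: keys that compare equal are grouped into a single reduce call, and this data set has three phones with the same total (240). A small plain-JDK analogy of that effect, using nothing beyond the standard library:

import java.util.Comparator;
import java.util.TreeSet;

public class NoZeroCompareDemo {

    public static void main(String[] args) {
        long[] sums = {360, 240, 240, 240};

        // Returns 0 on ties: the three 240s collapse into one entry
        TreeSet<Long> grouping = new TreeSet<>(Comparator.reverseOrder());
        // Never returns 0, like AccessAllSortDomain.compareTo: all survive
        TreeSet<Long> nonGrouping = new TreeSet<>((a, b) -> a > b ? -1 : 1);

        for (long s : sums) {
            grouping.add(s);
            nonGrouping.add(s);
        }

        System.out.println(grouping.size());    // 2
        System.out.println(nonGrouping.size()); // 4
    }
}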
2.4 Results
13502468823 7335 110349 117684
13925057413 11058 48243 59301
13726238888 12481 44681 57162
13726230503 2481 24681 27162
18320173382 9531 2412 11943
13560439658 2034 5892 7926
13660577991 6960 690 7650
15013685858 3659 3538 7197
13922314466 3008 3720 6728
15920133257 3156 2936 6092
84138413 4116 1432 5548
13602846565 1938 2910 4848
18211575961 1527 2106 3633
15989002119 1938 180 2118
13560436666 1116 954 2070
13926435656 132 1512 1644
13480253104 180 180 360
13826544101 264 0 264
13926251106 240 0 240
13760778710 120 120 240
13719199419 240 0 240
3 Custom MapReduce Sort Within Partitions
3.1 Requirement
Field 1: phone number
Field 2: upstream traffic
Field 3: downstream traffic
Field 4: total traffic
Phone numbers starting with 13 go to one partition,
phone numbers starting with 18 go to a second partition,
and everything else goes to a third partition.
Within each partition, sort by total traffic (descending).
3.2 Data
13480253104 180 180 360
13502468823 7335 110349 117684
13560436666 1116 954 2070
13560439658 2034 5892 7926
13602846565 1938 2910 4848
13660577991 6960 690 7650
13719199419 240 0 240
13726230503 2481 24681 27162
13726238888 12481 44681 57162
13760778710 120 120 240
13826544101 264 0 264
13922314466 3008 3720 6728
13925057413 11058 48243 59301
13926251106 240 0 240
13926435656 132 1512 1644
15013685858 3659 3538 7197
15920133257 3156 2936 6092
15989002119 1938 180 2118
18211575961 1527 2106 3633
18320173382 9531 2412 11943
84138413 4116 1432 5548
3.3 Code
3.3.1 PartitionSortDriver Code
package com.xk.bigata.hadoop.mapreduce.sort;

import com.xk.bigata.hadoop.mapreduce.domain.AccessAllSortDomain;
import com.xk.bigata.hadoop.utils.FileUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;

public class PartitionSortDriver {

    public static void main(String[] args) throws Exception {
        String input = "mapreduce-basic/data/sort.data";
        String output = "mapreduce-basic/out";

        // 1 Create the MapReduce job
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);

        // Set the custom partitioner class
        job.setPartitionerClass(PhonePartitioner.class);

        // Delete the output path if it already exists
        FileUtils.deleteFile(job.getConfiguration(), output);

        // 2 Set the driver class
        job.setJarByClass(PartitionSortDriver.class);

        // 3 Set the Mapper and Reducer classes
        job.setMapperClass(MyMapper.class);
        job.setReducerClass(MyReducer.class);

        // 4 Set the Map output KEY and VALUE types
        job.setMapOutputKeyClass(AccessAllSortDomain.class);
        job.setMapOutputValueClass(Text.class);

        // 5 Set the Reduce output KEY and VALUE types
        job.setOutputKeyClass(AccessAllSortDomain.class);
        job.setOutputValueClass(NullWritable.class);

        // 6 Set the input and output paths
        FileInputFormat.setInputPaths(job, new Path(input));
        FileOutputFormat.setOutputPath(job, new Path(output));

        // Set the number of reduce tasks: one per partition
        job.setNumReduceTasks(3);

        // 7 Submit the job
        boolean result = job.waitForCompletion(true);
        System.exit(result ? 0 : 1);
    }

    public static class MyMapper extends Mapper<LongWritable, Text, AccessAllSortDomain, Text> {

        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            String[] splits = value.toString().split("\t");
            String phone = splits[0];
            long up = Long.parseLong(splits[1]);
            long down = Long.parseLong(splits[2]);
            long sum = Long.parseLong(splits[3]);
            context.write(new AccessAllSortDomain(phone, up, down, sum), new Text(phone));
        }
    }

    public static class MyReducer extends Reducer<AccessAllSortDomain, Text, AccessAllSortDomain, NullWritable> {

        @Override
        protected void reduce(AccessAllSortDomain key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
            for (Text value : values) {
                context.write(key, NullWritable.get());
            }
        }
    }
}
3.3.2 PhonePartitioner Code
package com.xk.bigata.hadoop.mapreduce.sort;

import com.xk.bigata.hadoop.mapreduce.domain.AccessAllSortDomain;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Partitioner;

public class PhonePartitioner extends Partitioner<AccessAllSortDomain, Text> {

    @Override
    public int getPartition(AccessAllSortDomain accessAllSortDomain, Text text, int numPartitions) {
        String phone = accessAllSortDomain.getPhone();
        // Partition 0: phones starting with 13; partition 1: starting with 18; partition 2: everything else
        if (phone.startsWith("13")) {
            return 0;
        } else if (phone.startsWith("18")) {
            return 1;
        } else {
            return 2;
        }
    }
}
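Note that job.setNumReduceTasks(3) in the driver matches the three partition indexes returned here: each partition feeds exactly one reduce task, which produces the part-r-00000 through part-r-00002 files below. With only one reduce task Hadoop bypasses the custom partitioner entirely, and with two reduce tasks the records routed to partition 2 would fail with an illegal-partition error.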
3.4 Results
3.4.1 part-r-00000
13502468823 7335 110349 117684
13925057413 11058 48243 59301
13726238888 12481 44681 57162
13726230503 2481 24681 27162
13560439658 2034 5892 7926
13660577991 6960 690 7650
13922314466 3008 3720 6728
13602846565 1938 2910 4848
13560436666 1116 954 2070
13926435656 132 1512 1644
13480253104 180 180 360
13826544101 264 0 264
13719199419 240 0 240
13926251106 240 0 240
13760778710 120 120 240
3.4.2 part-r-00001
18320173382 9531 2412 11943
18211575961 1527 2106 3633
3.4.3 part-r-00002
15013685858 3659 3538 7197
15920133257 3156 2936 6092
84138413 4116 1432 5548
15989002119 1938 180 2118