Secondary Sort
A secondary sort orders records that share the same first field by a second field: sort by key first, and when keys are equal, sort by value. Achieving both orderings in a single MapReduce program is what we call a secondary sort.
For example, an e-commerce platform records the amount of every order placed by each user. The task is to sort all orders belonging to the same user by amount, and to sort the output by user name as well.
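Before the MapReduce version, the same two-level ordering can be expressed with plain Java collections (a minimal sketch for illustration only; the class and variable names are made up):

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Comparator;
import java.util.List;

public class TwoLevelSortDemo {
    public static void main(String[] args) {
        List<String[]> rows = new ArrayList<>(Arrays.asList(
                new String[] { "hive@apache", "550" },
                new String[] { "hadoop@apache", "200" },
                new String[] { "hive@apache", "159" },
                new String[] { "hadoop@apache", "300" }));
        // First by account name, then by cost within the same account
        rows.sort(Comparator.<String[], String>comparing(r -> r[0])
                .thenComparingLong(r -> Long.parseLong(r[1])));
        for (String[] r : rows) {
            System.out.println(r[0] + "\t" + r[1]);
        }
    }
}

MapReduce cannot simply call sort() like this, because one user's records are scattered across mappers; the key class, sort comparator, and grouping comparator below recreate the same effect during the shuffle.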
This brings in the concept of grouping.
Grouping
1) Concept: a grouping comparator defines which keys may be placed in the same group on the reduce side; all records whose keys compare as equal are handed to a single reduce() call, while different groups are processed separately.
2) Custom grouping and sorting
Implement a WritableComparator and override compare() to set the comparison strategy; the full listing below does exactly this.
Sample data (tab-separated):
hadoop@apache 200
hive@apache 550
yarn@apache 580
hive@apache 159
hadoop@apache 300
hive@apache 258
hadoop@apache 100
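For reference, given the descending sort and the partitioning implemented below (two reducers: hadoop records in partition 0, hive and yarn in partition 1), the output should look roughly like this; the bracketed form comes from AccountWritable.toString():

part-r-00000:
[account=hadoop@apache, cost=300]
[account=hadoop@apache, cost=200]
[account=hadoop@apache, cost=100]

part-r-00001:
[account=yarn@apache, cost=580]
[account=hive@apache, cost=550]
[account=hive@apache, cost=258]
[account=hive@apache, cost=159]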
First, define a custom key class that is both serializable and comparable:
package com.hnxy.mr.Sort;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

import org.apache.hadoop.io.WritableComparable;

public class AccountWritable implements WritableComparable<AccountWritable> {

    // Fields: account name and order cost
    private String account;
    private Long cost;

    public String getAccount() {
        return account;
    }

    public void setAccount(String account) {
        this.account = account;
    }

    public Long getCost() {
        return cost;
    }

    public void setCost(Long cost) {
        this.cost = cost;
    }

    @Override
    public String toString() {
        return "[account=" + account + ", cost=" + cost + "]";
    }

    @Override
    public void write(DataOutput out) throws IOException {
        // Serialize the fields in a fixed order
        out.writeUTF(account);
        out.writeLong(cost);
    }

    @Override
    public void readFields(DataInput in) throws IOException {
        // Deserialize in exactly the same order as write()
        this.account = in.readUTF();
        this.cost = in.readLong();
    }

    @Override
    public int compareTo(AccountWritable o) {
        // Default ordering: ascending by account name
        return this.getAccount().compareTo(o.getAccount());
    }
}
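As a quick sanity check, the key class can be round-tripped through Hadoop's in-memory buffers (a minimal sketch, not part of the original job; DataOutputBuffer and DataInputBuffer are Hadoop I/O utilities):

package com.hnxy.mr.Sort;

import org.apache.hadoop.io.DataInputBuffer;
import org.apache.hadoop.io.DataOutputBuffer;

public class AccountWritableCheck {
    public static void main(String[] args) throws Exception {
        AccountWritable before = new AccountWritable();
        before.setAccount("hive@apache");
        before.setCost(550L);
        // Serialize into an in-memory buffer
        DataOutputBuffer out = new DataOutputBuffer();
        before.write(out);
        // Deserialize the same bytes back into a fresh instance
        DataInputBuffer in = new DataInputBuffer();
        in.reset(out.getData(), out.getLength());
        AccountWritable after = new AccountWritable();
        after.readFields(in);
        System.out.println(after); // expect [account=hive@apache, cost=550]
    }
}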
Next, the driver class with its mapper, partitioner, and comparators:
package com.hnxy.mr.Sort;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Partitioner;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

public class MyMRSecondarySort extends Configured implements Tool {

    // Input line format: hadoop@apache\t200
    public static class SecondarySortMapper extends Mapper<LongWritable, Text, AccountWritable, NullWritable> {

        AccountWritable outkey = new AccountWritable();
        String[] str = null;

        @Override
        protected void map(LongWritable key, Text value,
                Mapper<LongWritable, Text, AccountWritable, NullWritable>.Context context)
                throws IOException, InterruptedException {
            // Parse the line and emit the whole record as the key
            str = value.toString().split("\t");
            if (str != null && str.length == 2) {
                outkey.setAccount(str[0]);
                outkey.setCost(Long.parseLong(str[1]));
                context.write(outkey, NullWritable.get());
            }
        }
    }
    // Requirement: hadoop records go into one output file; hive and yarn records are stored together in another
    private static class MyPartitioner extends Partitioner<AccountWritable, NullWritable> {

        @Override
        public int getPartition(AccountWritable key, NullWritable value, int numPartitions) {
            // Custom partitioning rule (assumes the job runs with two reducers)
            if (key.getAccount().startsWith("hadoop")) {
                return 0;
            } else {
                return 1;
            }
        }
    }
    // Sort comparator: orders keys within each partition during the shuffle
    private static class MyCompartor extends WritableComparator {

        public MyCompartor() {
            super(AccountWritable.class, true);
        }

        @Override
        public int compare(WritableComparable a, WritableComparable b) {
            // Cast to the concrete key type
            AccountWritable aa = (AccountWritable) a;
            AccountWritable ab = (AccountWritable) b;
            // Sort by account name in descending order
            int result = ab.getAccount().compareTo(aa.getAccount());
            // If the account names are equal, fall back to cost, also descending
            if (result == 0) {
                result = ab.getCost().compareTo(aa.getCost());
            }
            return result;
        }
    }
    // Grouping comparator: decides which keys land in the same reduce() call
    private static class MyCompartor1 extends WritableComparator {

        public MyCompartor1() {
            super(AccountWritable.class, true);
        }

        @Override
        public int compare(WritableComparable a, WritableComparable b) {
            // Cast to the concrete key type
            AccountWritable aa = (AccountWritable) a;
            AccountWritable ab = (AccountWritable) b;
            // This looks like a second sort comparison, but it only defines group
            // boundaries: keys with the same account name compare as 0 and are
            // therefore handed to the same reduce() call
            int result = aa.getAccount().compareTo(ab.getAccount());
            return result;
        }
    }
    @Override
    public int run(String[] args) throws Exception {
        // Method return value (becomes the process exit code via ToolRunner)
        int count = 0;
        // Get the configuration
        Configuration conf = this.getConf();
        // Delete the output directory if it already exists
        FileSystem fs = FileSystem.get(conf);
        // Input and output paths
        Path in = new Path(args[0]);
        Path out = new Path(args[1]);
        if (fs.exists(out)) {
            fs.delete(out, true);
            System.out.println("Old OutPut Path is Deleted!");
        }
        // Job name, overridable on the command line with -Djob_name=...
        String jobName = conf.get("job_name");
        if (jobName == null || jobName.trim().isEmpty()) {
            jobName = "Job_By_Su";
        }
        Job job = Job.getInstance(conf, jobName);
        // Set the jar and the mapper class
        job.setJarByClass(MyMRSecondarySort.class);
        job.setMapperClass(SecondarySortMapper.class);
        job.setNumReduceTasks(2);
        // Note: at runtime the framework partitions first, then sorts, then groups
        // Grouping comparator: merges equal account names into one reduce() call
        job.setGroupingComparatorClass(MyCompartor1.class);
        // Partitioner: routes map output to the two reducers
        job.setPartitionerClass(MyPartitioner.class);
        // Sort comparator: orders keys during the shuffle/merge
        job.setSortComparatorClass(MyCompartor.class);
        // The map output types differ from the defaults, so set them explicitly
        job.setMapOutputKeyClass(AccountWritable.class);
        job.setMapOutputValueClass(NullWritable.class);
        // Input and output formats
        job.setInputFormatClass(TextInputFormat.class);
        job.setOutputFormatClass(TextOutputFormat.class);
        // Input and output paths
        FileInputFormat.addInputPath(job, in);
        FileOutputFormat.setOutputPath(job, out);
        // Run the job; exit code 0 on success, 1 on failure
        count = job.waitForCompletion(true) ? 0 : 1;
        // Return
        return count;
    }
    public static void main(String[] args) throws Exception {
        System.exit(ToolRunner.run(new MyMRSecondarySort(), args));
    }
}
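A typical way to launch the job after packaging it (the jar name and HDFS paths are placeholders):

hadoop jar secondary-sort.jar com.hnxy.mr.Sort.MyMRSecondarySort \
    -Djob_name=secondary_sort /input/orders /output/orders_sorted

Because the driver runs through ToolRunner, -D options are consumed by GenericOptionsParser, which is why the job name can be overridden from the command line without touching the code.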