MapReduce 的排序都是基于 K2(即 Map 输出的 key)进行的,常见有三种情况:
自定义字符串 Text 类型的排序
自定义数字 IntWritable 类型的排序
自定义数据类型的排序
MapReduce 对字符串默认按字典顺序排序,对数字默认按从小到大的顺序排序。
如果想修改默认的排序规则,需要继承对应类型的 Comparator 类(例如 Text.Comparator),并重写其 compare 方法。
import org.apache.hadoop.io.Text.Comparator;
/**
 * Sort comparator for serialized {@code Text} keys that reverses the order
 * produced by the inherited {@code Text.Comparator}.
 */
public class SortText extends Comparator {

    /**
     * Compares two serialized keys and flips the sign of the inherited result.
     *
     * @param b1 buffer holding the first key
     * @param s1 start offset of the first key in {@code b1}
     * @param l1 length of the first key
     * @param b2 buffer holding the second key
     * @param s2 start offset of the second key in {@code b2}
     * @param l2 length of the second key
     * @return the negated result of the parent comparator
     */
    @Override
    public int compare(byte[] b1, int s1, int l1, byte[] b2, int s2, int l2) {
        final int parentOrder = super.compare(b1, s1, l1, b2, s2, l2);
        return -parentOrder;
    }
}
这两种方式都可以实现。继承该类之后需要重写它的 compare 方法,否则仍然使用默认排序。
然后在任务中通过 job.setSortComparatorClass(...) 添加排序规则就可以了。
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
/**
 * Driver that configures and submits a MapReduce job built from
 * {@code WCMapper}, {@code WCReduce} and the {@code SortText} sort comparator.
 */
public class WCJob {

    /**
     * Configures the job, runs it, and exits with a status reflecting the outcome.
     *
     * @param args {@code args[0]} is the input path, {@code args[1]} is the output path
     * @throws Exception if the job cannot be created, submitted or monitored
     */
    public static void main(String[] args) throws Exception {
        // Fail with a readable message instead of an ArrayIndexOutOfBoundsException.
        if (args.length < 2) {
            System.err.println("Usage: WCJob <input path> <output path>");
            System.exit(2);
        }

        // Create the job.
        Job job = Job.getInstance(new Configuration());
        job.setJarByClass(WCJob.class);

        // Mapper and the types of the map output.
        job.setMapperClass(WCMapper.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);

        // Custom ordering for the map output keys.
        job.setSortComparatorClass(SortText.class);

        // Reducer and the types of the final output.
        job.setReducerClass(WCReduce.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        // Input and output locations.
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        // Run the job and report failure through the process exit status,
        // so calling scripts can tell a failed run from a successful one.
        boolean succeeded = job.waitForCompletion(true);
        System.exit(succeeded ? 0 : 1);
    }
}
自定义数据类型(按数字字段比较)的排序示例:
package day09.mapreducer.sortEmploy;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import org.apache.hadoop.io.WritableComparable;
/**
 * Employee record that can be serialized by Hadoop and used as a sortable key.
 *
 * <p>Ordering is by department number, then salary, then employee number
 * (all ascending). {@link #equals(Object)} and {@link #hashCode()} use the
 * same three fields so that they stay consistent with {@link #compareTo}.
 */
public class Employee implements WritableComparable<Employee> {
    private int empno;
    private String name;
    private String job;
    private int mgr;
    private String hiredate;
    private int sal;
    private int comm;
    private int deptno;

    /**
     * Orders employees by department, then by salary within a department,
     * then by employee number.
     *
     * <p>The employee number is the final tie-breaker so that two different
     * employees with the same department and salary still compare as
     * different, while the method keeps the {@code Comparable} contract:
     * it returns 0 for equal keys and {@code a.compareTo(b)} always has the
     * opposite sign of {@code b.compareTo(a)}.
     *
     * @param o the employee to compare with
     * @return a negative value, zero, or a positive value when this employee
     *         sorts before, equal to, or after {@code o}
     */
    @Override
    public int compareTo(Employee o) {
        int byDept = Integer.compare(this.deptno, o.getDeptno());
        if (byDept != 0) {
            return byDept;
        }
        int bySal = Integer.compare(this.sal, o.getSal());
        if (bySal != 0) {
            return bySal;
        }
        return Integer.compare(this.empno, o.getEmpno());
    }

    /**
     * Two employees are equal when the fields used by {@link #compareTo}
     * (department, salary, employee number) are all equal.
     */
    @Override
    public boolean equals(Object obj) {
        if (this == obj) {
            return true;
        }
        if (!(obj instanceof Employee)) {
            return false;
        }
        Employee other = (Employee) obj;
        return this.deptno == other.deptno
                && this.sal == other.sal
                && this.empno == other.empno;
    }

    /**
     * Hash derived only from field values, so equal keys produce the same
     * hash in every JVM (unlike the identity-based default).
     */
    @Override
    public int hashCode() {
        int result = deptno;
        result = 31 * result + sal;
        result = 31 * result + empno;
        return result;
    }

    @Override
    public String toString() {
        return "Employee [empno=" + empno + ", name=" + name + ", sal=" + sal + ", deptno=" + deptno + "]";
    }

    /**
     * Restores all fields from {@code input}, reading them in exactly the
     * order in which {@link #write(DataOutput)} emits them.
     *
     * @param input source of the serialized fields
     * @throws IOException if reading from {@code input} fails
     */
    @Override
    public void readFields(DataInput input) throws IOException {
        this.empno = input.readInt();
        this.name = input.readUTF();
        this.job = input.readUTF();
        this.mgr = input.readInt();
        this.hiredate = input.readUTF();
        this.sal = input.readInt();
        this.comm = input.readInt();
        this.deptno = input.readInt();
    }

    /**
     * Serializes all fields to {@code output} in the order empno, name, job,
     * mgr, hiredate, sal, comm, deptno.
     *
     * @param output destination for the serialized fields
     * @throws IOException if writing to {@code output} fails
     */
    @Override
    public void write(DataOutput output) throws IOException {
        output.writeInt(this.empno);
        output.writeUTF(this.name);
        output.writeUTF(this.job);
        output.writeInt(this.mgr);
        output.writeUTF(this.hiredate);
        output.writeInt(this.sal);
        output.writeInt(this.comm);
        output.writeInt(this.deptno);
    }

    public int getEmpno() {
        return empno;
    }

    public void setEmpno(int empno) {
        this.empno = empno;
    }

    public String getName() {
        return name;
    }

    public void setName(String name) {
        this.name = name;
    }

    public String getJob() {
        return job;
    }

    public void setJob(String job) {
        this.job = job;
    }

    public int getMgr() {
        return mgr;
    }

    public void setMgr(int mgr) {
        this.mgr = mgr;
    }

    public String getHiredate() {
        return hiredate;
    }

    public void setHiredate(String hiredate) {
        this.hiredate = hiredate;
    }

    public int getSal() {
        return sal;
    }

    public void setSal(int sal) {
        this.sal = sal;
    }

    public int getComm() {
        return comm;
    }

    public void setComm(int comm) {
        this.comm = comm;
    }

    public int getDeptno() {
        return deptno;
    }

    public void setDeptno(int deptno) {
        this.deptno = deptno;
    }
}
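在自定义排序的类型中要注意的是:每个比较分支都必须有返回值(大于返回 1、小于返回 -1),千万不能少写,如果少写了就会导致排序不正确。
如果大于时返回 1 就是升序,如果大于时返回 -1 就是降序。
多列排序的时候,上面的示例在最后一个字段上写了“大于等于”,这样 compareTo 永远不会返回 0。
注意:Comparable 的约定要求两个对象相等时返回 0,并且 a.compareTo(b) 与 b.compareTo(a) 的符号必须相反;“大于等于”的写法违反了这一约定。更稳妥的做法是在最后再比较一个能区分记录的字段(例如 empno),全部相等时返回 0。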
// Driver fragment: configures and runs the job that uses Employee as its key.
// Create the job.
Job job=Job.getInstance(new Configuration());
job.setJarByClass(EmployeeJobMain.class);
// Set the mapper and the key/value types of the map output.
job.setMapperClass(EmployeeMapper.class);
job.setMapOutputKeyClass(Employee.class);
job.setMapOutputValueClass(NullWritable.class);
// Set the key/value types of the job output. No reducer class is set here,
// so only the output types are specified.
job.setOutputKeyClass(Employee.class);
job.setOutputValueClass(NullWritable.class);
// Set the job's input and output paths from the first two arguments.
FileInputFormat.setInputPaths(job, new Path(args[0]));
FileOutputFormat.setOutputPath(job, new Path(args[1]));
// Run the job; the boolean result of waitForCompletion is not checked here.
job.waitForCompletion(true);