当我们在使用MapReduce自定义对象时会遇到需要对其进行排序的问题,这时我们就可以通过实现MapReduce的WritableComparable接口实现自定义排序的功能。
这里举一个对员工对象排序的例子。
1. 首先我们创建一个员工对象:
// Employee record. Each field holds one column of an input line such as
// 7369,SMITH,CLERK,7902,1980/12/17,800,0,20
public class Emp implements Writable {
private int empno; // column 0, e.g. 7369
private String ename; // column 1, e.g. SMITH
private String job; // column 2, e.g. CLERK
private int mgr; // column 3, e.g. 7902 (presumably the manager's empno -- confirm)
private String hiredata; // column 4, e.g. 1980/12/17; kept as raw text
private int sal; // column 5, e.g. 800
private int comm; // column 6, e.g. 0
private int deptno; // column 7, e.g. 20
}
2. 生成get和set方法(这里省略get和set方法的代码)。
3. 重写toString方法,方便输出:
@Override
public String toString() {
    // Render only the fields that matter for the sorted report.
    StringBuilder text = new StringBuilder();
    text.append("Emp [empno=").append(empno);
    text.append(",ename=").append(ename);
    text.append(",sal=").append(sal);
    text.append(",deptno=").append(deptno);
    text.append("]");
    return text.toString();
}
4. 然后实现MapReduce的序列化接口。注意这里我们实现的不是Writable接口,而是它的子接口WritableComparable(即把类声明改为 public class Emp implements WritableComparable<Emp>),并实现其中的compareTo方法:
// The order of reads here must be the same as the order of writes in write().
@Override
public void readFields(DataInput input) throws IOException {
// Deserialization: populate this object's fields from the input stream.
this.empno = input.readInt();
this.ename = input.readUTF();
this.job = input.readUTF();
this.mgr = input.readInt();
this.hiredata = input.readUTF();
this.sal = input.readInt();
this.comm = input.readInt();
this.deptno = input.readInt();
}
@Override
public void write(DataOutput output) throws IOException {
// Serialization: emit this object's fields to the output stream.
// The field order must stay the same as the read order in readFields().
output.writeInt(empno);
output.writeUTF(ename);
output.writeUTF(job);
output.writeInt(mgr);
output.writeUTF(hiredata);
output.writeInt(sal);
output.writeInt(comm);
output.writeInt(deptno);
}
如果是单条件排序使用:
@Override
public int compareTo(Emp o) {
    // Custom ordering: ascending by salary.
    // Integer.compare returns 0 for equal salaries, so a.compareTo(b) and
    // b.compareTo(a) always have opposite signs, as Comparable requires.
    return Integer.compare(this.sal, o.sal);
}
如果是多条件排序使用:
@Override
public int compareTo(Emp o) {
    // Custom ordering: department number first, then salary, both ascending.
    int byDept = Integer.compare(this.deptno, o.getDeptno());
    if (byDept != 0) {
        // Different departments: the department number alone decides.
        return byDept;
    }
    // Same department: fall back to salary. Integer.compare returns 0 when the
    // salaries are equal too, which keeps the ordering antisymmetric as the
    // Comparable contract requires.
    return Integer.compare(this.sal, o.sal);
}
5. 完成Mapper和Main函数(这里就不用reducer了)
6. 查看输出:
首先将程序打包后上传到Hadoop机器上
使用 hadoop jar sort2.jar /scott/emp.csv /output/0304/sort2 执行(第一个参数是输入路径,第二个参数是输出路径;输出路径不能提前存在,否则任务会报错)
查看结果:
hdfs dfs -cat /output/0304/sort2/part-r-00000
(结果我用的是两个条件排序,先按照部门号排序再按照薪水排序)
Emp [empno=7934,ename=MILLER,sal=1300,deptno=10]
Emp [empno=7782,ename=CLARK,sal=2450,deptno=10]
Emp [empno=7839,ename=KING,sal=5000,deptno=10]
Emp [empno=7369,ename=SMITH,sal=800,deptno=20]
Emp [empno=7876,ename=ADAMS,sal=1100,deptno=20]
Emp [empno=7566,ename=JONES,sal=2975,deptno=20]
Emp [empno=7902,ename=FORD,sal=3000,deptno=20]
Emp [empno=7788,ename=SCOTT,sal=3000,deptno=20]
Emp [empno=7900,ename=JAMES,sal=950,deptno=30]
Emp [empno=7521,ename=WARD,sal=1250,deptno=30]
Emp [empno=7654,ename=MARTIN,sal=1250,deptno=30]
Emp [empno=7844,ename=TURNER,sal=1500,deptno=30]
Emp [empno=7499,ename=ALLEN,sal=1600,deptno=30]
Emp [empno=7698,ename=BLAKE,sal=2850,deptno=30]
完整项目和表文件我放到了github上:https://github.com/peiyixi/MapReduceSort
完整代码:
Emp类:
package sort.mapreduce;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import org.apache.hadoop.io.WritableComparable;
/**
 * An employee record used as a sortable MapReduce key.
 *
 * <p>Sample input line: {@code 7369,SMITH,CLERK,7902,1980/12/17,800,0,20}
 *
 * <p>The natural ordering is department number first, then salary, both
 * ascending. To sort by salary only, {@link #compareTo(Emp)} can instead
 * return {@code Integer.compare(this.sal, o.sal)}.
 */
public class Emp implements WritableComparable<Emp> {
    private int empno;
    private String ename;
    private String job;
    private int mgr;
    private String hiredata;
    private int sal;
    private int comm;
    private int deptno;

    /**
     * Orders employees by department number, then by salary.
     *
     * @param o the employee to compare against
     * @return a negative value, zero, or a positive value when this employee
     *         sorts before, equal to, or after {@code o}
     */
    @Override
    public int compareTo(Emp o) {
        int byDept = Integer.compare(this.deptno, o.deptno);
        if (byDept != 0) {
            // Different departments: the department number alone decides.
            return byDept;
        }
        // Same department: fall back to salary. Returning 0 on a full tie keeps
        // a.compareTo(b) and b.compareTo(a) sign-symmetric, as Comparable requires.
        return Integer.compare(this.sal, o.sal);
    }

    /**
     * Deserializes this employee from the stream.
     *
     * <p>The read order must be the same as the write order in
     * {@link #write(DataOutput)}.
     *
     * @param input the stream to read the fields from
     * @throws IOException if reading from the stream fails
     */
    @Override
    public void readFields(DataInput input) throws IOException {
        this.empno = input.readInt();
        this.ename = input.readUTF();
        this.job = input.readUTF();
        this.mgr = input.readInt();
        this.hiredata = input.readUTF();
        this.sal = input.readInt();
        this.comm = input.readInt();
        this.deptno = input.readInt();
    }

    /**
     * Serializes this employee to the stream.
     *
     * <p>The write order must be the same as the read order in
     * {@link #readFields(DataInput)}.
     *
     * @param output the stream to write the fields to
     * @throws IOException if writing to the stream fails
     */
    @Override
    public void write(DataOutput output) throws IOException {
        output.writeInt(empno);
        output.writeUTF(ename);
        output.writeUTF(job);
        output.writeInt(mgr);
        output.writeUTF(hiredata);
        output.writeInt(sal);
        output.writeInt(comm);
        output.writeInt(deptno);
    }

    /** Returns a one-line summary containing empno, ename, sal and deptno. */
    @Override
    public String toString() {
        return "Emp [empno=" + empno + ",ename=" + ename + ",sal=" + sal + ",deptno=" + deptno + "]";
    }

    public String getEname() {
        return ename;
    }

    public void setEname(String ename) {
        this.ename = ename;
    }

    public String getJob() {
        return job;
    }

    public void setJob(String job) {
        this.job = job;
    }

    public int getMgr() {
        return mgr;
    }

    public void setMgr(int mgr) {
        this.mgr = mgr;
    }

    public String getHiredata() {
        return hiredata;
    }

    public void setHiredata(String hiredata) {
        this.hiredata = hiredata;
    }

    public int getSal() {
        return sal;
    }

    public void setSal(int sal) {
        this.sal = sal;
    }

    public int getDeptno() {
        return deptno;
    }

    public void setDeptno(int deptno) {
        this.deptno = deptno;
    }

    public int getEmpno() {
        return empno;
    }

    public void setEmpno(int empno) {
        this.empno = empno;
    }

    public int getComm() {
        return comm;
    }

    public void setComm(int comm) {
        this.comm = comm;
    }
}
Mapper类:
package sort.mapreduce;
import java.io.IOException;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
/**
 * Parses each CSV line of the employee file into an {@link Emp} and emits it
 * as the map-output key.
 *
 * <p>The employee is the key (k2) rather than the value because only k2 is
 * sorted; the value is an empty placeholder.
 */
public class EmpSortMapper extends Mapper<LongWritable, Text, Emp, NullWritable> {

    /** Number of comma-separated columns an employee line must provide. */
    private static final int FIELD_COUNT = 8;

    /**
     * Converts one input line into an employee key.
     *
     * @param k1 the input key supplied by the input format
     * @param v1 one line of text, e.g. {@code 7369,SMITH,CLERK,7902,1980/12/17,800,0,20}
     * @param context the task context the employee is written to
     * @throws IOException if the line has fewer than eight fields or the write fails
     * @throws InterruptedException if the write is interrupted
     */
    @Override
    protected void map(LongWritable k1, Text v1, Context context)
            throws IOException, InterruptedException {
        String data = v1.toString();
        if (data.trim().isEmpty()) {
            // A blank line carries no record; skipping it avoids a
            // NumberFormatException from parsing an empty string.
            return;
        }

        String[] words = data.split(",");
        if (words.length < FIELD_COUNT) {
            // Fail with the offending line instead of a bare index error.
            throw new IOException("Expected at least " + FIELD_COUNT + " fields but found "
                    + words.length + " (input key " + k1 + "): " + data);
        }

        // Build the employee from the columns in file order.
        Emp emp = new Emp();
        emp.setEmpno(Integer.parseInt(words[0]));
        emp.setEname(words[1]);
        emp.setJob(words[2]);
        emp.setMgr(Integer.parseInt(words[3]));
        emp.setHiredata(words[4]);
        emp.setSal(Integer.parseInt(words[5]));
        emp.setComm(Integer.parseInt(words[6]));
        emp.setDeptno(Integer.parseInt(words[7]));

        // Emit the employee as the key with an empty value.
        context.write(emp, NullWritable.get());
    }
}
Main方法:
package sort.mapreduce;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
/**
 * Driver that configures and runs the employee sort job.
 *
 * <p>Usage: {@code EmpSortMain <input path> <output path>}
 */
public class EmpSortMain {

    /**
     * Configures the job, runs it, and exits with 0 on success or 1 on failure.
     *
     * @param args {@code args[0]} is the input path, {@code args[1]} the output path
     * @throws Exception if the job cannot be configured or submitted
     */
    public static void main(String[] args) throws Exception {
        if (args.length < 2) {
            // Report the missing arguments instead of failing on args[0] / args[1].
            System.err.println("Usage: EmpSortMain <input path> <output path>");
            System.exit(2);
        }

        // 1. Create the job and name the class that locates the jar.
        Job job = Job.getInstance(new Configuration());
        job.setJarByClass(EmpSortMain.class);

        // 2. Mapper and its output types.
        job.setMapperClass(EmpSortMapper.class);
        job.setMapOutputKeyClass(Emp.class);
        job.setMapOutputValueClass(NullWritable.class);

        // 3. No custom sort comparator or reducer class is configured here;
        //    only the final output types are declared.
        job.setOutputKeyClass(Emp.class);
        job.setOutputValueClass(NullWritable.class);

        // 4. Input and output locations.
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        // 5. Run the job and pass its result on as the process exit status,
        //    so a failed job is visible to the calling shell or scheduler.
        boolean succeeded = job.waitForCompletion(true);
        System.exit(succeeded ? 0 : 1);
    }
}