--------------------------------------------------------------------------------------------------------------------
2. Sorting:
(*) Sorting in Java: implement the Comparable interface
(*) Sorting in MapReduce: always by the Map output key (i.e., by key2)
    (1) Primitive data types:
        If a Reducer is used during the sort, duplicate records are removed (equal keys are grouped).
        Numbers: ascending by default; to change the order, create a custom comparator
        Strings: dictionary order by default; to change the order, create a custom comparator
                 (a Text-based sketch is shown after MyNumberComparator below)
        Summary: (1) put the data to sort into key2
                 (2) value2 carries no data: NullWritable
                 (3) no Reducer is required; if a Reducer is used, duplicate records are removed
    (2) Sorting objects: data from the employee (emp) table
        (*) Review: sorting in SQL
            order by is followed by a column name, an expression, an alias, or a column position
            Multi-column sorting: sort by the first column; when values tie, sort by the second column, and so on
            desc applies only to the column closest to it
            (e.g. order by deptno, sal desc sorts deptno ascending and sal descending)
            select * from emp order by deptno desc,sal desc;
            (A trickier point) The sorted result set is not the same table as the original table
        (*) Sorting in MapReduce: sort Employee objects -----> implement WritableComparable
            Requirements: (1) serializable  (2) sortable  (a minimal skeleton follows below)
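A minimal sketch of how these two requirements map to code (MyKey is a placeholder name used only here, and the package is hypothetical; the full Employee example appears further below): WritableComparable combines Writable, whose write/readFields methods handle serialization, with Comparable, whose compareTo method defines the sort order of key2.

package demo.sort.mr.sketch;   // hypothetical package, used only for this sketch
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import org.apache.hadoop.io.WritableComparable;
public class MyKey implements WritableComparable<MyKey> {
private int value;
// requirement (1): serialization
@Override
public void write(DataOutput out) throws IOException {
out.writeInt(this.value);
}
@Override
public void readFields(DataInput in) throws IOException {
this.value = in.readInt();
}
// requirement (2): sortable - defines the order of key2 (ascending here)
@Override
public int compareTo(MyKey o) {
return Integer.compare(this.value, o.value);
}
}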
==================================================================
Sorting in Java: implement the Comparable interface
package demo.sort.java;
public class Student implements Comparable<Student>{
private int stuID;
private String stuName;
private int age;
@Override
public String toString() {
return "["+this.stuID+"\t"+this.stuName+"\t"+this.age+"]";
}
@Override
public int compareTo(Student o) {
// sort by age (ascending); Integer.compare keeps the ordering consistent when ages are equal
return Integer.compare(this.age, o.getAge());
}
public int getStuID() {
return stuID;
}
public void setStuID(int stuID) {
this.stuID = stuID;
}
public String getStuName() {
return stuName;
}
public void setStuName(String stuName) {
this.stuName = stuName;
}
public int getAge() {
return age;
}
public void setAge(int age) {
this.age = age;
}
}
-----------------------------------------------------------------------------------------------
package demo.sort.java;
import java.util.Arrays;
public class StudentMain {
public static void main(String[] args) {
// create three students
Student s1 = new Student();
s1.setStuID(1);
s1.setStuName("Tom");
s1.setAge(23);
Student s2 = new Student();
s2.setStuID(2);
s2.setStuName("Mary");
s2.setAge(26);
Student s3 = new Student();
s3.setStuID(3);
s3.setStuName("Mike");
s3.setAge(25);
// put the students into an array
Student[] list = {s1,s2,s3};
// sort the array using the Comparable rule
Arrays.sort(list);
// print the sorted result
for(Student s:list){
System.out.println(s);
}
}
}
==================================================================
(*) Sorting in MapReduce: always by the Map output key (i.e., by key2)
(*) Sorting in MapReduce: sort Employee objects -----> implement WritableComparable
    Requirements: (1) serializable  (2) sortable
--------------------------------
Sorting numbers in MapReduce
package demo.sort.mr.number;
import org.apache.hadoop.io.LongWritable;
// a custom comparator that sorts numbers in descending order
public class MyNumberComparator extends LongWritable.Comparator {
@Override
public int compare(byte[] b1, int s1, int l1, byte[] b2, int s2, int l2) {
// our own sort rule: descending (negate the default ascending comparison)
return -super.compare(b1, s1, l1, b2, s2, l2);
}
}
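The same pattern works for strings: this is a sketch of the custom string comparator mentioned in the notes above, for a job whose map output key (key2) is Text. MyTextComparator is a name chosen here and the package is reused only as an assumption; it negates Text's default dictionary-order comparison and would be registered with job.setSortComparatorClass(MyTextComparator.class), just like MyNumberComparator.

package demo.sort.mr.number;   // assumption: reusing this package only for the sketch
import org.apache.hadoop.io.Text;
// a custom comparator that sorts Text (string) keys in descending dictionary order
public class MyTextComparator extends Text.Comparator {
@Override
public int compare(byte[] b1, int s1, int l1, byte[] b2, int s2, int l2) {
// negate the default ascending (dictionary-order) comparison
return -super.compare(b1, s1, l1, b2, s2, l2);
}
}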
-------------------------------------------------------------------------------------------------
package demo.sort.mr.number;
import java.io.IOException;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
public class NumberMapper extends Mapper<LongWritable, Text, LongWritable, NullWritable> {
@Override
protected void map(LongWritable key1, Text value1,Context context)
throws IOException, InterruptedException {
// input value, e.g. "25"
String data = value1.toString();
// emit the number as key2; value2 carries no data
context.write(new LongWritable(Long.parseLong(data)), NullWritable.get());
}
}
-----------------------------------------------------------------------------------------------------
package demo.sort.mr.number;
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
public class NumberMain {
public static void main(String[] args) throws Exception {
// create a job: job = map + reduce
Job job = Job.getInstance(new Configuration());
// set the job's entry point
job.setJarByClass(NumberMain.class);
// set the Mapper and its output types
job.setMapperClass(NumberMapper.class);
job.setMapOutputKeyClass(LongWritable.class);   // the number being sorted (key2)
job.setMapOutputValueClass(NullWritable.class); // value2 carries no data
// plug in our custom sort comparator
job.setSortComparatorClass(MyNumberComparator.class);
// set the job's final output types
job.setOutputKeyClass(LongWritable.class);
job.setOutputValueClass(NullWritable.class);
// set the HDFS input and output paths
FileInputFormat.setInputPaths(job, new Path(args[0]));
FileOutputFormat.setOutputPath(job, new Path(args[1]));
// submit the job and wait for it to finish
job.waitForCompletion(true);
}
}
==============================================================
Sorting objects
package demo.sort.mr.object;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.io.WritableComparable;
// employee class; sample record: 7654,MARTIN,SALESMAN,7698,1981/9/28,1250,1400,30
public class Employee implements WritableComparable<Employee>{
private int empno;       // employee number
private String ename;    // name
private String job;      // job title
private int mgr;         // manager's employee number
private String hiredate; // hire date
private int sal;         // salary
private int comm;        // commission
private int deptno;      // department number
// @Override
// public int compareTo(Employee o) {
// // sort rule: a single column - order by salary (ascending)
// return Integer.compare(this.sal, o.getSal());
// }
@Override
public int compareTo(Employee o) {
// sort rule: two columns - first by department number, then by salary within the same department
int result = Integer.compare(this.deptno, o.getDeptno());
if(result != 0){
return result;
}
// same department number: order by salary
return Integer.compare(this.sal, o.getSal());
}
@Override
public String toString() {
return "["+this.empno+"\t"+this.ename+"\t"+this.sal+"\t"+this.deptno+"]";
}
@Override
public void write(DataOutput output) throws IOException {
// serialization: write the fields out
output.writeInt(this.empno);
output.writeUTF(this.ename);
output.writeUTF(this.job);
output.writeInt(this.mgr);
output.writeUTF(this.hiredate);
output.writeInt(this.sal);
output.writeInt(this.comm);
output.writeInt(this.deptno);
}
@Override
public void readFields(DataInput input) throws IOException {
// deserialization: read the fields back in, in the same order they were written
this.empno = input.readInt();
this.ename = input.readUTF();
this.job = input.readUTF();
this.mgr = input.readInt();
this.hiredate = input.readUTF();
this.sal = input.readInt();
this.comm = input.readInt();
this.deptno = input.readInt();
}
public int getEmpno() {
return empno;
}
public void setEmpno(int empno) {
this.empno = empno;
}
public String getEname() {
return ename;
}
public void setEname(String ename) {
this.ename = ename;
}
public String getJob() {
return job;
}
public void setJob(String job) {
this.job = job;
}
public int getMgr() {
return mgr;
}
public void setMgr(int mgr) {
this.mgr = mgr;
}
public String getHiredate() {
return hiredate;
}
public void setHiredate(String hiredate) {
this.hiredate = hiredate;
}
public int getSal() {
return sal;
}
public void setSal(int sal) {
this.sal = sal;
}
public int getComm() {
return comm;
}
public void setComm(int comm) {
this.comm = comm;
}
public int getDeptno() {
return deptno;
}
public void setDeptno(int deptno) {
this.deptno = deptno;
}
}
--------------------------------------------------------------------------------------------------------
package demo.sort.mr.object;
import java.io.IOException;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
// emit the Employee object as key2 so the framework sorts on it
public class EmployeeSortMapper extends Mapper<LongWritable, Text, Employee, NullWritable> {
@Override
protected void map(LongWritable key1, Text value1,Context context)
throws IOException, InterruptedException {
// input record: 7654,MARTIN,SALESMAN,7698,1981/9/28,1250,1400,30
String data = value1.toString();
// split the CSV record
String[] words = data.split(",");
// build an Employee object
Employee e = new Employee();
// employee number
e.setEmpno(Integer.parseInt(words[0]));
// name
e.setEname(words[1]);
// job title
e.setJob(words[2]);
// manager's employee number
try{
e.setMgr(Integer.parseInt(words[3]));
}catch(Exception ex){
// no manager (field is empty)
e.setMgr(0);
}
// hire date
e.setHiredate(words[4]);
// salary
e.setSal(Integer.parseInt(words[5]));
// commission
try{
e.setComm(Integer.parseInt(words[6]));
}catch(Exception ex){
// no commission
e.setComm(0);
}
// department number
e.setDeptno(Integer.parseInt(words[7]));
// emit: the Employee object must be key2 so it is sorted by its compareTo rule
context.write(e, NullWritable.get());
}
}
-------------------------------------------------------------------------------------------------
package demo.sort.mr.object;
import java.io.IOException;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.Reducer;
// k4 is the employee number, v4 is the Employee object
public class EmployeeSortReducer extends Reducer<Employee, NullWritable, LongWritable, Employee> {
@Override
protected void reduce(Employee k3, Iterable<NullWritable> v3,Context context)
throws IOException, InterruptedException {
// keys that compare as equal are grouped here; write one record per incoming value
// so employees with the same deptno and sal are not collapsed into a single row
for(NullWritable v : v3){
context.write(new LongWritable(k3.getEmpno()), k3);
}
}
}
------------------------------------------------------------------------------------------------------
package demo.sort.mr.object;
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
public class EmployeeSortMain {
public static void main(String[] args) throws Exception {
// create a job: job = map + reduce
Job job = Job.getInstance(new Configuration());
// set the job's entry point
job.setJarByClass(EmployeeSortMain.class);
// set the Mapper and its output types (key2 = Employee, value2 = NullWritable)
job.setMapperClass(EmployeeSortMapper.class);
job.setMapOutputKeyClass(Employee.class);
job.setMapOutputValueClass(NullWritable.class);
// set the Reducer and the job's final output types
job.setReducerClass(EmployeeSortReducer.class);
job.setOutputKeyClass(LongWritable.class);
job.setOutputValueClass(Employee.class);
// set the HDFS input and output paths
FileInputFormat.setInputPaths(job, new Path(args[0]));
FileOutputFormat.setOutputPath(job, new Path(args[1]));
// submit the job and wait for it to finish
job.waitForCompletion(true);
}
}