1.内容
基础排序算法实战
二次排序算法实战
更高级排序算法
排序算法内幕解密
//修改一下log级别
scala> sc.setLogLevel("WARN")
2.二次排序就是排序的时候考虑两个维度:先按维度1排序,维度1相同时,再按维度2排序
例如数据源:
SecondSort.txt
2 3
4 1
3 2
4 3
9 7
2 1
3.实现Ordered(排序规则),Serializable接口的javabean:
/*
SecondSortKey.java
*/
package cn.whbing.spark.SparkApps.cores;
import java.io.Serializable;
import scala.math.Ordered;
/**
 * Composite key for a secondary (two-level) sort.
 *
 * <p>Keys are ordered by {@code first}; ties are broken by {@code second}.
 * The ordering is expressed through Scala's {@code scala.math.Ordered}
 * interface rather than {@code java.lang.Comparable}.
 *
 * <p>Every comparison operator delegates to {@link #compare(SecondSortKey)},
 * so the operators cannot disagree with each other or with {@code equals}.
 */
public class SecondSortKey implements Ordered<SecondSortKey>, Serializable {

    private static final long serialVersionUID = 1L;

    /** Primary sort component. */
    private int first;

    /** Secondary sort component, consulted only when {@code first} ties. */
    private int second;

    /**
     * Creates a key from its two components.
     *
     * @param first  primary sort component
     * @param second secondary sort component
     */
    public SecondSortKey(int first, int second) {
        this.first = first;
        this.second = second;
    }

    /** @return the primary sort component */
    public int getFirst() {
        return first;
    }

    /** @param first the new primary sort component */
    public void setFirst(int first) {
        this.first = first;
    }

    /** @return the secondary sort component */
    public int getSecond() {
        return second;
    }

    /** @param second the new secondary sort component */
    public void setSecond(int second) {
        this.second = second;
    }

    /**
     * Compares this key with {@code other}, first by {@code first} and then
     * by {@code second}.
     *
     * <p>Uses {@link Integer#compare(int, int)} instead of subtraction, which
     * would overflow (and flip the sign) for operands that are far apart.
     *
     * @param other the key to compare against
     * @return a negative value, zero, or a positive value when this key is
     *         less than, equal to, or greater than {@code other}
     */
    @Override
    public int compare(SecondSortKey other) {
        int byFirst = Integer.compare(this.first, other.getFirst());
        if (byFirst != 0) {
            return byFirst;
        }
        return Integer.compare(this.second, other.getSecond());
    }

    /**
     * Same ordering as {@link #compare(SecondSortKey)}.
     *
     * @param other the key to compare against
     * @return the result of {@code compare(other)}
     */
    @Override
    public int compareTo(SecondSortKey other) {
        return compare(other);
    }

    /**
     * Implements the {@code >} operator.
     *
     * @param other the key to compare against
     * @return {@code true} when this key sorts strictly after {@code other}
     */
    @Override
    public boolean $greater(SecondSortKey other) {
        return compare(other) > 0;
    }

    /**
     * Implements the {@code >=} operator.
     *
     * @param other the key to compare against
     * @return {@code true} when this key sorts after or equal to {@code other}
     */
    @Override
    public boolean $greater$eq(SecondSortKey other) {
        return compare(other) >= 0;
    }

    /**
     * Implements the {@code <} operator.
     *
     * @param other the key to compare against
     * @return {@code true} when this key sorts strictly before {@code other}
     */
    @Override
    public boolean $less(SecondSortKey other) {
        return compare(other) < 0;
    }

    /**
     * Implements the {@code <=} operator.
     *
     * <p>The previous implementation compared {@code this.first} with
     * {@code other.getSecond()} in its equality branch, so it gave wrong
     * answers for equal keys; delegating to {@code compare} removes that.
     *
     * @param other the key to compare against
     * @return {@code true} when this key sorts before or equal to {@code other}
     */
    @Override
    public boolean $less$eq(SecondSortKey other) {
        return compare(other) <= 0;
    }

    @Override
    public int hashCode() {
        final int prime = 31;
        int result = 1;
        result = prime * result + first;
        result = prime * result + second;
        return result;
    }

    /**
     * Two keys are equal when both components match, which is exactly when
     * {@link #compare(SecondSortKey)} returns zero.
     */
    @Override
    public boolean equals(Object obj) {
        if (this == obj) {
            return true;
        }
        if (obj == null || getClass() != obj.getClass()) {
            return false;
        }
        SecondSortKey other = (SecondSortKey) obj;
        return first == other.first && second == other.second;
    }
}
/*SecondSortApp.java*/
package cn.whbing.spark.SparkApps.cores;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.api.java.function.VoidFunction;
import scala.Tuple2;
/**
 * Secondary sort of a text file with Spark.
 *
 * <ol>
 *   <li>Define a custom key ({@code SecondSortKey}) that implements
 *       {@code Ordered} and {@code Serializable}.</li>
 *   <li>Load the file into a {@code <key, value>} pair RDD, using the custom
 *       key as the key and the original line as the value.</li>
 *   <li>Sort with {@code sortByKey}, which orders by the custom key.</li>
 *   <li>Drop the key and keep only the sorted lines.</li>
 * </ol>
 */
public class SecondSortApp {

    /**
     * Runs the secondary sort on a hard-coded local input file and prints the
     * sorted lines to standard output.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        SparkConf conf = new SparkConf();
        conf.setAppName("SecondSort").setMaster("local");
        JavaSparkContext sc = new JavaSparkContext(conf);
        try {
            sc.setLogLevel("WARN");

            JavaRDD<String> lines = sc.textFile("D://javaTools//EclipseWork2//SparkApps//SecondSort.txt");

            // Type parameters: input line, output key (SecondSortKey), output value (the line itself).
            JavaPairRDD<SecondSortKey, String> pairs = lines.mapToPair(
                    new PairFunction<String, SecondSortKey, String>() {
                        private static final long serialVersionUID = 1L;

                        @Override
                        public Tuple2<SecondSortKey, String> call(String line) throws Exception {
                            String[] fields = line.split(" ");
                            SecondSortKey key = new SecondSortKey(
                                    Integer.parseInt(fields[0]), Integer.parseInt(fields[1]));
                            // Parameterized explicitly; the raw Tuple2 caused an unchecked conversion.
                            return new Tuple2<SecondSortKey, String>(key, line);
                        }
                    });

            // The secondary sort itself: ordering comes from SecondSortKey.
            JavaPairRDD<SecondSortKey, String> sorted = pairs.sortByKey();

            // Discard the sort key and keep the original line.
            JavaRDD<String> secondSorted = sorted.map(
                    new Function<Tuple2<SecondSortKey, String>, String>() {
                        private static final long serialVersionUID = 1L;

                        @Override
                        public String call(Tuple2<SecondSortKey, String> sortedContent) throws Exception {
                            return sortedContent._2;
                        }
                    });

            secondSorted.foreach(new VoidFunction<String>() {
                private static final long serialVersionUID = 1L;

                @Override
                public void call(String sortedLine) throws Exception {
                    System.out.println(sortedLine);
                }
            });
        } finally {
            // Stop the context even when the job fails; it was previously never stopped.
            sc.stop();
        }
    }
}
结果:
2 1
2 3
3 2
4 1
4 3
9 7
4.小结:
对于待排序的每一行原数据,我们将其封装成实现了Ordered接口的javabean,并将该javabean作为key;
原来的行作为value;
对上述(k,v)进行sortByKey操作即进行自定义的排序操作;
最后去掉自己定义key即可。