java的代码:
自定义key
package com.netcloud.spark.sparkcore.projectpractice;
import scala.math.Ordered;
import java.io.Serializable;
import java.util.Objects;
/**
 * Custom composite key for Spark secondary sort (Java version).
 *
 * Orders records by {@code first} and breaks ties with {@code second}.
 * 1) Implements scala.math.Ordered so Spark's sortByKey can compare keys,
 *    and Serializable so keys can be shipped across the cluster.
 * @author yangshaojun
 * #date 2019/3/14 20:54
 * @version 1.0
 */
public class SecondarySortKey implements Ordered<SecondarySortKey>, Serializable {
    // The columns that participate in the sort, in priority order.
    private int first;
    private int second;

    public SecondarySortKey(int first, int second) {
        this.first = first;
        this.second = second;
    }

    // Greater-than: delegate to compareTo so every ordering operator
    // shares one definition instead of duplicating the comparison logic.
    @Override
    public boolean $greater(SecondarySortKey other) {
        return this.compareTo(other) > 0;
    }

    // Greater-than-or-equal.
    @Override
    public boolean $greater$eq(SecondarySortKey other) {
        return this.compareTo(other) >= 0;
    }

    // Less-than.
    @Override
    public boolean $less(SecondarySortKey other) {
        return this.compareTo(other) < 0;
    }

    // Less-than-or-equal.
    @Override
    public boolean $less$eq(SecondarySortKey other) {
        return this.compareTo(other) <= 0;
    }

    // Ordered.compare: same contract as compareTo.
    @Override
    public int compare(SecondarySortKey other) {
        return this.compareTo(other);
    }

    /**
     * Primary ordering on {@code first}, tie-broken by {@code second}.
     * Uses Integer.compare instead of subtraction: the original
     * {@code this.first - other.first} overflows for large-magnitude
     * operands (e.g. Integer.MIN_VALUE minus a positive value wraps
     * positive), which would corrupt the sort order.
     */
    @Override
    public int compareTo(SecondarySortKey other) {
        int byFirst = Integer.compare(this.first, other.first);
        return byFirst != 0 ? byFirst : Integer.compare(this.second, other.second);
    }

    // Getters and setters for the sort columns, plus equals/hashCode so the
    // key behaves correctly in shuffles and hash-based collections.
    public int getFirst() {
        return first;
    }

    public int getSecond() {
        return second;
    }

    public void setFirst(int first) {
        this.first = first;
    }

    public void setSecond(int second) {
        this.second = second;
    }

    @Override
    public boolean equals(Object o) {
        if (this == o) return true;
        if (o == null || getClass() != o.getClass()) return false;
        SecondarySortKey that = (SecondarySortKey) o;
        return first == that.first &&
                second == that.second;
    }

    @Override
    public int hashCode() {
        return Objects.hash(first, second);
    }
}
核心代码:
package com.netcloud.spark.sparkcore.projectpractice;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.api.java.function.VoidFunction;
import scala.Tuple2;
/**
 * Demo_001_SparkSecondarySort
 * Spark secondary sort, Java version.
 * 1) Define a custom key implementing Ordered and Serializable, encoding the
 *    multi-column ordering inside the key.
 * 2) Map the text RDD to a pair RDD keyed by the custom key, valued by the line.
 * 3) Sort with sortByKey using the custom key's ordering.
 * 4) Map again to drop the key and keep only the text lines.
 * @author yangshaojun
 * #date 2019/3/14 20:53
 * @version 1.0
 */
public class Demo_001_SparkSecondarySort {
    public static void main(String[] args) {
        SparkConf conf = new SparkConf();
        conf.setMaster("local").setAppName("Demo_001_SparkSecondarySort");
        JavaSparkContext sc = new JavaSparkContext(conf);
        // try/finally guarantees the context is stopped and its resources
        // released even if the job throws (original leaked the context).
        try {
            JavaRDD<String> lineRDD = sc.textFile("data/sparkcore/secondarysort.txt");
            // Convert JavaRDD<String> to JavaPairRDD<SecondarySortKey, String>,
            // where the key is our custom secondary-sort key.
            JavaPairRDD<SecondarySortKey, String> pairRDD = lineRDD.mapToPair(new PairFunction<String, SecondarySortKey, String>() {
                @Override
                public Tuple2<SecondarySortKey, String> call(String line) throws Exception {
                    String[] lineSplited = line.split(" ");
                    // parseInt avoids the needless Integer boxing of valueOf;
                    // NOTE(review): assumes each line has two space-separated ints.
                    SecondarySortKey key = new SecondarySortKey(
                            Integer.parseInt(lineSplited[0]),
                            Integer.parseInt(lineSplited[1]));
                    return new Tuple2<SecondarySortKey, String>(key, line);
                }
            });
            // Sort by the custom key's ordering.
            JavaPairRDD<SecondarySortKey, String> sortByKey = pairRDD.sortByKey();
            // Drop the key, keeping only the original text line.
            JavaRDD<String> retRDD = sortByKey.map(new Function<Tuple2<SecondarySortKey, String>, String>() {
                @Override
                public String call(Tuple2<SecondarySortKey, String> v1) throws Exception {
                    return v1._2;
                }
            });
            retRDD.foreach(new VoidFunction<String>() {
                @Override
                public void call(String s) throws Exception {
                    System.out.println(s);
                }
            });
        } finally {
            sc.stop();
        }
    }
}
scala代码:
自定义key:
package com.netcloud.bigdata.spark_core.basiclearning.projectpractice
/**
 * Custom composite key for secondary sort: orders by `first`,
 * breaking ties with `second`.
 * @author yangshaojun
 * #date 2019/3/14 22:17
 * @version 1.0
 */
case class SecondarySortKey(first: Int, second: Int) extends Ordered[SecondarySortKey] with Serializable {
  /**
   * Compare on `first`, then on `second`.
   * Uses Integer.compare rather than subtraction: the original
   * `this.first - that.first` overflows for large-magnitude operands
   * (e.g. Int.MinValue minus a positive value wraps positive),
   * which would corrupt the sort order.
   */
  override def compare(that: SecondarySortKey): Int = {
    val byFirst = Integer.compare(this.first, that.first)
    if (byFirst != 0) byFirst else Integer.compare(this.second, that.second)
  }
}
核心代码:
package com.netcloud.bigdata.spark_core.basiclearning.projectpractice
import org.apache.spark.{SparkConf, SparkContext}
/**
 * Spark secondary sort, Scala version.
 * 1) Define a custom key extending Ordered and Serializable, encoding the
 *    multi-column ordering inside the key.
 * 2) Map the text RDD to a pair RDD keyed by the custom key, valued by the line.
 * 3) Sort with sortByKey using the custom key's ordering.
 * 4) Map again to drop the key and keep only the text lines.
 * @author yangshaojun
 * #date 2019/3/14 8:58
 * @version 1.0
 */
object Demo_001_SparkSecondarySort {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setMaster("local").setAppName("Demo_001_SparkSecondarySort")
    val sc = new SparkContext(conf)
    // try/finally guarantees the context is stopped and its resources
    // released even if the job throws (original leaked the context).
    try {
      val lineRDD = sc.textFile("data/sparkcore/secondarysort.txt")
      // Split each line once and reuse the result (original called
      // line.split(" ") twice per record).
      // NOTE(review): assumes each line has two space-separated ints.
      val kvRDD = lineRDD.map { line =>
        val fields = line.split(" ")
        (SecondarySortKey(fields(0).toInt, fields(1).toInt), line)
      }
      val sortRDD = kvRDD.sortByKey()
      // Drop the key, keeping only the original text line.
      val retRDD = sortRDD.map(_._2)
      retRDD.foreach(println)
    } finally {
      sc.stop()
    }
  }
}