需求
- 按照文件中每行的第一列升序排序。
- 如果第一列相同，则按照第二列升序排序。
文件内容
2 5
3 6
2 4
1 3
1 5
Java实现
自定义的二次排序key
/**
 * Custom key for secondary sorting.
 *
 * <p>Keys are ordered by {@code first}; ties are broken by {@code second}.
 * All relational methods and {@link #compareTo} delegate to {@link #compare}
 * so the ordering is defined in exactly one place.
 */
public class SecondarySortKey implements Ordered<SecondarySortKey>,Serializable {
    // The columns that take part in the ordering.
    private int first;
    private int second;

    public SecondarySortKey(int first, int second) {
        this.first = first;
        this.second = second;
    }

    /**
     * Compares this key with {@code that}, first column first, then second column.
     *
     * @param that the key to compare against
     * @return a negative value, zero, or a positive value when this key is
     *         less than, equal to, or greater than {@code that}
     */
    @Override
    public int compare(SecondarySortKey that) {
        // Integer.compare rather than subtraction: "a - b" overflows when the
        // operands are far apart (e.g. Integer.MIN_VALUE - 1) and flips the sign.
        int byFirst = Integer.compare(this.first, that.first);
        if (byFirst != 0) {
            return byFirst;
        }
        return Integer.compare(this.second, that.second);
    }

    /** @return {@code true} when this key sorts strictly before {@code that} */
    @Override
    public boolean $less(SecondarySortKey that) {
        return compare(that) < 0;
    }

    /** @return {@code true} when this key sorts strictly after {@code that} */
    @Override
    public boolean $greater(SecondarySortKey that) {
        return compare(that) > 0;
    }

    /** @return {@code true} when this key sorts before or equal to {@code that} */
    @Override
    public boolean $less$eq(SecondarySortKey that) {
        return compare(that) <= 0;
    }

    /** @return {@code true} when this key sorts after or equal to {@code that} */
    @Override
    public boolean $greater$eq(SecondarySortKey that) {
        return compare(that) >= 0;
    }

    /** Same ordering as {@link #compare}. */
    @Override
    public int compareTo(SecondarySortKey that) {
        return compare(that);
    }

    // Getters and setters for the sort columns, plus equals and hashCode.
    public int getFirst() {
        return first;
    }

    public void setFirst(int first) {
        this.first = first;
    }

    public int getSecond() {
        return second;
    }

    public void setSecond(int second) {
        this.second = second;
    }

    /** Two keys are equal when they are of the same class and both columns match. */
    @Override
    public boolean equals(Object o) {
        if (this == o) {
            return true;
        }
        if (o == null || getClass() != o.getClass()) {
            return false;
        }
        SecondarySortKey that = (SecondarySortKey) o;
        return first == that.first &&
                second == that.second;
    }

    /** Hash over both columns, consistent with {@link #equals}. */
    @Override
    public int hashCode() {
        return Objects.hash(first, second);
    }
}
二次排序
/**
 * Secondary sort driver.
 *
 * <ol>
 *   <li>Define a custom key that implements Ordered and Serializable and
 *       encodes the multi-column ordering.</li>
 *   <li>Map the RDD of text lines to a JavaPairRDD whose key is the custom key
 *       and whose value is the original line.</li>
 *   <li>Sort with sortByKey, which uses the custom key's ordering.</li>
 *   <li>Map again to drop the key and keep only the text line.</li>
 * </ol>
 */
public class SecondarySort {
    public static void main(String[] args) {
        SparkConf sparkConf = new SparkConf().setAppName("SecondarySortJava").setMaster("local");
        // try-with-resources: the context is closed even when the job throws,
        // instead of only on the success path.
        try (JavaSparkContext sparkContext = new JavaSparkContext(sparkConf)) {
            JavaRDD<String> numsRDD = sparkContext.textFile("E:\\testdata\\wordcount\\input\\sort.txt");

            // Key each line by its first two space-separated integer columns.
            JavaPairRDD<SecondarySortKey, String> pairs = numsRDD.mapToPair(new PairFunction<String, SecondarySortKey, String>() {
                @Override
                public Tuple2<SecondarySortKey, String> call(String s) throws Exception {
                    // Split once and reuse the columns instead of splitting per column.
                    String[] columns = s.split(" ");
                    SecondarySortKey key = new SecondarySortKey(
                            Integer.parseInt(columns[0]),
                            Integer.parseInt(columns[1]));
                    return new Tuple2<>(key, s);
                }
            });

            JavaPairRDD<SecondarySortKey, String> sortedPairs = pairs.sortByKey();

            // Drop the key; only the original line is needed for output.
            JavaRDD<String> result = sortedPairs.map(new Function<Tuple2<SecondarySortKey, String>, String>() {
                @Override
                public String call(Tuple2<SecondarySortKey, String> keyAndLine) throws Exception {
                    return keyAndLine._2;
                }
            });

            result.foreach(new VoidFunction<String>() {
                @Override
                public void call(String s) throws Exception {
                    System.out.println("s = " + s);
                }
            });
        }
    }
}
Scala实现
SecondarySortKey
/** Custom key for secondary sorting: ordered by `first`, ties broken by `second`.
  *
  * @param first  primary sort column
  * @param second secondary sort column, consulted only when `first` is equal
  */
class SecondarySortKey(val first:Int, val second:Int) extends Ordered[SecondarySortKey] with Serializable {
  /** Compares this key with `that`.
    *
    * @return a negative value, zero, or a positive value when this key is
    *         less than, equal to, or greater than `that`
    */
  override def compare(that: SecondarySortKey): Int = {
    // Integer.compare rather than subtraction: "a - b" overflows when the
    // operands are far apart (e.g. Int.MinValue - 1) and flips the sign.
    val byFirst = Integer.compare(this.first, that.first)
    if (byFirst != 0) byFirst
    else Integer.compare(this.second, that.second)
  }
}
SecondarySort
/** Secondary sort driver: reads the input file, keys every line by a
  * [[SecondarySortKey]] built from its first two space-separated integer
  * columns, sorts by that key, and prints the original lines.
  */
object SecondarySort {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("SecondarySortScala").setMaster("local")
    val sparkContext = new SparkContext(conf)
    // Stop the context even when the job throws, so it is never left running.
    try {
      val linesRDD = sparkContext.textFile("E:\\testdata\\wordcount\\input\\sort.txt")
      val keyLineRDD = linesRDD.map { line =>
        // Split once and reuse the columns instead of splitting per column.
        val columns = line.split(" ")
        (new SecondarySortKey(columns(0).toInt, columns(1).toInt), line)
      }
      val sortedKeyLine = keyLineRDD.sortByKey()
      // Drop the key; only the original line is needed for output.
      val result = sortedKeyLine.map(keyLine => keyLine._2)
      result.foreach(line => println(line))
    } finally {
      sparkContext.stop()
    }
  }
}