1.案例:
1>按照文件中的第一列排序。
2>如果第一列相同,则按照第二列排序。
3>文件部分数据:
4>代码:
Java版:
1)自定义二次排序类:
package cn.spark.study.core;
import java.io.Serializable;
import scala.math.Ordered;
/*
 * Custom composite key for secondary sorting: orders records by `first`,
 * and by `second` when the `first` values are equal.
 *
 * Implements scala.math.Ordered so Spark's sortByKey can compare keys,
 * and Serializable so keys can be shipped across the cluster during the
 * shuffle.
 */
public class SecondarySortKey implements Ordered<SecondarySortKey>, Serializable{
	private static final long serialVersionUID = 1L;
	// The columns that participate in the sort, in priority order.
	private int first;
	private int second;

	public SecondarySortKey(int first, int second) {
		super();
		this.first = first;
		this.second = second;
	}

	// The relational operators required by Ordered all delegate to
	// compareTo, so the five comparison methods cannot drift out of sync.
	@Override
	public boolean $greater(SecondarySortKey other) {
		return this.compareTo(other) > 0;
	}

	@Override
	public boolean $greater$eq(SecondarySortKey other) {
		return this.compareTo(other) >= 0;
	}

	@Override
	public boolean $less(SecondarySortKey other) {
		return this.compareTo(other) < 0;
	}

	@Override
	public boolean $less$eq(SecondarySortKey other) {
		return this.compareTo(other) <= 0;
	}

	@Override
	public int compare(SecondarySortKey other) {
		return this.compareTo(other);
	}

	/*
	 * Compare by `first`, breaking ties with `second`.
	 * Uses Integer.compare instead of subtraction: `a - b` overflows when
	 * the operands have opposite signs (e.g. Integer.MIN_VALUE - 1 wraps
	 * to a positive value), which would produce a wrong ordering.
	 */
	@Override
	public int compareTo(SecondarySortKey other) {
		int cmp = Integer.compare(this.first, other.getFirst());
		if (cmp != 0) {
			return cmp;
		}
		return Integer.compare(this.second, other.getSecond());
	}

	// Getters and setters for the sort columns, plus hashCode/equals so
	// keys behave correctly in hash-based structures.
	public int getFirst() {
		return first;
	}

	public void setFirst(int first) {
		this.first = first;
	}

	public int getSecond() {
		return second;
	}

	public void setSecond(int second) {
		this.second = second;
	}

	@Override
	public int hashCode() {
		final int prime = 31;
		int result = 1;
		result = prime * result + first;
		result = prime * result + second;
		return result;
	}

	@Override
	public boolean equals(Object obj) {
		if (this == obj)
			return true;
		if (obj == null)
			return false;
		if (getClass() != obj.getClass())
			return false;
		SecondarySortKey other = (SecondarySortKey) obj;
		if (first != other.first)
			return false;
		if (second != other.second)
			return false;
		return true;
	}
}
2)案例实现类:
package cn.spark.study.core;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.api.java.function.VoidFunction;
import scala.Tuple2;
/*
 * Secondary sort driver:
 * 1. Define a custom key implementing Ordered and Serializable that
 *    encodes the multi-column ordering.
 * 2. Map each text line to a (custom key, line) pair RDD.
 * 3. Sort with sortByKey, which uses the custom key's ordering.
 * 4. Map again to drop the key and keep only the text lines.
 */
public class SecondarySort {
	public static void main(String[] args) {
		SparkConf conf = new SparkConf()
				.setAppName("SecondarySort")
				.setMaster("local");
		JavaSparkContext sc = new JavaSparkContext(conf);

		// Read the raw text file; each RDD element is one line.
		JavaRDD<String> lines = sc.textFile("G://SparkDevel//test//wordCount//data//sort.txt");

		// Step 2: pair every line with a composite key built from its
		// first two space-separated columns.
		JavaPairRDD<SecondarySortKey, String> keyedLines = lines.mapToPair(
				new PairFunction<String, SecondarySortKey, String>() {
					private static final long serialVersionUID = 1L;
					@Override
					public Tuple2<SecondarySortKey, String> call(String line) throws Exception {
						String[] columns = line.split(" ");
						SecondarySortKey sortKey = new SecondarySortKey(
								Integer.valueOf(columns[0]),
								Integer.valueOf(columns[1]));
						return new Tuple2<SecondarySortKey, String>(sortKey, line);
					}
				});

		// Step 3: sort by the composite key.
		JavaPairRDD<SecondarySortKey, String> orderedPairs = keyedLines.sortByKey();

		// Step 4: discard the key, keeping only the original line.
		JavaRDD<String> orderedLines = orderedPairs.map(
				new Function<Tuple2<SecondarySortKey, String>, String>() {
					private static final long serialVersionUID = 1L;
					@Override
					public String call(Tuple2<SecondarySortKey, String> pair) throws Exception {
						return pair._2;
					}
				});

		// Print each line in its final sorted order.
		orderedLines.foreach(new VoidFunction<String>() {
			private static final long serialVersionUID = 1L;
			@Override
			public void call(String line) throws Exception {
				System.out.println(line);
			}
		});

		sc.close();
	}
}
Scala版:
1)自定义二次排序类:
package cn.spark.study.core
/**
 * Composite key for secondary sorting: orders by `first`, then by
 * `second` when the `first` values tie.
 *
 * Extends [[Ordered]] so Spark's sortByKey can compare keys, and
 * [[Serializable]] so keys survive the shuffle.
 *
 * @param first  primary sort column
 * @param second secondary (tie-breaking) sort column
 */
class SecondSortKey(val first: Int, val second: Int)
    extends Ordered[SecondSortKey] with Serializable {

  /**
   * Compare by `first`, breaking ties with `second`.
   *
   * Uses `Integer.compare` rather than subtraction: `a - b` overflows
   * when the operands have opposite signs (e.g. `Int.MinValue - 1`
   * wraps to a positive value), which would yield a wrong ordering.
   */
  def compare(that: SecondSortKey): Int = {
    val byFirst = Integer.compare(this.first, that.first)
    if (byFirst != 0) byFirst
    else Integer.compare(this.second, that.second)
  }
}
2)案例实现类:
package cn.spark.study.core
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
/**
 * Secondary sort driver (Scala version).
 *
 * Reads a text file, keys each line by a [[SecondSortKey]] built from
 * its first two space-separated columns, sorts by that key, then drops
 * the key and prints the lines in sorted order.
 */
object SecondSort {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf()
      .setAppName("SecondSort")
      .setMaster("local")
    val sc = new SparkContext(conf)
    val lines = sc.textFile("G://SparkDevel//test//wordCount//data//sort.txt", 1)
    // Split each line once instead of twice per record.
    val pairs = lines.map { line =>
      val columns = line.split(" ")
      (new SecondSortKey(columns(0).toInt, columns(1).toInt), line)
    }
    val sortedPairs = pairs.sortByKey()
    val sortedLines = sortedPairs.map(_._2)
    sortedLines.foreach { sortedLine => println(sortedLine) }
    // Release the context's resources — mirrors sc.close() in the Java
    // version, which this driver previously omitted.
    sc.stop()
  }
}
结果: