java的代码:
自定义key
package com.netcloud.spark.sparkcore.projectpractice;
import scala.math.Ordered;
import java.io.Serializable;
import java.util.Objects;
/**
 * Custom composite key for Spark secondary sort (Java version).
 *
 * Orders records by {@code first} and breaks ties with {@code second}.
 * 1) Implements scala.math.Ordered so Spark's sortByKey can compare keys,
 *    and Serializable so keys can be shipped across the cluster.
 * @author yangshaojun
 * #date 2019/3/14 20:54
 * @version 1.0
 */
public class SecondarySortKey implements Ordered<SecondarySortKey>, Serializable {
    // The columns that participate in the sort, in priority order.
    private int first;
    private int second;

    public SecondarySortKey(int first, int second) {
        this.first = first;
        this.second = second;
    }

    // Greater-than: delegate to compareTo so every ordering operator
    // shares one definition instead of duplicating the comparison logic.
    @Override
    public boolean $greater(SecondarySortKey other) {
        return this.compareTo(other) > 0;
    }

    // Greater-than-or-equal.
    @Override
    public boolean $greater$eq(SecondarySortKey other) {
        return this.compareTo(other) >= 0;
    }

    // Less-than.
    @Override
    public boolean $less(SecondarySortKey other) {
        return this.compareTo(other) < 0;
    }

    // Less-than-or-equal.
    @Override
    public boolean $less$eq(SecondarySortKey other) {
        return this.compareTo(other) <= 0;
    }

    // Ordered.compare: same contract as compareTo.
    @Override
    public int compare(SecondarySortKey other) {
        return this.compareTo(other);
    }

    /**
     * Primary ordering on {@code first}, tie-broken by {@code second}.
     * Uses Integer.compare instead of subtraction: the original
     * {@code this.first - other.first} overflows for large-magnitude
     * operands (e.g. Integer.MIN_VALUE minus a positive value wraps
     * positive), which would corrupt the sort order.
     */
    @Override
    public int compareTo(SecondarySortKey other) {
        int byFirst = Integer.compare(this.first, other.first);
        return byFirst != 0 ? byFirst : Integer.compare(this.second, other.second);
    }

    // Getters and setters for the sort columns, plus equals/hashCode so the
    // key behaves correctly in shuffles and hash-based collections.
    public int getFirst() {
        return first;
    }

    public int getSecond() {
        return second;
    }

    public void setFirst(int first) {
        this.first = first;
    }

    public void setSecond(int second) {
        this.second = second;
    }

    @Override
    public boolean equals(Object o) {
        if (this == o) return true;
        if (o == null || getClass() != o.getClass()) return false;
        SecondarySortKey that = (SecondarySortKey) o;
        return first == that.first &&
                second == that.second;
    }

    @Override
    public int hashCode() {
        return Objects.hash(first, second);
    }
}
核心代码:
package com.netcloud.spark.sparkcore.projectpractice;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.api.java.function.VoidFunction;
import scala.Tuple2;
/**
 * Demo_001_SparkSecondarySort
 * Spark secondary sort, Java version.
 * 1) Define a custom key implementing Ordered and Serializable, encoding the
 *    multi-column ordering inside the key.
 * 2) Map the text RDD to a pair RDD keyed by the custom key, valued by the line.
 * 3) Sort with sortByKey using the custom key's ordering.
 * 4) Map again to drop the key and keep only the text lines.
 * @author yangshaojun
 * #date 2019/3/14 20:53
 * @version 1.0
 */
public class Demo_001_SparkSecondarySort {
    public static void main(String[] args) {
        SparkConf conf = new SparkConf();
        conf.setMaster("local").setAppName("Demo_001_SparkSecondarySort");
        JavaSparkContext sc = new JavaSparkContext(conf);
        // try/finally guarantees the context is stopped and its resources
        // released even if the job throws (original leaked the context).
        try {
            JavaRDD<String> lineRDD = sc.textFile("data/sparkcore/secondarysort.txt");
            // Convert JavaRDD<String> to JavaPairRDD<SecondarySortKey, String>,
            // where the key is our custom secondary-sort key.
            JavaPairRDD<SecondarySortKey, String> pairRDD = lineRDD.mapToPair(new PairFunction<String, SecondarySortKey, String>() {
                @Override
                public Tuple2<SecondarySortKey, String> call(String line) throws Exception {
                    String[] lineSplited = line.split(" ");
                    // parseInt avoids the needless Integer boxing of valueOf;
                    // NOTE(review): assumes each line has two space-separated ints.
                    SecondarySortKey key = new SecondarySortKey(
                            Integer.parseInt(lineSplited[0]),
                            Integer.parseInt(lineSplited[1]));
                    return new Tuple2<SecondarySortKey, String>(key, line);
                }
            });
            // Sort by the custom key's ordering.
            JavaPairRDD<SecondarySortKey, String> sortByKey = pairRDD.sortByKey();
            // Drop the key, keeping only the original text line.
            JavaRDD<String> retRDD = sortByKey.map(new Function<Tuple2<SecondarySortKey, String>, String>() {
                @Override
                public String call(Tuple2<SecondarySortKey, String> v1) throws Exception {
                    return v1._2;
                }
            });
            retRDD.foreach(new VoidFunction<String>() {
                @Override
                public void call(String s) throws Exception {
                    System.out.println(s);
                }
            });
        } finally {
            sc.stop();
        }
    }
}
scala代码:
自定义key:
package com.netcloud.bigdata.spark_core.basiclearning.projectpractice
/**
 * Custom composite key for secondary sort: orders by `first`,
 * breaking ties with `second`.
 * @author yangshaojun
 * #date 2019/3/14 22:17
 * @version 1.0
 */
case class SecondarySortKey(first: Int, second: Int) extends Ordered[SecondarySortKey] with Serializable {
  /**
   * Compare on `first`, then on `second`.
   * Uses Integer.compare rather than subtraction: the original
   * `this.first - that.first` overflows for large-magnitude operands
   * (e.g. Int.MinValue minus a positive value wraps positive),
   * which would corrupt the sort order.
   */
  override def compare(that: SecondarySortKey): Int = {
    val byFirst = Integer.compare(this.first, that.first)
    if (byFirst != 0) byFirst else Integer.compare(this.second, that.second)
  }
}
核心代码:
package com.netcloud.bigdata.spark_core.basiclearning.projectpractice
import org.apache.spark.{SparkConf, SparkContext}
/**
 * Spark secondary sort, Scala version.
 * 1) Define a custom key extending Ordered and Serializable, encoding the
 *    multi-column ordering inside the key.
 * 2) Map the text RDD to a pair RDD keyed by the custom key, valued by the line.
 * 3) Sort with sortByKey using the custom key's ordering.
 * 4) Map again to drop the key and keep only the text lines.
 * @author yangshaojun
 * #date 2019/3/14 8:58
 * @version 1.0
 */
object Demo_001_SparkSecondarySort {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setMaster("local").setAppName("Demo_001_SparkSecondarySort")
    val sc = new SparkContext(conf)
    // try/finally guarantees the context is stopped and its resources
    // released even if the job throws (original leaked the context).
    try {
      val lineRDD = sc.textFile("data/sparkcore/secondarysort.txt")
      // Split each line once and reuse the result (original called
      // line.split(" ") twice per record).
      // NOTE(review): assumes each line has two space-separated ints.
      val kvRDD = lineRDD.map { line =>
        val fields = line.split(" ")
        (SecondarySortKey(fields(0).toInt, fields(1).toInt), line)
      }
      val sortRDD = kvRDD.sortByKey()
      // Drop the key, keeping only the original text line.
      val retRDD = sortRDD.map(_._2)
      retRDD.foreach(println)
    } finally {
      sc.stop()
    }
  }
}