Spark实现排序

最新推荐文章于 2022-11-23 02:05:17 发布

曼路

最新推荐文章于 2022-11-23 02:05:17 发布

阅读量1.9k

点赞数

分类专栏： hadoop

本文链接：https://blog.csdn.net/qq_32539825/article/details/82873606

版权

hadoop 专栏收录该内容

24 篇文章 0 订阅

订阅专栏

question：用spark对数据进行排序，首先按照颜值的从高到低进行排序，如果颜值相等，在根据年龄的升序排序

1.User类继承ordered，并且序列化

package cn.edu360.spark.day06

import org.apache.log4j.{Level, Logger}
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}

/**
  * 自定义排序
  * Created by zhangjingcun on 2018/9/27 17:13.
  */
object CustomSort1 {

  def main(args: Array[String]): Unit = {
    Logger.getLogger("org.apache.spark").setLevel(Level.OFF)

    val conf = new SparkConf().setAppName("IPLocation").setMaster("local[*]")
    val sc = new SparkContext(conf)

    //用spark对数据进行排序，首先按照颜值的从高到低进行排序，如果颜值相等，在根据年龄的升序排序
    val users: Array[String] = Array("1,tom,99,34", "2,marry,96,26", "3,mike,98,29", "4,jim,96,30")

    //并行化成RDD
    val userLines: RDD[String] = sc.makeRDD(users)

    //整理数据
    val userRdd: RDD[User1] = userLines.map(line => {
      val fileds = line.split(",")
      val id = fileds(0).toLong
      val name = fileds(1)
      val fv = fileds(2).toInt
      val age = fileds(3).toInt
      new User1(id, name, fv, age)
    })

    //排序
    val sorted: RDD[User1] = userRdd.sortBy(u => u)

    //收集数据
    val result: Array[User1] = sorted.collect()

    println(result.toBuffer)

    sc.stop()
  }
}

class User1 (val id:Long, val name:String, val fv:Int, val age:Int) extends Ordered[User1] with Serializable {
  override def compare(that: User1): Int = {
    //颜值相等的时候
    if (that.fv == this.fv) {
      this.age - that.age
    } else {
      -(this.fv - that.fv)
    }
  }

  override def toString: String = {
    s"User:($id, $name, $fv, $age)"
  }
}

2.User继承Sorted 没有序列化，不需要new

package cn.edu360.spark.day06

import org.apache.log4j.{Level, Logger}
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.rdd.RDD

/**
  * Created by zhangjingcun on 2018/9/27 17:32.
  */
object CustomSort2 {
  def main(args: Array[String]): Unit = {
    Logger.getLogger("org.apache.spark").setLevel(Level.OFF)

    val conf = new SparkConf().setAppName("IPLocation").setMaster("local[*]")
    val sc = new SparkContext(conf)

    //用spark对数据进行排序，首先按照颜值的从高到低进行排序，如果颜值相等，在根据年龄的升序排序
    val users: Array[String] = Array("1,tom,99,34", "2,marry,96,26", "3,mike,98,29", "4,jim,96,30")

    //并行化成RDD
    val userLines: RDD[String] = sc.makeRDD(users)

    //整理数据
    val userRdd: RDD[User2] = userLines.map(line => {
      val fileds = line.split(",")
      val id = fileds(0).toLong
      val name = fileds(1)
      val fv = fileds(2).toInt
      val age = fileds(3).toInt
      User2(id, name, fv, age)
    })

    //排序
    val sorted: RDD[User2] = userRdd.sortBy(u => u)

    //收集数据
    val result: Array[User2] = sorted.collect()

    println(result.toBuffer)

    sc.stop()
  }
}
//case 可以不使用new关键字
//不需要实现序列化
case class User2 (val id:Long, val name:String, val fv:Int, val age:Int) extends Ordered[User2] {
  override def compare(that: User2): Int = {
    //颜值相等的时候
    if (that.fv == this.fv) {
      this.age - that.age
    } else {
      -(this.fv - that.fv)
    }
  }

  override def toString: String = {
    s"User:($id, $name, $fv, $age)"
  }
}

package cn.edu360.spark.day06

import org.apache.log4j.{Level, Logger}
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.rdd.RDD

/**
  * Created by zhangjingcun on 2018/9/27 17:37.
  */
object CustomSort3 {
  def main(args: Array[String]): Unit = {
    Logger.getLogger("org.apache.spark").setLevel(Level.OFF)

    val conf = new SparkConf().setAppName("IPLocation").setMaster("local[*]")
    val sc = new SparkContext(conf)

    //用spark对数据进行排序，首先按照颜值的从高到低进行排序，如果颜值相等，在根据年龄的升序排序
    val users: Array[String] = Array("1,tom,99,34", "2,marry,96,26", "3,mike,98,29", "4,jim,96,30")

    //并行化成RDD
    val userLines: RDD[String] = sc.makeRDD(users)

    //整理数据
    val userRdd: RDD[(Long, String, Int, Int)] = userLines.map(line => {
      val fileds = line.split(",")
      val id = fileds(0).toLong
      val name = fileds(1)
      val fv = fileds(2).toInt
      val age = fileds(3).toInt
      (id, name, fv, age)
    })

    //排序
    val sorted: RDD[(Long, String, Int, Int)] = userRdd.sortBy(tp => User3(tp._1, tp._2, tp._3, tp._4))

    //收集数据
    val result: Array[(Long, String, Int, Int)] = sorted.collect()

    println(result.toBuffer)

    sc.stop()
  }
}
//case 可以不使用new关键字
//不需要实现序列化
case class User3 (val id:Long, val name:String, val fv:Int, val age:Int) extends Ordered[User3] {
  override def compare(that: User3): Int = {
    //颜值相等的时候
    if (that.fv == this.fv) {
      this.age - that.age
    } else {
      -(this.fv - that.fv)
    }
  }

  override def toString: String = {
    s"User:($id, $name, $fv, $age)"
  }
}

package cn.edu360.spark.day06

import org.apache.log4j.{Level, Logger}
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.rdd.RDD

/**
  * Created by zhangjingcun on 2018/9/27 17:41.
  */
object CustomSort4 {
  def main(args: Array[String]): Unit = {
    Logger.getLogger("org.apache.spark").setLevel(Level.OFF)

    val conf = new SparkConf().setAppName("IPLocation").setMaster("local[*]")
    val sc = new SparkContext(conf)

    //用spark对数据进行排序，首先按照颜值的从高到低进行排序，如果颜值相等，在根据年龄的升序排序
    val users: Array[String] = Array("1,tom,99,34", "2,marry,96,26", "3,mike,98,29", "4,jim,96,30")

    //并行化成RDD
    val userLines: RDD[String] = sc.makeRDD(users)

    //整理数据
    val tpRdd: RDD[(Long, String, Int, Int)] = userLines.map(line => {
      val fileds = line.split(",")
      val id = fileds(0).toLong
      val name = fileds(1)
      val fv = fileds(2).toInt
      val age = fileds(3).toInt
      (id, name, fv, age)
    })

    //利用元祖的比较特点：先比较第一个，如果不相等，按照第一个属性排序，在比较下个属性
    implicit val rules = Ordering[(Int, Int)].on[(Long, String, Int, Int)](t => (-t._3, t._4))

    val sorted = tpRdd.sortBy(t => t)
    //收集数据
    val result: Array[(Long, String, Int, Int)] = sorted.collect()

    println(result.toBuffer)

    sc.stop()
  }
}