// The over() window function partitions rows by one column and orders them by another;
// unlike a plain aggregate, a window function consumes a group of rows but still emits one value per input row.
package com
import org.apache.spark.sql.{Dataset, SparkSession}
import org.apache.spark.{SparkConf, SparkContext}
/**
 * Demonstrates Spark SQL window (analytic) functions — rank(), dense_rank(),
 * row_number() — over a small in-memory dataset of student scores.
 *
 * Run locally; creates a temp view "t_table" and (optionally) executes the
 * commented-out ranking queries below.
 */
object demo101 {

  /** One student's record: name, class number, and score. */
  case class StudentScore(name: String, clazz: Int, score: Int)

  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setMaster("local[*]").setAppName(this.getClass.getName)
    val session = SparkSession.builder().config(conf).getOrCreate()
    // Reuse the session's SparkContext instead of constructing a second one
    // from the same conf (two contexts in one JVM is an error-prone pattern).
    val sc = session.sparkContext

    val tuples: Array[(String, Int, Int)] = Array(
      ("a", 1, 88),
      ("b", 1, 78),
      ("c", 1, 95),
      ("d", 2, 74),
      ("e", 2, 92),
      ("f", 3, 99),
      ("g", 3, 99),
      ("h", 3, 45),
      ("i", 3, 53),
      ("j", 3, 78)
    )

    import session.implicits._
    // Destructure each tuple by pattern match rather than _1/_2/_3 accessors.
    val ssDT: Dataset[StudentScore] =
      sc.makeRDD(tuples)
        .map { case (name, clazz, score) => StudentScore(name, clazz, score) }
        .toDS()

    // Register a temporary view so the data can be queried with Spark SQL.
    ssDT.createOrReplaceTempView("t_table")
    // session.sql("select * from t_table").show()
    // Gapped ranking: two tied 1st places, then 3rd
    // session.sql("select name,clazz,score,rank() over(partition by clazz order by score desc) rownum from t_table").show()
    // Dense ranking: two tied 1st places, then 2nd
    // session.sql("select name,clazz,score,dense_rank() over(partition by clazz order by score desc) rownum from t_table").show()
    // Sequential numbering: no ties, every row gets a distinct number
    // session.sql("select name,clazz,score,row_number() over(partition by clazz order by score desc) rownum from t_table").show()
    // Top three per class
    // session.sql("select * from (select name,clazz,score,rank() over(partition by clazz order by score desc) rownum from t_table) t1 where rownum <= 3").show()
    // session.sql("select * from (select name,clazz,score,dense_rank() over(partition by clazz order by score desc) rownum from t_table) t1 where rownum <= 3").show()
    // session.sql("select * from (select name,clazz,score,row_number() over(partition by clazz order by score desc) rownum from t_table) t1 where rownum <= 3").show()

    // Release the session (and its context) before the JVM exits.
    session.stop()
  }
}