Spark实现分组排序取topN
读取文件 studentScore.txt(每行格式:课程 学生姓名 分数):
Chinese zhangsan 90
Chinese lisi 80
Chinese wangwu 70
Math zhangsan 91
Math lisi 90
Math wangwu 95
English zhangsan 81
English lisi 82
English wangwu 83
代码实现:
rdd实现和dataframe实现:
import org.apache.log4j.{Level, Logger}
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.functions._
import org.apache.spark.sql.{Dataset, Row, SparkSession}
object GroupByTopN {

  /** One score record parsed from an input line: course, student name, score. */
  case class Stu(course: String, name: String, score: Int)

  /**
   * Reads "course name score" lines and prints the top-N scores per course,
   * demonstrating three equivalent implementations: raw RDD transformations,
   * a SQL-expression window function, and the typed Window API.
   *
   * Usage: GroupByTopN [inputPath]   (defaults to "studentScore.txt")
   */
  def main(args: Array[String]): Unit = {
    Logger.getLogger("org").setLevel(Level.OFF)

    // Previously hard-coded; the path is now overridable via args and the
    // group size lives in one place instead of being repeated three times.
    val inputPath = args.headOption.getOrElse("studentScore.txt")
    val topN = 2

    val spark: SparkSession = SparkSession.builder()
      .appName("GroupByTopN")
      .master("local")
      .getOrCreate()
    try {
      import spark.implicits._

      val ds: Dataset[String] = spark.read.textFile(inputPath)

      // ---------- RDD implementation ----------
      // Parse each line into a (course, (name, score)) pair.
      val rdd1: RDD[(String, (String, Int))] = ds.rdd.map { line =>
        val fields = line.split(" ")
        (fields(0), (fields(1), fields(2).toInt))
      }
      // NOTE(review): groupByKey pulls every record of a course into one
      // executor's memory. Fine for a demo; for large or skewed data prefer
      // aggregateByKey with a bounded top-N accumulator.
      val rdd2: RDD[(String, Iterable[(String, Int)])] = rdd1.groupByKey()
      val topItem_set = rdd2.flatMap { case (course, scores) =>
        // Sort descending by score and keep only the first topN entries.
        scores.toArray
          .sortBy(_._2)(Ordering[Int].reverse)
          .take(topN)
          .map(entry => (course, entry))
      }
      topItem_set.toDF().show()

      // ---------- DataFrame implementations ----------
      val df = ds.map { line =>
        val fields = line.split(" ")
        Stu(fields(0), fields(1), fields(2).toInt)
      }
      println(df.schema)

      // Variant 1: window function via a SQL expression.
      val res: Dataset[Row] = df
        .selectExpr("*", "row_number() over (partition by course order by score desc) as rank")
        .where(s"rank <= $topN")
        .selectExpr("course", "name", "score")
      res.show() // res is already a Dataset[Row]; the old toDF() was redundant

      // Variant 2: the typed Window API, equivalent to variant 1.
      import org.apache.spark.sql.expressions.Window
      val window = Window.partitionBy(col("course")).orderBy(col("score").desc)
      val dfTop2 = df.withColumn("rank", row_number().over(window)).where(s"rank <= $topN")
      dfTop2.show()
    } finally {
      // Release the local Spark context even if a stage fails.
      spark.stop()
    }
  }
}