数据源
# f://data.txt
# className, studentName, score
c1 a 85
c2 b 77
c3 c 88
c1 d 22
c1 e 66
c3 f 95
c3 g 54
c2 h 91
c2 i 66
c1 j 54
c1 k 65
c2 l 41
c4 m 65
目标
按班级分组,求出每个班级中成绩最高的前 N 名学生(下面的示例取 N = 3)
解决方法
Spark Core
// Number of top-scoring students to keep for each class.
val topN = 3

// Top-N students per class using the RDD API.
// Each input line is expected to hold three whitespace-separated fields:
// className, studentName, score. The result is an RDD of
// (className, Array[(studentName, score)]) with at most `topN` entries per
// class, ordered by score from highest to lowest.
val classStudentScore = spark.sparkContext.textFile("f://data.txt")
  .map(_.trim)
  // Skip blank lines and '#' header/comment lines so they do not break parsing.
  .filter(line => line.nonEmpty && !line.startsWith("#"))
  .map { line =>
    // Split on any run of whitespace so both tab- and space-separated files work.
    val fields = line.split("\\s+")
    (fields(0), (fields(1), fields(2).toInt))
  }
  .groupByKey()
  // Sort descending in a single pass instead of sorting ascending and reversing.
  .mapValues(_.toArray.sortBy(_._2)(Ordering[Int].reverse).take(topN))

// Bring the (small) per-class result to the driver before printing: calling
// println inside RDD.foreach runs on the executors, so on a cluster the output
// would not appear on the driver console and could interleave between classes.
classStudentScore.collect().foreach { case (className, nameScores) =>
  println(className)
  nameScores.foreach(println)
}
Spark SQL
// Read the file as a single-column DataFrame of text lines and parse every
// line into Row(className, studentName, score).
val classStudentScoreRDD = spark.read.text("f://data.txt").rdd
  .map(_.getString(0).trim)
  // Skip blank lines and '#' header/comment lines so they do not break parsing.
  .filter(line => line.nonEmpty && !line.startsWith("#"))
  .map { line =>
    // Split on any run of whitespace so both tab- and space-separated files work.
    val fields = line.split("\\s+")
    Row(fields(0), fields(1), fields(2).toInt)
  }

// Attach an explicit schema; the column order must match the Row built above.
val classStudentScoreDF = spark.createDataFrame(
  classStudentScoreRDD,
  StructType(Seq(
    StructField("className", StringType, nullable = true),
    StructField("studentName", StringType, nullable = true),
    StructField("score", IntegerType, nullable = true)
  ))
)
// Position of each student inside their class, numbered from the highest
// score downwards (row_number assigns 1, 2, 3, ... within every partition).
val rankInClass = row_number().over(
  partitionBy(col("className")).orderBy(col("score").desc)
)

// Add the per-class position as "rank", keep the first three rows of every
// class and print the result.
classStudentScoreDF
  .withColumn("rank", rankInClass)
  .filter(col("rank") <= 3)
  .show()