// The input data (space-separated lines: classname name score) is:
import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{DataFrame, Row, SparkSession}
object TopN {

  /** Entry point: runs both implementations of "top 3 per class" —
    * first with the Spark Core (RDD) API, then with Spark SQL.
    */
  def main(args: Array[String]): Unit = {
    sparkcoreTopN()
    sparksqlTopN()
  }

  /**
    * Spark Core implementation of grouped top-N.
    *
    * Reads space-separated lines of the form `classname name score` from
    * `input/topn`, groups the records by class, and prints the three
    * highest-scoring records of each class.
    *
    * Note: the score is parsed as `Int` so the ordering is numeric. The
    * previous version compared score *strings* lexicographically, which
    * ranks "9" above "10" and disagrees with the SQL variant below.
    */
  def sparkcoreTopN(): Unit = {
    val spark: SparkSession = SparkSession.builder().appName("topn").master("local[*]").getOrCreate()
    val sc: SparkContext = spark.sparkContext
    sc.setLogLevel("WARN")

    val lines: RDD[String] = sc.textFile("input/topn")

    // Parse each line into (classname, (name, score)); score as Int for numeric sorting.
    val byClass: RDD[(String, (String, Int))] = lines.map { line =>
      val fields: Array[String] = line.split(" ")
      (fields(0), (fields(1), fields(2).toInt))
    }

    // Collect all records of each class onto one partition entry.
    // NOTE(review): groupByKey materializes every group in memory; fine for
    // this toy input, but aggregateByKey with a bounded top-3 buffer would
    // scale better.
    val grouped: RDD[(String, Iterable[(String, Int)])] = byClass.groupByKey()
    grouped.foreach(g => println(g)) // debug: show the raw groups

    // Keep the 3 highest scores per class, descending.
    val top3PerClass: RDD[(String, Array[(String, Int)])] = grouped.map { case (classname, records) =>
      (classname, records.toArray.sortBy(-_._2).take(3))
    }

    top3PerClass.foreach { case (classname, top3) =>
      println(classname + "班级前三")
      top3.foreach(println)
    }
  }

  /**
    * Spark SQL implementation of grouped top-3 using the ROW_NUMBER()
    * window function partitioned by class and ordered by score descending.
    *
    * Reads the same `input/topn` file, registers it as a temp view, and
    * shows every row whose per-class rank is at most 3.
    */
  def sparksqlTopN(): Unit = {
    val spark: SparkSession = SparkSession.builder().appName("topn").master("local[*]").getOrCreate()
    val sc: SparkContext = spark.sparkContext
    sc.setLogLevel("WARN")
    import spark.implicits._

    val rddtext: RDD[String] = sc.textFile("input/topn")
    // Third column is named "age" to match the original schema, though it
    // holds the score being ranked.
    val student: DataFrame = rddtext.map(x => x.split(" ")).map(data => (data(0), data(1), data(2).toInt)).toDF("classname", "name", "age")
    student.createOrReplaceTempView("student")
    spark.sql(" select * from (select *, Row_Number() OVER (partition by classname ORDER BY age desc) topn FROM student) t1 where topn<=3").show()
  }
}
// The result produced by the window-function (Spark SQL) approach.