import org.apache.log4j.{Level, Logger}
import org.apache.spark.sql.SparkSession
/**
 * Demo: use the row_number() window function to list the top-3 students per
 * subject within each (departmentId, classId) group.
 *
 * Fixes vs. original: `object` keyword was fused to the name
 * (`objectTestSqlGroupByOrder`), which is a compile error; SparkSession is now
 * stopped on exit.
 */
object TestSqlGroupByOrder {
  def main(args: Array[String]): Unit = {
    // Silence Spark's verbose INFO logging; keep warnings and errors.
    Logger.getLogger("org").setLevel(Level.WARN)

    // Since Spark 2.0, SparkSession unifies SQLContext + HiveContext.
    val sparkSession = SparkSession.builder()
      .appName("SparkSqlGroup")
      .master("local[6]")
      .getOrCreate()

    try {
      // Needed for Dataset -> DataFrame conversions (.toDF).
      import sparkSession.implicits._

      // Load the raw score file and shape it into a DataFrame.
      // NOTE(review): fields are read from indices 1..6, skipping item(0) —
      // this assumes every CSV line carries a leading extra field (e.g. a row
      // id or name) before studentId, i.e. at least 7 comma-separated values.
      // Otherwise item(6) throws ArrayIndexOutOfBoundsException — confirm
      // against the actual format of scores2.txt.
      val scoreInfo = sparkSession.read.textFile("/Users/wangpei/Desktop/scores2.txt")
        .map(_.split(","))
        .map(item => (item(1), item(2).toInt, item(3).toInt, item(4).toInt, item(5), item(6)))
        .toDF("studentId", "language", "math", "english", "classId", "departmentId")

      // Register the DataFrame as a temporary view so it can be queried via SQL.
      scoreInfo.createOrReplaceTempView("scoresTable")

      /*
       * Window function:
       *   row_number() OVER (PARTITION BY COL1 ORDER BY COL2) rank
       * Partitions rows by COL1 and numbers them 1..n within each partition
       * in COL2 order. Each query below is two layers:
       *   1) inner: SELECT *, row_number() OVER (PARTITION BY departmentId,classId
       *      ORDER BY <subject> DESC) rank FROM scoresTable — rank starts at 1
       *      within each (departmentId, classId) group, highest score first;
       *   2) outer: keep only rows with rank <= 3.
       */

      // Top 3 in Chinese (language)
      println("############# 语文前3 ##############")
      sparkSession.sql("SELECT departmentId,classId,language,studentId FROM (SELECT *, row_number() OVER (PARTITION BY departmentId,classId ORDER BY language DESC) rank FROM scoresTable ) tmp WHERE rank <= 3").show()

      // Top 3 in math
      println("############# 数学前3 ##############")
      sparkSession.sql("SELECT departmentId,classId,math,studentId FROM (SELECT *, row_number() OVER (PARTITION BY departmentId,classId ORDER BY math DESC) rank FROM scoresTable ) tmp WHERE rank <= 3").show()

      // Top 3 in English
      println("############# 外语前3 ##############")
      sparkSession.sql("SELECT departmentId,classId,english,studentId FROM (SELECT *, row_number() OVER (PARTITION BY departmentId,classId ORDER BY english DESC) rank FROM scoresTable ) tmp WHERE rank <= 3").show()
    } finally {
      // Release local Spark resources even if a query fails.
      sparkSession.stop()
    }
  }
}